//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
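
// For illustration (worked examples, not an exhaustive list): the helpers
// above map <3 x s16> to <4 x s16> and s48 to s64, since 3 rounds up to 4
// elements and 48 bits rounds up to 64 bits.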

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
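
// For example: <3 x s16> (48 bits) matches and will gain one element, while
// <3 x s32> does not (32-bit elements) and <4 x s16> does not (even element
// count, 64 bits total).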
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
119 };
120}
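
// For illustration: under this mutation a 128-bit <4 x s32> is split into two
// pieces and becomes <2 x s32>, and a 96-bit <3 x s32> likewise maps to
// <2 x s32>.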
121
// Increase the number of vector elements until the total size reaches the
// next multiple of 32 bits.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
137 };
138}
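
// For example: <3 x s16> (48 bits) is padded to <4 x s16> (64 bits), and
// <5 x s8> (40 bits) is padded to <8 x s8> (64 bits), the next multiple of 32.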
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarSizeInBits: EltSize));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(SizeInBits: 128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(SizeInBits: Size);
183 }
184
185 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
186}
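
// For sizes above 32 bits this produces a vector of 32-bit elements, e.g.
// <6 x s16> (96 bits) -> <3 x s32> and <8 x s16> (128 bits) -> <4 x s32>.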
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Size: Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
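
// For example: s32, s96, <2 x s16> and <4 x s32> are register types, while
// <3 x s16> (48 bits, not a multiple of 32) and <4 x s8> (8-bit elements)
// are not.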
252
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Ty: Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(SizeInBits: 1);
283static const LLT S8 = LLT::scalar(SizeInBits: 8);
284static const LLT S16 = LLT::scalar(SizeInBits: 16);
285static const LLT S32 = LLT::scalar(SizeInBits: 32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(SizeInBits: 64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(SizeInBits: 96);
290static const LLT S128 = LLT::scalar(SizeInBits: 128);
291static const LLT S160 = LLT::scalar(SizeInBits: 160);
292static const LLT S224 = LLT::scalar(SizeInBits: 224);
293static const LLT S256 = LLT::scalar(SizeInBits: 256);
294static const LLT S512 = LLT::scalar(SizeInBits: 512);
295static const LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
296
297static const LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
298static const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
299static const LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
300static const LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
301static const LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
302static const LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
303static const LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
304static const LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
305
306static const LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
310static const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
311static const LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
312static const LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
313static const LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
314static const LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
315static const LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
316static const LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
317static const LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
318static const LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
319static const LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
320static const LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
321static const LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
322
323static const LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
324static const LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
325static const LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
326static const LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
327static const LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
328static const LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
329static const LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
330static const LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
331
332static const LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
333static const LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346 V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
350 if (Ty.isPointerOrPointerVector())
351 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
352
353 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
354 is_contained(Set: AllScalarTypes, Element: Ty) || is_contained(Set: AllS16Vectors, Element: Ty);
355}
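
// Pointers are classified by their bit pattern, e.g. a 64-bit flat pointer is
// treated as s64 and <2 x p3> (two 32-bit local pointers) as <2 x s32>, both
// of which appear in the lists above.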
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Ty: Query.Types[TypeIdx]);
360 };
361}
362
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366 return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
372
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379 case AMDGPUAS::PRIVATE_ADDRESS:
380 // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS:
383 return ST.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS:
385 case AMDGPUAS::CONSTANT_ADDRESS:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387 case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
402
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 IsAtomic: Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
464 Alignment: Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
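
// For example: an extending s8 or s16 load into an s32 register is size-legal
// here, but a vector extending load (say <2 x s16> loaded into <2 x s32>) is
// rejected above, as is a 96-bit access on subtargets without dwordx3
// load/store support.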
470
// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478 return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(Ty: ElemTy);
482 }
483 return false;
484}
485
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491 if (EnableNewLegality)
492 return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty))
499 return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
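
// For example: <8 x s16> (128 bits) hits the workaround and will be bitcast
// (to <4 x s32> by the load/store rules below), while <4 x s32> itself does
// not.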
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526 return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531 !isRegisterVectorElementType(EltTy: Ty.getElementType());
532}
533
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(Value: SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565 return TLI->allowsMisalignedMemoryAccessesImpl(
566 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
567 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
568 Fast;
569}
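
// For example: a 96-bit global load that is 128-bit aligned may be widened to
// 128 bits on subtargets without dwordx3 load/store support, provided the
// target reports the wider access as fast; a 64-bit load is never widened
// since its size is already a power of two.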
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
577 AlignInBits: Query.MMODescrs[0].AlignInBits,
578 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
579}
580
/// Mutates the IR (typically a load instruction) to use a <4 x s32> as the
/// initial type of the operand `idx` and then to transform it to a `p8` via
/// bitcasts and inttoptr. In addition, handle vectors of p8. Returns the new
/// type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585 MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(i: Idx);
587
588 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
592 return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(SizeInBits: 32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
607 B.buildMergeValues(Res: MO, Ops: VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
612 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
614 B.buildIntToPtr(Dst: MO, Src: Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626 MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Reg: Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
638 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
639 }
640 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
641 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
642}
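
// Roughly, for the non-vector case this emits (sketch):
//   %p0:_(s32), %p1:_(s32), %p2:_(s32), %p3:_(s32) = G_UNMERGE_VALUES %rsrc(p8)
//   %vec:_(<4 x s32>) = G_BUILD_VECTOR %p0, %p1, %p2, %p3
// and for vectors of p8 it goes through a ptrtoint/bitcast of the scalar type.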
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645 unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(i: Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
651 return;
652 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656 const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707 getActionDefinitionsBuilder(Opcode: G_PHI)
708 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709 .legalFor(Types: AllS32Vectors)
710 .legalFor(Types: AllS64Vectors)
711 .legalFor(Types: AddrSpaces64)
712 .legalFor(Types: AddrSpaces32)
713 .legalFor(Types: AddrSpaces128)
714 .legalIf(Predicate: isPointer(TypeIdx: 0))
715 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
716 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
717 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
718 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
719 .scalarize(TypeIdx: 0);
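
  // For example, a G_PHI of <3 x s16> is first widened to <4 x s16> by the
  // isSmallOddVector rule above, and odd scalar widths such as s48 are rounded
  // up to the next power of two at least 32 bits wide.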
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
725 .legalFor(Types: {S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
727 .scalarize(TypeIdx: 0)
728 .minScalar(TypeIdx: 0, Ty: S16)
729 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
730 .maxScalar(TypeIdx: 0, Ty: S32);
731 } else {
732 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
733 .legalFor(Types: {S32, S16, V2S16})
734 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
735 .scalarize(TypeIdx: 0)
736 .minScalar(TypeIdx: 0, Ty: S16)
737 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
738 .maxScalar(TypeIdx: 0, Ty: S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
742 getActionDefinitionsBuilder(Opcode: G_MUL)
743 .legalFor(Types: {S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
745 .scalarize(TypeIdx: 0)
746 .minScalar(TypeIdx: 0, Ty: S16)
747 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
748 .custom();
749 } else {
750 getActionDefinitionsBuilder(Opcode: G_MUL)
751 .legalFor(Types: {S32, S16, V2S16})
752 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
753 .scalarize(TypeIdx: 0)
754 .minScalar(TypeIdx: 0, Ty: S16)
755 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(TypeIdx: 0, Ty: S16)
763 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
764 .scalarize(TypeIdx: 0)
765 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
769 .legalFor(Types: {S32, S16})
770 .minScalar(TypeIdx: 0, Ty: S16)
771 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
772 .maxScalar(TypeIdx: 0, Ty: S32)
773 .scalarize(TypeIdx: 0);
774
775 getActionDefinitionsBuilder(Opcode: G_MUL)
776 .legalFor(Types: {S32, S16})
777 .scalarize(TypeIdx: 0)
778 .minScalar(TypeIdx: 0, Ty: S16)
779 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
786 .legalFor(Types: {S32, S16}) // Clamp modifier
787 .minScalar(TypeIdx: 0, Ty: S16)
788 .scalarize(TypeIdx: 0)
789 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
795 .minScalar(TypeIdx: 0, Ty: S16)
796 .scalarize(TypeIdx: 0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
800 .legalFor(Types: {S32})
801 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
802 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
803 .scalarize(TypeIdx: 0);
804
805 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
806 .legalFor(Types: {S32})
807 .scalarize(TypeIdx: 0)
808 .minScalar(TypeIdx: 0, Ty: S32)
809 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(TypeIdx: 0, Ty: S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
818 .legalFor(Types: {S32}) // Clamp modifier.
819 .scalarize(TypeIdx: 0)
820 .minScalarOrElt(TypeIdx: 0, Ty: S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
825 .minScalar(TypeIdx: 0, Ty: S32)
826 .scalarize(TypeIdx: 0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
833 .minScalar(TypeIdx: 0, Ty: S32)
834 .scalarize(TypeIdx: 0)
835 .lower();
836 }
837
838 getActionDefinitionsBuilder(
839 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor(Types: {S32, S64})
841 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
842 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
843 .scalarize(TypeIdx: 0);
844
845 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
846 .legalFor(Types: {S32})
847 .maxScalar(TypeIdx: 0, Ty: S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
852 .lowerFor(Types: {V2S8});
853 }
854
855 Mulh
856 .scalarize(TypeIdx: 0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
862 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
864 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
865 .fewerElementsIf(Predicate: vectorWiderThan(TypeIdx: 0, Size: 64), Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
866 .widenScalarToNextPow2(TypeIdx: 0)
867 .scalarize(TypeIdx: 0);
868
869 getActionDefinitionsBuilder(
870 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor(Types: {{S32, S1}, {S32, S32}})
872 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
873 .scalarize(TypeIdx: 0);
874
875 getActionDefinitionsBuilder(Opcode: G_BITCAST)
876 // Don't worry about the size constraint.
877 .legalIf(Predicate: all(P0: isRegisterClassType(TypeIdx: 0), P1: isRegisterClassType(TypeIdx: 1)))
878 .lower();
879
880 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
881 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(Predicate: isPointer(TypeIdx: 0))
884 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
885 .widenScalarToNextPow2(TypeIdx: 0);
886
887 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
888 .legalFor(Types: {S32, S64, S16})
889 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
890
891 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(Predicate: isRegisterType(TypeIdx: 0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor(Types: {S1, S16})
896 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
897 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
898 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
899 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
900
901 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
906 .legalFor(Types: {{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
909 .customFor(Types: {PrivatePtr});
910 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
911 .legalFor(Types: {PrivatePtr});
912
913 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
914
915 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
916 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
917
918 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor(Types: {S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
925 .customFor(Types: {S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
927 .customFor(Types: {S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor(Types: {S16, V2S16});
932 else
933 FPOpActions.legalFor(Types: {S16});
934
935 TrigActions.customFor(Types: {S16});
936 FDIVActions.customFor(Types: {S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor(Types: {V2S32});
941 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder(Opcodes: {
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(Types: FPTypesPK16)
949 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
950 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
951 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
952 .scalarize(TypeIdx: 0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(Types: FPTypes16)
955 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
956 .scalarize(TypeIdx: 0);
957 } else {
958 MinNumMaxNum.customFor(Types: FPTypesBase)
959 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
960 .scalarize(TypeIdx: 0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
965
966 FPOpActions
967 .scalarize(TypeIdx: 0)
968 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
969
970 TrigActions
971 .scalarize(TypeIdx: 0)
972 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
973
974 FDIVActions
975 .scalarize(TypeIdx: 0)
976 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
977
978 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
979 .legalFor(Types: FPTypesPK16)
980 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
981 .scalarize(TypeIdx: 0)
982 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
983
984 if (ST.has16BitInsts()) {
985 getActionDefinitionsBuilder(Opcode: G_FSQRT)
986 .legalFor(Types: {S16})
987 .customFor(Types: {S32, S64})
988 .scalarize(TypeIdx: 0)
989 .unsupported();
990 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
991 .legalFor(Types: {S32, S64, S16})
992 .scalarize(TypeIdx: 0)
993 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
994
995 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(TypeIdx: 0)
998 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
999 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1000 .lower();
1001
1002 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1003 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(TypeIdx: 0)
1005 .lower();
1006 } else {
1007 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1008 .customFor(Types: {S32, S64, S16})
1009 .scalarize(TypeIdx: 0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1014 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1015 .customFor(Types: {S64})
1016 .legalFor(Types: {S32, S64})
1017 .scalarize(TypeIdx: 0)
1018 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1019 } else {
1020 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1021 .legalFor(Types: {S32, S64})
1022 .scalarize(TypeIdx: 0)
1023 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1024 }
1025
1026 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor(Types: {{S32, S32}, {S64, S32}})
1028 .scalarize(TypeIdx: 0)
1029 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1030 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1031 .lower();
1032
1033 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1034 .customFor(Types: {{S32, S32}, {S64, S32}})
1035 .scalarize(TypeIdx: 0)
1036 .minScalar(TypeIdx: 0, Ty: S32)
1037 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1038 .lower();
1039 }
1040
1041 getActionDefinitionsBuilder(Opcode: G_FPTRUNC)
1042 .legalFor(Types: {{S32, S64}, {S16, S32}})
1043 .scalarize(TypeIdx: 0)
1044 .lower();
1045
1046 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1047 .legalFor(Types: {{S64, S32}, {S32, S16}})
1048 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1049 .scalarize(TypeIdx: 0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor(Types: {S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor(Types: {S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor(Types: {S32})
1062 // Must use fadd + fneg
1063 .lowerFor(Types: {S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(TypeIdx: 0)
1068 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor(Types: {S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor(Types: {S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor(Types: {S16});
1078 FMad.scalarize(TypeIdx: 0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor(Types: {S16, S32, S64});
1084 } else {
1085 FRem.minScalar(TypeIdx: 0, Ty: S32)
1086 .customFor(Types: {S32, S64});
1087 }
1088 FRem.scalarize(TypeIdx: 0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1091 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1092 .legalIf(Predicate: isScalar(TypeIdx: 0))
1093 .legalFor(Types: {{V2S16, V2S32}})
1094 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1098 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(TypeIdx: 0)
1105 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1106 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1110 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1112 .customFor(Types: {{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor(Types: {{S16, S16}});
1115 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1116 .minScalar(TypeIdx: 0, Ty: S32)
1117 .scalarize(TypeIdx: 0)
1118 .widenScalarToNextPow2(TypeIdx: 1);
1119
1120 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1121 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor(Types: {{S64, S32}, {S64, S64}})
1123 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor(Types: {{S16, S16}});
1126 else
1127 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1128
1129 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1130 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1131 .scalarize(TypeIdx: 0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1135 .customFor(Types: {S16, S32})
1136 .scalarize(TypeIdx: 0)
1137 .lower();
1138
1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141 .scalarize(TypeIdx: 0)
1142 .lower();
1143
1144 if (ST.has16BitInsts()) {
1145 getActionDefinitionsBuilder(
1146 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147 .legalFor(Types: {S16, S32, S64})
1148 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1149 .scalarize(TypeIdx: 0);
1150 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1151 getActionDefinitionsBuilder(
1152 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1153 .legalFor(Types: {S32, S64})
1154 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1155 .scalarize(TypeIdx: 0);
1156 } else {
1157 getActionDefinitionsBuilder(
1158 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159 .legalFor(Types: {S32})
1160 .customFor(Types: {S64})
1161 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1162 .scalarize(TypeIdx: 0);
1163 }
1164
1165 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1166 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1168 .scalarize(TypeIdx: 0)
1169 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1170
1171 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1172 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1173 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1174 .scalarize(TypeIdx: 0);
1175
1176 auto &CmpBuilder =
1177 getActionDefinitionsBuilder(Opcode: G_ICMP)
1178 // The compare output type differs based on the register bank of the output,
1179 // so make both s1 and s32 legal.
1180 //
1181 // Scalar compares producing output in scc will be promoted to s32, as that
1182 // is the allocatable register type that will be needed for the copy from
1183 // scc. This will be promoted during RegBankSelect, and we assume something
1184 // before that won't try to use s32 result types.
1185 //
1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187 // bank.
1188 .legalForCartesianProduct(
1189 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190 .legalForCartesianProduct(
1191 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1192 if (ST.has16BitInsts()) {
1193 CmpBuilder.legalFor(Types: {{S1, S16}});
1194 }
1195
1196 CmpBuilder
1197 .widenScalarToNextPow2(TypeIdx: 1)
1198 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1199 .scalarize(TypeIdx: 0)
1200 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1201
1202 auto &FCmpBuilder =
1203 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1204 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1205
1206 if (ST.hasSALUFloatInsts())
1207 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1208
1209 FCmpBuilder
1210 .widenScalarToNextPow2(TypeIdx: 1)
1211 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1212 .scalarize(TypeIdx: 0);
1213
1214 // FIXME: fpow has a selection pattern that should move to custom lowering.
1215 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1216 if (ST.has16BitInsts())
1217 ExpOps.customFor(Types: {{S32}, {S16}});
1218 else
1219 ExpOps.customFor(Types: {S32});
1220 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1221 .scalarize(TypeIdx: 0);
1222
1223 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1224 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1225 .lower();
1226
1227 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1228 Log2Ops.customFor(Types: {S32});
1229 if (ST.has16BitInsts())
1230 Log2Ops.legalFor(Types: {S16});
1231 else
1232 Log2Ops.customFor(Types: {S16});
1233 Log2Ops.scalarize(TypeIdx: 0)
1234 .lower();
1235
1236 auto &LogOps =
1237 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1238 LogOps.customFor(Types: {S32, S16});
1239 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1240 .scalarize(TypeIdx: 0);
1241
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1243 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1244 .legalFor(Types: {{S32, S32}, {S32, S64}})
1245 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1246 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1247 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1248 .scalarize(TypeIdx: 0)
1249 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1250
  // If no 16-bit instruction is available, lower into different instructions.
1252 if (ST.has16BitInsts())
1253 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1254 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1255 .widenScalarToNextPow2(TypeIdx: 1)
1256 .scalarize(TypeIdx: 0)
1257 .lower();
1258 else
1259 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1260 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1261 .lowerFor(Types: {S1, S16})
1262 .widenScalarToNextPow2(TypeIdx: 1)
1263 .scalarize(TypeIdx: 0)
1264 .lower();
1265
1266 // The hardware instructions return a different result on 0 than the generic
1267 // instructions expect. The hardware produces -1, but these produce the
1268 // bitwidth.
1269 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1270 .scalarize(TypeIdx: 0)
1271 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1272 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1273 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1274 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1275 .custom();
1276
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1278 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1279 .legalFor(Types: {{S32, S32}, {S32, S64}})
1280 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1281 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1282 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1283 .scalarize(TypeIdx: 0)
1284 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1285 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1286
1287 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1288 .legalFor(Types: {{S32, S32}, {S32, S64}})
1289 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1290 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1291 .scalarize(TypeIdx: 0)
1292 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1293 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1294
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296 // RegBankSelect.
1297 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1298 .legalFor(Types: {S32, S64})
1299 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1300 .scalarize(TypeIdx: 0)
1301 .widenScalarToNextPow2(TypeIdx: 0);
1302
1303 if (ST.has16BitInsts()) {
1304 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1305 .legalFor(Types: {S16, S32, V2S16})
1306 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1309 .widenScalarToNextPow2(TypeIdx: 0)
1310 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1311 .scalarize(TypeIdx: 0);
1312
1313 if (ST.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1315 .legalFor(Types: {S32, S16, V2S16})
1316 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1317 .minScalar(TypeIdx: 0, Ty: S16)
1318 .widenScalarToNextPow2(TypeIdx: 0)
1319 .scalarize(TypeIdx: 0)
1320 .lower();
1321 } else {
1322 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1323 .legalFor(Types: {S32, S16})
1324 .widenScalarToNextPow2(TypeIdx: 0)
1325 .minScalar(TypeIdx: 0, Ty: S16)
1326 .scalarize(TypeIdx: 0)
1327 .lower();
1328 }
1329 } else {
1330 // TODO: Should have same legality without v_perm_b32
1331 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1332 .legalFor(Types: {S32})
1333 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1336 .widenScalarToNextPow2(TypeIdx: 0)
1337 .maxScalar(TypeIdx: 0, Ty: S32)
1338 .scalarize(TypeIdx: 0)
1339 .lower();
1340
1341 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1342 .legalFor(Types: {S32})
1343 .minScalar(TypeIdx: 0, Ty: S32)
1344 .widenScalarToNextPow2(TypeIdx: 0)
1345 .scalarize(TypeIdx: 0)
1346 .lower();
1347 }
1348
1349 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1350 // List the common cases
1351 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1352 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1353 .scalarize(TypeIdx: 0)
1354 // Accept any address space as long as the size matches
1355 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1356 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1357 Mutation: [](const LegalityQuery &Query) {
1358 return std::pair(
1359 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1360 })
1361 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1362 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1363 });
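
  // For example, G_INTTOPTR of an s32 into a 64-bit flat pointer first widens
  // the integer operand to s64, while an s128 source is narrowed to s64; the
  // pointer-sized case is legal directly.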
1364
1365 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1366 // List the common cases
1367 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1368 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1369 .scalarize(TypeIdx: 0)
1370 // Accept any address space as long as the size matches
1371 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1372 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1373 Mutation: [](const LegalityQuery &Query) {
1374 return std::pair(
1375 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1376 })
1377 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1378 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1379 });
1380
1381 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1382 .scalarize(TypeIdx: 0)
1383 .custom();
1384
1385 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1386 bool IsLoad) -> bool {
1387 const LLT DstTy = Query.Types[0];
1388
1389 // Split vector extloads.
1390 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1391
1392 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1393 return true;
1394
1395 const LLT PtrTy = Query.Types[1];
1396 unsigned AS = PtrTy.getAddressSpace();
1397 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1398 IsAtomic: Query.MMODescrs[0].Ordering !=
1399 AtomicOrdering::NotAtomic))
1400 return true;
1401
1402 // Catch weird sized loads that don't evenly divide into the access sizes
1403 // TODO: May be able to widen depending on alignment etc.
1404 unsigned NumRegs = (MemSize + 31) / 32;
1405 if (NumRegs == 3) {
1406 if (!ST.hasDwordx3LoadStores())
1407 return true;
1408 } else {
1409 // If the alignment allows, these should have been widened.
1410 if (!isPowerOf2_32(Value: NumRegs))
1411 return true;
1412 }
1413
1414 return false;
1415 };
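
  // For example, a 96-bit access must be split on subtargets without dwordx3
  // load/store support, and a 160-bit access (five dwords, not a power of two)
  // is split here as well.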
1416
1417 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1420
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1422 // LDS
1423 // TODO: Unsupported flat for SI.
1424
1425 for (unsigned Op : {G_LOAD, G_STORE}) {
1426 const bool IsStore = Op == G_STORE;
1427
1428 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1429 // Explicitly list some common cases.
1430 // TODO: Does this help compile time at all?
1431 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1432 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1433 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1434 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1435 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1436 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1437 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1438 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1439
1440 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1441 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1442 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1443 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1444 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1445 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1446
1447 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1448 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1449 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1450 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1451
1452 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1453 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1454 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1455 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1456 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1457 Actions.legalIf(
1458 Predicate: [=](const LegalityQuery &Query) -> bool {
1459 return isLoadStoreLegal(ST, Query);
1460 });
1461
1462 // The custom pointers (fat pointers, buffer resources) don't work with load
1463 // and store at this level. Fat pointers should have been lowered to
1464 // intrinsics before the translation to MIR.
1465 Actions.unsupportedIf(
1466 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1467
1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469 // ptrtoint. This is needed to account for the fact that we can't have i128
1470 // as a register class for SelectionDAG reasons.
1471 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1472 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1473 });
1474
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1477 //
1478 // TODO: Should generalize bitcast action into coerce, which will also cover
1479 // inserting addrspacecasts.
1480 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1481
1482 // Turn any illegal element vectors into something easier to deal
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1484 // parts anyway.
1485 //
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1488 Actions.bitcastIf(
1489 Predicate: [=](const LegalityQuery &Query) -> bool {
1490 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1491 MemTy: Query.MMODescrs[0].MemoryTy);
1492 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1493
1494 if (!IsStore) {
1495 // Widen suitably aligned loads by loading extra bytes. The standard
1496 // legalization actions can't properly express widening memory operands.
1497 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1498 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1499 });
1500 }
1501
1502 // FIXME: load/store narrowing should be moved to lower action
1503 Actions
1504 .narrowScalarIf(
1505 Predicate: [=](const LegalityQuery &Query) -> bool {
1506 return !Query.Types[0].isVector() &&
1507 needToSplitMemOp(Query, Op == G_LOAD);
1508 },
1509 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1510 const LLT DstTy = Query.Types[0];
1511 const LLT PtrTy = Query.Types[1];
1512
1513 const unsigned DstSize = DstTy.getSizeInBits();
1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1515
1516 // Split extloads.
1517 if (DstSize > MemSize)
1518 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1519
1520 unsigned MaxSize = maxSizeForAddrSpace(
1521 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1522 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1523 if (MemSize > MaxSize)
1524 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1525
1526 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527 return std::pair(0, LLT::scalar(SizeInBits: Align));
1528 })
1529 .fewerElementsIf(
1530 Predicate: [=](const LegalityQuery &Query) -> bool {
1531 return Query.Types[0].isVector() &&
1532 needToSplitMemOp(Query, Op == G_LOAD);
1533 },
1534 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1535 const LLT DstTy = Query.Types[0];
1536 const LLT PtrTy = Query.Types[1];
1537
1538 LLT EltTy = DstTy.getElementType();
1539 unsigned MaxSize = maxSizeForAddrSpace(
1540 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1541 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1542
1543              // FIXME: Better handle results that were widened to a power of 2.
1544              // This ends up scalarizing.
1545              // FIXME: 3-element stores are scalarized on SI.
1546
1547 // Split if it's too large for the address space.
1548 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549 if (MemSize > MaxSize) {
1550 unsigned NumElts = DstTy.getNumElements();
1551 unsigned EltSize = EltTy.getSizeInBits();
1552
1553 if (MaxSize % EltSize == 0) {
1554 return std::pair(
1555 0, LLT::scalarOrVector(
1556 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1557 }
1558
1559 unsigned NumPieces = MemSize / MaxSize;
1560
1561 // FIXME: Refine when odd breakdowns handled
1562 // The scalars will need to be re-legalized.
1563 if (NumPieces == 1 || NumPieces >= NumElts ||
1564 NumElts % NumPieces != 0)
1565 return std::pair(0, EltTy);
1566
1567 return std::pair(0,
1568 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1569 }
1570
1571 // FIXME: We could probably handle weird extending loads better.
1572 if (DstTy.getSizeInBits() > MemSize)
1573 return std::pair(0, EltTy);
1574
1575 unsigned EltSize = EltTy.getSizeInBits();
1576 unsigned DstSize = DstTy.getSizeInBits();
1577 if (!isPowerOf2_32(Value: DstSize)) {
1578 // We're probably decomposing an odd sized store. Try to split
1579 // to the widest type. TODO: Account for alignment. As-is it
1580 // should be OK, since the new parts will be further legalized.
1581 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1582 return std::pair(
1583 0, LLT::scalarOrVector(
1584 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1585 }
1586
1587 // May need relegalization for the scalars.
1588 return std::pair(0, EltTy);
1589 })
1590 .minScalar(TypeIdx: 0, Ty: S32)
1591 .narrowScalarIf(Predicate: isWideScalarExtLoadTruncStore(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: S32))
1592 .widenScalarToNextPow2(TypeIdx: 0)
1593 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1594 .lower();
1595 }
1596
1597 // FIXME: Unaligned accesses not lowered.
1598 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1599 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1600 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1601 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1602 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1603 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1604 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1605 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1606 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1607 .legalIf(
1608 Predicate: [=](const LegalityQuery &Query) -> bool {
1609 return isLoadStoreLegal(ST, Query);
1610 });
1611
1612 if (ST.hasFlatAddressSpace()) {
1613 ExtLoads.legalForTypesWithMemDesc(
1614 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1615 }
1616
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1619 //
1620 // TODO: Should generalize bitcast action into coerce, which will also cover
1621 // inserting addrspacecasts.
1622 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1623
1624 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1625 .widenScalarToNextPow2(TypeIdx: 0)
1626 .lower();
1627
1628 auto &Atomics = getActionDefinitionsBuilder(
1629 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1630 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1631 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1632 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1633 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1634 {S64, GlobalPtr}, {S64, LocalPtr},
1635 {S32, RegionPtr}, {S64, RegionPtr}});
1636 if (ST.hasFlatAddressSpace()) {
1637 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1638 }
1639
1640 // TODO: v2bf16 operations, and fat buffer pointer support.
1641 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1642 if (ST.hasLDSFPAtomicAddF32()) {
1643 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1644 if (ST.hasLdsAtomicAddF64())
1645 Atomic.legalFor(Types: {{S64, LocalPtr}});
1646 if (ST.hasAtomicDsPkAdd16Insts())
1647 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1648 }
1649 if (ST.hasAtomicFaddInsts())
1650 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1651 if (ST.hasFlatAtomicFaddF32Inst())
1652 Atomic.legalFor(Types: {{S32, FlatPtr}});
1653
1654 if (ST.hasGFX90AInsts()) {
1655 // These are legal with some caveats, and should have undergone expansion in
1656 // the IR in most situations
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor(Types: {
1659 {S32, GlobalPtr},
1660 {S64, GlobalPtr},
1661 {S64, FlatPtr}
1662 });
1663 }
1664
1665 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666 ST.hasAtomicBufferGlobalPkAddF16Insts())
1667 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668 if (ST.hasAtomicGlobalPkAddBF16Inst())
1669 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1670 if (ST.hasAtomicFlatPkAdd16Insts())
1671 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1672
1673
1674 // Most of the legalization work here is done by AtomicExpand. We could
1675 // probably use a simpler legality rule that just assumes anything is OK.
1676 auto &AtomicFMinFMax =
1677 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1678 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1679
1680 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1681 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1682 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1683 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1684 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1685 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1686 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1687 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1688
1689 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1690 // demarshalling
1691 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1692 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1693 {S32, FlatPtr}, {S64, FlatPtr}})
1694 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1695 {S32, RegionPtr}, {S64, RegionPtr}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1697
1698 // Condition should be s32 for scalar, s1 for vector.
1699 getActionDefinitionsBuilder(Opcode: G_SELECT)
1700 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1701 LocalPtr, FlatPtr, PrivatePtr,
1702 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1703 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1704 Types1: {S1, S32})
1705 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1706 .scalarize(TypeIdx: 1)
1707 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1708 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1709 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1710 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1711 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1712 .scalarize(TypeIdx: 0)
1713 .widenScalarToNextPow2(TypeIdx: 0)
1714 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1715
1716 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1717 // be more flexible with the shift amount type.
1718 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1719 .legalFor(Types: {{S32, S32}, {S64, S32}});
1720 if (ST.has16BitInsts()) {
1721 if (ST.hasVOP3PInsts()) {
1722 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1723 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1724 } else
1725 Shifts.legalFor(Types: {{S16, S16}});
1726
1727 // TODO: Support 16-bit shift amounts for all types
1728 Shifts.widenScalarIf(
1729 Predicate: [=](const LegalityQuery &Query) {
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731 // 32-bit amount.
1732 const LLT ValTy = Query.Types[0];
1733 const LLT AmountTy = Query.Types[1];
1734 return ValTy.getSizeInBits() <= 16 &&
1735 AmountTy.getSizeInBits() < 16;
1736 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1737 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1738 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1739 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1740 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1741
1742 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1743 .minScalar(TypeIdx: 0, Ty: S16)
1744 .scalarize(TypeIdx: 0)
1745 .lower();
1746 } else {
1747 // Make sure we legalize the shift amount type first, as the general
1748 // expansion for the shifted type will produce much worse code if it hasn't
1749 // been truncated already.
1750 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1751 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1752 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1753
1754 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1755 .minScalar(TypeIdx: 0, Ty: S32)
1756 .scalarize(TypeIdx: 0)
1757 .lower();
1758 }
1759 Shifts.scalarize(TypeIdx: 0);
1760
1761 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1762 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1763 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1764 unsigned IdxTypeIdx = 2;
1765
1766 getActionDefinitionsBuilder(Opcode: Op)
1767 .customIf(Predicate: [=](const LegalityQuery &Query) {
1768 const LLT EltTy = Query.Types[EltTypeIdx];
1769 const LLT VecTy = Query.Types[VecTypeIdx];
1770 const LLT IdxTy = Query.Types[IdxTypeIdx];
1771 const unsigned EltSize = EltTy.getSizeInBits();
1772 const bool isLegalVecType =
1773 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1775 // below will try to bitcast them to 2N x s64, which will fail.
1776          // Therefore, as an intermediate step, ptrtoint the vector and scalar
1777          // arguments (and inttoptr the extraction result) in order to produce
1778          // a vector operation on integers that can be handled by the logic
1779          // below.
1780 if (EltTy.isPointer() && EltSize > 64)
1781 return true;
1782 return (EltSize == 32 || EltSize == 64) &&
1783 VecTy.getSizeInBits() % 32 == 0 &&
1784 VecTy.getSizeInBits() <= MaxRegisterSize &&
1785 IdxTy.getSizeInBits() == 32 &&
1786 isLegalVecType;
1787 })
1788 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1789 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1790 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1791 .bitcastIf(
1792 Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1793 Mutation: [=](const LegalityQuery &Query) {
1794 // For > 64-bit element types, try to turn this into a 64-bit
1795 // element vector since we may be able to do better indexing
1796 // if this is scalar. If not, fall back to 32.
1797 const LLT EltTy = Query.Types[EltTypeIdx];
1798 const LLT VecTy = Query.Types[VecTypeIdx];
1799 const unsigned DstEltSize = EltTy.getSizeInBits();
1800 const unsigned VecSize = VecTy.getSizeInBits();
1801
1802 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1803 return std::pair(
1804 VecTypeIdx,
1805 LLT::fixed_vector(NumElements: VecSize / TargetEltSize, ScalarSizeInBits: TargetEltSize));
1806 })
1807 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1808 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1809 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1810 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1811 // TODO: Clamp elements for 64-bit vectors?
1812 .moreElementsIf(
1813 Predicate: isIllegalRegisterType(TypeIdx: VecTypeIdx),
1814 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1815 // It should only be necessary with variable indexes.
1816 // As a last resort, lower to the stack
1817 .lower();
1818 }
1819
1820 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1821 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1822 const LLT &EltTy = Query.Types[1].getElementType();
1823 return Query.Types[0] != EltTy;
1824 });
1825
1826 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1827 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1828 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1829
1830 // FIXME: Doesn't handle extract of illegal sizes.
1831 getActionDefinitionsBuilder(Opcode: Op)
1832 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1833 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1834          // Sub-vector (or single element) insert and extract.
1835 // TODO: verify immediate offset here since lower only works with
1836 // whole elements.
1837 const LLT BigTy = Query.Types[BigTyIdx];
1838 return BigTy.isVector();
1839 })
1840 // FIXME: Multiples of 16 should not be legal.
1841 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1842 const LLT BigTy = Query.Types[BigTyIdx];
1843 const LLT LitTy = Query.Types[LitTyIdx];
1844 return (BigTy.getSizeInBits() % 32 == 0) &&
1845 (LitTy.getSizeInBits() % 16 == 0);
1846 })
1847 .widenScalarIf(
1848 Predicate: [=](const LegalityQuery &Query) {
1849 const LLT BigTy = Query.Types[BigTyIdx];
1850 return (BigTy.getScalarSizeInBits() < 16);
1851 },
1852 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1853 .widenScalarIf(
1854 Predicate: [=](const LegalityQuery &Query) {
1855 const LLT LitTy = Query.Types[LitTyIdx];
1856 return (LitTy.getScalarSizeInBits() < 16);
1857 },
1858 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1859 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1860 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1861
1862 }
1863
1864 auto &BuildVector = getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1865 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1866 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1867 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1868 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1869 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1870 .moreElementsIf(
1871 Predicate: isIllegalRegisterType(TypeIdx: 0),
1872 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1873
1874 if (ST.hasScalarPackInsts()) {
1875 BuildVector
1876 // FIXME: Should probably widen s1 vectors straight to s32
1877 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1878 .minScalar(TypeIdx: 1, Ty: S16);
1879
1880 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1881 .legalFor(Types: {V2S16, S32})
1882 .lower();
1883 } else {
1884 BuildVector.customFor(Types: {V2S16, S16});
1885 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1886
1887 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1888 .customFor(Types: {V2S16, S32})
1889 .lower();
1890 }
1891
1892 BuildVector.legalIf(Predicate: isRegisterType(TypeIdx: 0));
1893
1894 // FIXME: Clamp maximum size
1895 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1896 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1897 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
1898 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
1899 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
1900
1901 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
1902
1903 // Merge/Unmerge
1904 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1905 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1906 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1907
1908 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1909 const LLT Ty = Query.Types[TypeIdx];
1910 if (Ty.isVector()) {
1911 const LLT &EltTy = Ty.getElementType();
1912 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1913 return true;
1914 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
1915 return true;
1916 }
1917 return false;
1918 };
1919
1920 auto &Builder = getActionDefinitionsBuilder(Opcode: Op)
1921 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1922 .lowerFor(Types: {{S16, V2S16}})
1923 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1924 const LLT BigTy = Query.Types[BigTyIdx];
1925 return BigTy.getSizeInBits() == 32;
1926 })
1927 // Try to widen to s16 first for small types.
1928 // TODO: Only do this on targets with legal s16 shifts
1929 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
1930 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
1931 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1932 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
1933 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
1934 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
1935        // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1936 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1937 // valid.
1938 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
1939 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
1940 // Break up vectors with weird elements into scalars
1941 .fewerElementsIf(
1942 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1943 Mutation: scalarize(TypeIdx: 0))
1944 .fewerElementsIf(
1945 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1946 Mutation: scalarize(TypeIdx: 1))
1947 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
1948
1949 if (Op == G_MERGE_VALUES) {
1950 Builder.widenScalarIf(
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1952 Predicate: [=](const LegalityQuery &Query) {
1953 const LLT Ty = Query.Types[LitTyIdx];
1954 return Ty.getSizeInBits() < 32;
1955 },
1956 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
1957 }
1958
1959 Builder.widenScalarIf(
1960 Predicate: [=](const LegalityQuery &Query) {
1961 const LLT Ty = Query.Types[BigTyIdx];
1962 return Ty.getSizeInBits() % 16 != 0;
1963 },
1964 Mutation: [=](const LegalityQuery &Query) {
1965                  // Pick the next power of 2, or a multiple of 64 over 128,
1966                  // whichever is smaller.
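                      // e.g. an 88-bit type widens to the next power of 2 (128),
                      // while a 264-bit type widens to 320 (the next multiple of
                      // 64) rather than 512.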
1967 const LLT &Ty = Query.Types[BigTyIdx];
1968 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
1969 if (NewSizeInBits >= 256) {
1970 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
1971 if (RoundedTo < NewSizeInBits)
1972 NewSizeInBits = RoundedTo;
1973 }
1974 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
1975 })
1976 // Any vectors left are the wrong size. Scalarize them.
1977 .scalarize(TypeIdx: 0)
1978 .scalarize(TypeIdx: 1);
1979 }
1980
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1982 // RegBankSelect.
1983 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
1984 .legalFor(Types: {{S32}, {S64}});
1985
1986 if (ST.hasVOP3PInsts()) {
1987 SextInReg.lowerFor(Types: {{V2S16}})
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1989 // get more vector shift opportunities, since we'll get those when
1990 // expanded.
1991 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1992 } else if (ST.has16BitInsts()) {
1993 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
1994 } else {
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
1996 // shifts. This avoid a lot of intermediate truncate and extend operations.
1997 SextInReg.lowerFor(Types: {{S32}, {S64}});
1998 }
1999
2000 SextInReg
2001 .scalarize(TypeIdx: 0)
2002 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2003 .lower();
2004
2005 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2006 .scalarize(TypeIdx: 0)
2007 .lower();
2008
2009  // TODO: Only try to form v2s16 with legal packed instructions.
2010 getActionDefinitionsBuilder(Opcode: G_FSHR)
2011 .legalFor(Types: {{S32, S32}})
2012 .lowerFor(Types: {{V2S16, V2S16}})
2013 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2014 .scalarize(TypeIdx: 0)
2015 .lower();
2016
2017 if (ST.hasVOP3PInsts()) {
2018 getActionDefinitionsBuilder(Opcode: G_FSHL)
2019 .lowerFor(Types: {{V2S16, V2S16}})
2020 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2021 .scalarize(TypeIdx: 0)
2022 .lower();
2023 } else {
2024 getActionDefinitionsBuilder(Opcode: G_FSHL)
2025 .scalarize(TypeIdx: 0)
2026 .lower();
2027 }
2028
2029 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2030 .legalFor(Types: {S64});
2031
2032 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2033
2034 getActionDefinitionsBuilder(Opcode: G_FENCE)
2035 .alwaysLegal();
2036
2037 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2038 .scalarize(TypeIdx: 0)
2039 .minScalar(TypeIdx: 0, Ty: S32)
2040 .lower();
2041
2042 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2043 .legalFor(Types: {{S32, S32}, {S64, S32}})
2044 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2045 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2046 .widenScalarToNextPow2(TypeIdx: 0)
2047 .scalarize(TypeIdx: 0);
2048
2049 getActionDefinitionsBuilder(
2050 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2051 G_FCOPYSIGN,
2052
2053 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2054 G_READ_REGISTER, G_WRITE_REGISTER,
2055
2056 G_SADDO, G_SSUBO})
2057 .lower();
2058
2059 if (ST.hasIEEEMinMax()) {
2060 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2061 .legalFor(Types: FPTypesPK16)
2062 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2063 .scalarize(TypeIdx: 0);
2064 } else {
2065 // TODO: Implement
2066 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM}).lower();
2067 }
2068
2069 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2070 .lower();
2071
2072 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2073
2074 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2075 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2076 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2077 .unsupported();
2078
2079 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2080
2081 getLegacyLegalizerInfo().computeTables();
2082 verify(MII: *ST.getInstrInfo());
2083}
2084
2085bool AMDGPULegalizerInfo::legalizeCustom(
2086 LegalizerHelper &Helper, MachineInstr &MI,
2087 LostDebugLocObserver &LocObserver) const {
2088 MachineIRBuilder &B = Helper.MIRBuilder;
2089 MachineRegisterInfo &MRI = *B.getMRI();
2090
2091 switch (MI.getOpcode()) {
2092 case TargetOpcode::G_ADDRSPACE_CAST:
2093 return legalizeAddrSpaceCast(MI, MRI, B);
2094 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2095 return legalizeFroundeven(MI, MRI, B);
2096 case TargetOpcode::G_FCEIL:
2097 return legalizeFceil(MI, MRI, B);
2098 case TargetOpcode::G_FREM:
2099 return legalizeFrem(MI, MRI, B);
2100 case TargetOpcode::G_INTRINSIC_TRUNC:
2101 return legalizeIntrinsicTrunc(MI, MRI, B);
2102 case TargetOpcode::G_SITOFP:
2103 return legalizeITOFP(MI, MRI, B, Signed: true);
2104 case TargetOpcode::G_UITOFP:
2105 return legalizeITOFP(MI, MRI, B, Signed: false);
2106 case TargetOpcode::G_FPTOSI:
2107 return legalizeFPTOI(MI, MRI, B, Signed: true);
2108 case TargetOpcode::G_FPTOUI:
2109 return legalizeFPTOI(MI, MRI, B, Signed: false);
2110 case TargetOpcode::G_FMINNUM:
2111 case TargetOpcode::G_FMAXNUM:
2112 case TargetOpcode::G_FMINNUM_IEEE:
2113 case TargetOpcode::G_FMAXNUM_IEEE:
2114 return legalizeMinNumMaxNum(Helper, MI);
2115 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2116 return legalizeExtractVectorElt(MI, MRI, B);
2117 case TargetOpcode::G_INSERT_VECTOR_ELT:
2118 return legalizeInsertVectorElt(MI, MRI, B);
2119 case TargetOpcode::G_FSIN:
2120 case TargetOpcode::G_FCOS:
2121 return legalizeSinCos(MI, MRI, B);
2122 case TargetOpcode::G_GLOBAL_VALUE:
2123 return legalizeGlobalValue(MI, MRI, B);
2124 case TargetOpcode::G_LOAD:
2125 case TargetOpcode::G_SEXTLOAD:
2126 case TargetOpcode::G_ZEXTLOAD:
2127 return legalizeLoad(Helper, MI);
2128 case TargetOpcode::G_STORE:
2129 return legalizeStore(Helper, MI);
2130 case TargetOpcode::G_FMAD:
2131 return legalizeFMad(MI, MRI, B);
2132 case TargetOpcode::G_FDIV:
2133 return legalizeFDIV(MI, MRI, B);
2134 case TargetOpcode::G_FFREXP:
2135 return legalizeFFREXP(MI, MRI, B);
2136 case TargetOpcode::G_FSQRT:
2137 return legalizeFSQRT(MI, MRI, B);
2138 case TargetOpcode::G_UDIV:
2139 case TargetOpcode::G_UREM:
2140 case TargetOpcode::G_UDIVREM:
2141 return legalizeUnsignedDIV_REM(MI, MRI, B);
2142 case TargetOpcode::G_SDIV:
2143 case TargetOpcode::G_SREM:
2144 case TargetOpcode::G_SDIVREM:
2145 return legalizeSignedDIV_REM(MI, MRI, B);
2146 case TargetOpcode::G_ATOMIC_CMPXCHG:
2147 return legalizeAtomicCmpXChg(MI, MRI, B);
2148 case TargetOpcode::G_FLOG2:
2149 return legalizeFlog2(MI, B);
2150 case TargetOpcode::G_FLOG:
2151 case TargetOpcode::G_FLOG10:
2152 return legalizeFlogCommon(MI, B);
2153 case TargetOpcode::G_FEXP2:
2154 return legalizeFExp2(MI, B);
2155 case TargetOpcode::G_FEXP:
2156 case TargetOpcode::G_FEXP10:
2157 return legalizeFExp(MI, B);
2158 case TargetOpcode::G_FPOW:
2159 return legalizeFPow(MI, B);
2160 case TargetOpcode::G_FFLOOR:
2161 return legalizeFFloor(MI, MRI, B);
2162 case TargetOpcode::G_BUILD_VECTOR:
2163 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2164 return legalizeBuildVector(MI, MRI, B);
2165 case TargetOpcode::G_MUL:
2166 return legalizeMul(Helper, MI);
2167 case TargetOpcode::G_CTLZ:
2168 case TargetOpcode::G_CTTZ:
2169 return legalizeCTLZ_CTTZ(MI, MRI, B);
2170 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2171 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2172 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2173 return legalizeFPTruncRound(MI, B);
2174 case TargetOpcode::G_STACKSAVE:
2175 return legalizeStackSave(MI, B);
2176 case TargetOpcode::G_GET_FPENV:
2177 return legalizeGetFPEnv(MI, MRI, B);
2178 case TargetOpcode::G_SET_FPENV:
2179 return legalizeSetFPEnv(MI, MRI, B);
2180 case TargetOpcode::G_TRAP:
2181 return legalizeTrap(MI, MRI, B);
2182 case TargetOpcode::G_DEBUGTRAP:
2183 return legalizeDebugTrap(MI, MRI, B);
2184 default:
2185 return false;
2186 }
2187
2188 llvm_unreachable("expected switch to return");
2189}
2190
2191Register AMDGPULegalizerInfo::getSegmentAperture(
2192 unsigned AS,
2193 MachineRegisterInfo &MRI,
2194 MachineIRBuilder &B) const {
2195 MachineFunction &MF = B.getMF();
2196 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2197 const LLT S32 = LLT::scalar(SizeInBits: 32);
2198 const LLT S64 = LLT::scalar(SizeInBits: 64);
2199
2200 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2201
2202 if (ST.hasApertureRegs()) {
2203 // Note: this register is somewhat broken. When used as a 32-bit operand,
2204 // it only returns zeroes. The real value is in the upper 32 bits.
2205    // Thus, we must extract the high 32 bits.
2206 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2207 ? AMDGPU::SRC_SHARED_BASE
2208 : AMDGPU::SRC_PRIVATE_BASE;
2209 // FIXME: It would be more natural to emit a COPY here, but then copy
2210 // coalescing would kick in and it would think it's okay to use the "HI"
2211 // subregister (instead of extracting the HI 32 bits) which is an artificial
2212 // (unusable) register.
2213 // Register TableGen definitions would need an overhaul to get rid of the
2214 // artificial "HI" aperture registers and prevent this kind of issue from
2215 // happening.
2216 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2217 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2218 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {Dst}, SrcOps: {Register(ApertureRegNo)});
2219 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2220 }
2221
2222 // TODO: can we be smarter about machine pointer info?
2223 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2224 Register LoadAddr = MRI.createGenericVirtualRegister(
2225 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2226 // For code object version 5, private_base and shared_base are passed through
2227 // implicit kernargs.
2228 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2229 AMDGPU::AMDHSA_COV5) {
2230 AMDGPUTargetLowering::ImplicitParameter Param =
2231 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2232 : AMDGPUTargetLowering::PRIVATE_BASE;
2233 uint64_t Offset =
2234 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2235
2236 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2237 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2238
2239 if (!loadInputValue(DstReg: KernargPtrReg, B,
2240 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2241 return Register();
2242
2243 MachineMemOperand *MMO = MF.getMachineMemOperand(
2244 PtrInfo,
2245 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2246 MachineMemOperand::MOInvariant,
2247 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2248
2249 // Pointer address
2250 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
2251 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2252 // Load address
2253 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2254 }
2255
2256 Register QueuePtr = MRI.createGenericVirtualRegister(
2257 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2258
2259 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2260 return Register();
2261
2262 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2263 // private_segment_aperture_base_hi.
2264 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2265
2266 MachineMemOperand *MMO = MF.getMachineMemOperand(
2267 PtrInfo,
2268 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2269 MachineMemOperand::MOInvariant,
2270 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2271
2272 B.buildPtrAdd(Res: LoadAddr, Op0: QueuePtr,
2273 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2274 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2275}
2276
2277/// Return true if the value is a known valid address, such that a null check is
2278/// not necessary.
2279static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2280 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2281 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2282 switch (Def->getOpcode()) {
2283 case AMDGPU::G_FRAME_INDEX:
2284 case AMDGPU::G_GLOBAL_VALUE:
2285 case AMDGPU::G_BLOCK_ADDR:
2286 return true;
2287 case AMDGPU::G_CONSTANT: {
2288 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2290 }
2291 default:
2292 return false;
2293 }
2294
2295 return false;
2296}
2297
2298bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2299 MachineInstr &MI, MachineRegisterInfo &MRI,
2300 MachineIRBuilder &B) const {
2301 MachineFunction &MF = B.getMF();
2302
2303 // MI can either be a G_ADDRSPACE_CAST or a
2304 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2305 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2306 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2307 Intrinsic::amdgcn_addrspacecast_nonnull));
2308
2309 const LLT S32 = LLT::scalar(SizeInBits: 32);
2310 Register Dst = MI.getOperand(i: 0).getReg();
2311 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2312 : MI.getOperand(i: 1).getReg();
2313 LLT DstTy = MRI.getType(Reg: Dst);
2314 LLT SrcTy = MRI.getType(Reg: Src);
2315 unsigned DestAS = DstTy.getAddressSpace();
2316 unsigned SrcAS = SrcTy.getAddressSpace();
2317
2318 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2319 // vector element.
2320 assert(!DstTy.isVector());
2321
2322 const AMDGPUTargetMachine &TM
2323 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2324
2325 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2326 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2327 return true;
2328 }
2329
2330 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2331 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2332 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2333    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2334 // G_ADDRSPACE_CAST we need to guess.
2335 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2336 // Extract low 32-bits of the pointer.
2337 B.buildExtract(Res: Dst, Src, Index: 0);
2338 MI.eraseFromParent();
2339 return true;
2340 }
2341
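      // Otherwise guard the truncation with a null check, roughly:
      //   dst = (src != flat_null) ? extract_lo32(src) : segment_null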
2342 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2343
2344 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2345 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2346
2347 // Extract low 32-bits of the pointer.
2348 auto PtrLo32 = B.buildExtract(Res: DstTy, Src, Index: 0);
2349
2350 auto CmpRes =
2351 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2352 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2353
2354 MI.eraseFromParent();
2355 return true;
2356 }
2357
2358 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2359 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2360 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2361 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2362 if (!ApertureReg.isValid())
2363 return false;
2364
2365 // Coerce the type of the low half of the result so we can use merge_values.
2366 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2367
2368 // TODO: Should we allow mismatched types but matching sizes in merges to
2369 // avoid the ptrtoint?
2370 auto BuildPtr = B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, ApertureReg});
2371
2372    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2373 // G_ADDRSPACE_CAST we need to guess.
2374 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2375 B.buildCopy(Res: Dst, Op: BuildPtr);
2376 MI.eraseFromParent();
2377 return true;
2378 }
2379
2380 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2381 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2382
2383 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2384 Op1: SegmentNull.getReg(Idx: 0));
2385
2386 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2387
2388 MI.eraseFromParent();
2389 return true;
2390 }
2391
2392 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2393 SrcTy.getSizeInBits() == 64) {
2394 // Truncate.
2395 B.buildExtract(Res: Dst, Src, Index: 0);
2396 MI.eraseFromParent();
2397 return true;
2398 }
2399
2400 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2401 DstTy.getSizeInBits() == 64) {
2402 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2404 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2405 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2406 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2407 MI.eraseFromParent();
2408 return true;
2409 }
2410
2411 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2412 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2413
2414 LLVMContext &Ctx = MF.getFunction().getContext();
2415 Ctx.diagnose(DI: InvalidAddrSpaceCast);
2416 B.buildUndef(Res: Dst);
2417 MI.eraseFromParent();
2418 return true;
2419}
2420
2421bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2422 MachineRegisterInfo &MRI,
2423 MachineIRBuilder &B) const {
2424 Register Src = MI.getOperand(i: 1).getReg();
2425 LLT Ty = MRI.getType(Reg: Src);
2426 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2427
2428 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2429 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2430
2431 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2432 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2433
2434 // TODO: Should this propagate fast-math-flags?
2435 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2436 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2437
2438 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2439 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2440
2441 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2442 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2443 MI.eraseFromParent();
2444 return true;
2445}
2446
2447bool AMDGPULegalizerInfo::legalizeFceil(
2448 MachineInstr &MI, MachineRegisterInfo &MRI,
2449 MachineIRBuilder &B) const {
2450
2451 const LLT S1 = LLT::scalar(SizeInBits: 1);
2452 const LLT S64 = LLT::scalar(SizeInBits: 64);
2453
2454 Register Src = MI.getOperand(i: 1).getReg();
2455 assert(MRI.getType(Src) == S64);
2456
2457 // result = trunc(src)
2458 // if (src > 0.0 && src != result)
2459 // result += 1.0
2460
2461 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2462
2463 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2464 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2465 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2466 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2467 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2468 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2469
2470 // TODO: Should this propagate fast-math-flags?
2471 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2472 MI.eraseFromParent();
2473 return true;
2474}
2475
2476bool AMDGPULegalizerInfo::legalizeFrem(
2477 MachineInstr &MI, MachineRegisterInfo &MRI,
2478 MachineIRBuilder &B) const {
2479 Register DstReg = MI.getOperand(i: 0).getReg();
2480 Register Src0Reg = MI.getOperand(i: 1).getReg();
2481 Register Src1Reg = MI.getOperand(i: 2).getReg();
2482 auto Flags = MI.getFlags();
2483 LLT Ty = MRI.getType(Reg: DstReg);
2484
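      // Expand as fmod(x, y) == x - trunc(x / y) * y, i.e. fma(-trunc(x / y), y, x).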
2485 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2486 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2487 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2488 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2489 MI.eraseFromParent();
2490 return true;
2491}
2492
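    // Extracts the unbiased exponent of an f64 from its high 32 bits: the 11-bit
    // biased exponent is pulled out with amdgcn.ubfe and the bias (1023) is
    // subtracted.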
2493static MachineInstrBuilder extractF64Exponent(Register Hi,
2494 MachineIRBuilder &B) {
2495 const unsigned FractBits = 52;
2496 const unsigned ExpBits = 11;
2497 LLT S32 = LLT::scalar(SizeInBits: 32);
2498
2499 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2500 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2501
2502 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2503 .addUse(RegNo: Hi)
2504 .addUse(RegNo: Const0.getReg(Idx: 0))
2505 .addUse(RegNo: Const1.getReg(Idx: 0));
2506
2507 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2508}
2509
2510bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2511 MachineInstr &MI, MachineRegisterInfo &MRI,
2512 MachineIRBuilder &B) const {
2513 const LLT S1 = LLT::scalar(SizeInBits: 1);
2514 const LLT S32 = LLT::scalar(SizeInBits: 32);
2515 const LLT S64 = LLT::scalar(SizeInBits: 64);
2516
2517 Register Src = MI.getOperand(i: 1).getReg();
2518 assert(MRI.getType(Src) == S64);
2519
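      // Roughly: clear the fractional bits of the mantissa based on the unbiased
      // exponent. Exponents below 0 produce a signed zero; exponents above 51 mean
      // there are no fractional bits, so the source is returned unchanged.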
2520 // TODO: Should this use extract since the low half is unused?
2521 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2522 Register Hi = Unmerge.getReg(Idx: 1);
2523
2524 // Extract the upper half, since this is where we will find the sign and
2525 // exponent.
2526 auto Exp = extractF64Exponent(Hi, B);
2527
2528 const unsigned FractBits = 52;
2529
2530 // Extract the sign bit.
2531 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2532 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2533
2534 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2535
2536 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2537
2538 // Extend back to 64-bits.
2539 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2540
2541 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2542 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2543 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2544 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2545
2546 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2547 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2548
2549 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2550 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2551 MI.eraseFromParent();
2552 return true;
2553}
2554
2555bool AMDGPULegalizerInfo::legalizeITOFP(
2556 MachineInstr &MI, MachineRegisterInfo &MRI,
2557 MachineIRBuilder &B, bool Signed) const {
2558
2559 Register Dst = MI.getOperand(i: 0).getReg();
2560 Register Src = MI.getOperand(i: 1).getReg();
2561
2562 const LLT S64 = LLT::scalar(SizeInBits: 64);
2563 const LLT S32 = LLT::scalar(SizeInBits: 32);
2564
2565 assert(MRI.getType(Src) == S64);
2566
2567 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2568 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2569
2570 if (MRI.getType(Reg: Dst) == S64) {
2571 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2572 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2573
2574 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2575 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2576
2577 // TODO: Should this propagate fast-math-flags?
2578 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2579 MI.eraseFromParent();
2580 return true;
2581 }
2582
2583 assert(MRI.getType(Dst) == S32);
2584
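      // Roughly: shift the 64-bit input left to drop leading zero (or redundant
      // sign) bits, fold any nonzero discarded low bits into a sticky bit, convert
      // the remaining high 32 bits, then rescale with ldexp by (32 - shift amount).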
2585 auto One = B.buildConstant(Res: S32, Val: 1);
2586
2587 MachineInstrBuilder ShAmt;
2588 if (Signed) {
2589 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2590 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2591 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2592 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2593 auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
2594 .addUse(RegNo: Unmerge.getReg(Idx: 1));
2595 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2596 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2597 } else
2598 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2599 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2600 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2601 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2602 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2603 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2604 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2605 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2606 MI.eraseFromParent();
2607 return true;
2608}
2609
2610// TODO: Copied from DAG implementation. Verify logic and document how this
2611// actually works.
2612bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2613 MachineRegisterInfo &MRI,
2614 MachineIRBuilder &B,
2615 bool Signed) const {
2616
2617 Register Dst = MI.getOperand(i: 0).getReg();
2618 Register Src = MI.getOperand(i: 1).getReg();
2619
2620 const LLT S64 = LLT::scalar(SizeInBits: 64);
2621 const LLT S32 = LLT::scalar(SizeInBits: 32);
2622
2623 const LLT SrcLT = MRI.getType(Reg: Src);
2624 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2625
2626 unsigned Flags = MI.getFlags();
2627
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2629 // integers is illustrated as follows:
2630 //
2631 // tf := trunc(val);
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2634 // hi := fptoi(hif);
2635 // lo := fptoi(lof);
2636 //
2637 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2638 MachineInstrBuilder Sign;
2639 if (Signed && SrcLT == S32) {
2640    // However, a 32-bit floating point number has only a 23-bit mantissa, which
2641    // is not enough to hold all the significant bits of `lof` if val is
2642    // negative. To avoid the loss of precision, we need to take the absolute
2643 // value after truncating and flip the result back based on the original
2644 // signedness.
2645 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2646 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2647 }
2648 MachineInstrBuilder K0, K1;
2649 if (SrcLT == S64) {
2650 K0 = B.buildFConstant(
2651 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2652 K1 = B.buildFConstant(
2653 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2654 } else {
2655 K0 = B.buildFConstant(
2656 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2657 K1 = B.buildFConstant(
2658 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2659 }
2660
2661 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2662 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2663 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2664
2665 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2666 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2667 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2668
2669 if (Signed && SrcLT == S32) {
2670 // Flip the result based on the signedness, which is either all 0s or 1s.
2671 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2672 // r := xor({lo, hi}, sign) - sign;
2673 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2674 Src1: Sign);
2675 } else
2676 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2677 MI.eraseFromParent();
2678
2679 return true;
2680}
2681
2682bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2683 MachineInstr &MI) const {
2684 MachineFunction &MF = Helper.MIRBuilder.getMF();
2685 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2686
2687 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2688 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2689
2690  // With ieee_mode disabled, the instructions already have the correct
2691  // behavior for G_FMINNUM/G_FMAXNUM.
2692 if (!MFI->getMode().IEEE)
2693 return !IsIEEEOp;
2694
2695 if (IsIEEEOp)
2696 return true;
2697
2698 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2699}
2700
2701bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2702 MachineInstr &MI, MachineRegisterInfo &MRI,
2703 MachineIRBuilder &B) const {
2704 // TODO: Should move some of this into LegalizerHelper.
2705
2706 // TODO: Promote dynamic indexing of s16 to s32
2707
2708 Register Dst = MI.getOperand(i: 0).getReg();
2709 Register Vec = MI.getOperand(i: 1).getReg();
2710
2711 LLT VecTy = MRI.getType(Reg: Vec);
2712 LLT EltTy = VecTy.getElementType();
2713 assert(EltTy == MRI.getType(Dst));
2714
2715 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2716  // but we can't go directly to that logic because you can't bitcast a vector
2717 // of pointers to a vector of integers. Therefore, introduce an intermediate
2718 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2719 // drive the legalization forward.
2720 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2721 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2722 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2723
2724 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2725 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2726 B.buildIntToPtr(Dst, Src: IntElt);
2727
2728 MI.eraseFromParent();
2729 return true;
2730 }
2731
2732 // FIXME: Artifact combiner probably should have replaced the truncated
2733 // constant before this, so we shouldn't need
2734 // getIConstantVRegValWithLookThrough.
2735 std::optional<ValueAndVReg> MaybeIdxVal =
2736 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2737 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2738 return true;
2739 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2740
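      // For an in-range constant index, expand to an unmerge and copy out the
      // selected element; an out-of-range constant index produces undef.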
2741 if (IdxVal < VecTy.getNumElements()) {
2742 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2743 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2744 } else {
2745 B.buildUndef(Res: Dst);
2746 }
2747
2748 MI.eraseFromParent();
2749 return true;
2750}
2751
2752bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2753 MachineInstr &MI, MachineRegisterInfo &MRI,
2754 MachineIRBuilder &B) const {
2755 // TODO: Should move some of this into LegalizerHelper.
2756
2757 // TODO: Promote dynamic indexing of s16 to s32
2758
2759 Register Dst = MI.getOperand(i: 0).getReg();
2760 Register Vec = MI.getOperand(i: 1).getReg();
2761 Register Ins = MI.getOperand(i: 2).getReg();
2762
2763 LLT VecTy = MRI.getType(Reg: Vec);
2764 LLT EltTy = VecTy.getElementType();
2765 assert(EltTy == MRI.getType(Ins));
2766
2767 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2768  // but we can't go directly to that logic because you can't bitcast a vector
2769 // of pointers to a vector of integers. Therefore, make the pointer vector
2770 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2771 // new value, and then inttoptr the result vector back. This will then allow
2772 // the rest of legalization to take over.
2773 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2774 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2775 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2776
2777 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2778 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2779 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2780 Idx: MI.getOperand(i: 3));
2781 B.buildIntToPtr(Dst, Src: IntVecDest);
2782 MI.eraseFromParent();
2783 return true;
2784 }
2785
2786 // FIXME: Artifact combiner probably should have replaced the truncated
2787 // constant before this, so we shouldn't need
2788 // getIConstantVRegValWithLookThrough.
2789 std::optional<ValueAndVReg> MaybeIdxVal =
2790 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2791 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2792 return true;
2793
2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2795
2796 unsigned NumElts = VecTy.getNumElements();
2797 if (IdxVal < NumElts) {
2798 SmallVector<Register, 8> SrcRegs;
2799 for (unsigned i = 0; i < NumElts; ++i)
2800 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2801 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2802
2803 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2804 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2805 } else {
2806 B.buildUndef(Res: Dst);
2807 }
2808
2809 MI.eraseFromParent();
2810 return true;
2811}
2812
2813bool AMDGPULegalizerInfo::legalizeSinCos(
2814 MachineInstr &MI, MachineRegisterInfo &MRI,
2815 MachineIRBuilder &B) const {
2816
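// The hardware sin/cos instructions expect an input already scaled by
// 1/(2*pi). A sketch of the expansion built here:
//   sin(x) ~> amdgcn.sin(fract(x * 1/(2*pi)))   on subtargets with a reduced
//                                               trig input range,
//   sin(x) ~> amdgcn.sin(x * 1/(2*pi))          otherwise (likewise for cos).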
2817 Register DstReg = MI.getOperand(i: 0).getReg();
2818 Register SrcReg = MI.getOperand(i: 1).getReg();
2819 LLT Ty = MRI.getType(Reg: DstReg);
2820 unsigned Flags = MI.getFlags();
2821
2822 Register TrigVal;
2823 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
2824 if (ST.hasTrigReducedRange()) {
2825 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
2826 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
2827 .addUse(RegNo: MulVal.getReg(Idx: 0))
2828 .setMIFlags(Flags)
2829 .getReg(Idx: 0);
2830 } else
2831 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
2832
2833 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2834 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2835 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
2836 .addUse(RegNo: TrigVal)
2837 .setMIFlags(Flags);
2838 MI.eraseFromParent();
2839 return true;
2840}
2841
2842bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2843 MachineIRBuilder &B,
2844 const GlobalValue *GV,
2845 int64_t Offset,
2846 unsigned GAFlags) const {
2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2849 // to the following code sequence:
2850 //
2851 // For constant address space:
2852 // s_getpc_b64 s[0:1]
2853 // s_add_u32 s0, s0, $symbol
2854 // s_addc_u32 s1, s1, 0
2855 //
2856 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2857 // a fixup or relocation is emitted to replace $symbol with a literal
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2859 // operand to the global variable.
2860 //
2861 // For global address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2864 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2865 //
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // fixups or relocations are emitted to replace $symbol@*@lo and
2868 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2870 // operand to the global variable.
2871
2872 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2873
2874 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2875 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
2876
2877 MachineInstrBuilder MIB = B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET)
2878 .addDef(RegNo: PCReg);
2879
2880 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
2881 if (GAFlags == SIInstrInfo::MO_NONE)
2882 MIB.addImm(Val: 0);
2883 else
2884 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
2885
2886 if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
2887 B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);
2888
2889 if (PtrTy.getSizeInBits() == 32)
2890 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
2891 return true;
2892}
2893
2894// Emit an ABS32_LO / ABS32_HI relocation stub.
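// A sketch of the sequence built below for a 64-bit pointer (the exact
// relocation spelling is determined by the MC layer):
//   s_mov_b32 s[lo], gv@abs32@lo
//   s_mov_b32 s[hi], gv@abs32@hi
// with the two halves then merged into the 64-bit result.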
2895void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2896 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2897 MachineRegisterInfo &MRI) const {
2898 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2899
2900 LLT S32 = LLT::scalar(SizeInBits: 32);
2901
2902 // Use the destination directly if and only if we only store the lower
2903 // address part and no register class has been set.
2904 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
2905 ? DstReg
2906 : MRI.createGenericVirtualRegister(Ty: S32);
2907
2908 if (!MRI.getRegClassOrNull(Reg: AddrLo))
2909 MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);
2910
2911 // Write the lower half.
2912 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2913 .addDef(RegNo: AddrLo)
2914 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
2915
2916 // If required, write the upper half as well.
2917 if (RequiresHighHalf) {
2918 assert(PtrTy.getSizeInBits() == 64 &&
2919 "Must provide a 64-bit pointer type!");
2920
2921 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
2922 MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);
2923
2924 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2925 .addDef(RegNo: AddrHi)
2926 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);
2927
2928 // Use the destination directly if and only if no register class has been
2929 // set.
2930 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
2931 ? DstReg
2932 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2933
2934 if (!MRI.getRegClassOrNull(Reg: AddrDst))
2935 MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);
2936
2937 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
2938
2939 // If we created a new register for the destination, cast the result into
2940 // the final output.
2941 if (AddrDst != DstReg)
2942 B.buildCast(Dst: DstReg, Src: AddrDst);
2943 } else if (AddrLo != DstReg) {
2944 // If we created a new register for the destination, cast the result into
2945 // the final output.
2946 B.buildCast(Dst: DstReg, Src: AddrLo);
2947 }
2948}
2949
2950bool AMDGPULegalizerInfo::legalizeGlobalValue(
2951 MachineInstr &MI, MachineRegisterInfo &MRI,
2952 MachineIRBuilder &B) const {
2953 Register DstReg = MI.getOperand(i: 0).getReg();
2954 LLT Ty = MRI.getType(Reg: DstReg);
2955 unsigned AS = Ty.getAddressSpace();
2956
2957 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
2958 MachineFunction &MF = B.getMF();
2959 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2960
2961 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2962 if (!MFI->isModuleEntryFunction() &&
2963 GV->getName() != "llvm.amdgcn.module.lds") {
2964 const Function &Fn = MF.getFunction();
2965 DiagnosticInfoUnsupported BadLDSDecl(
2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2967 DS_Warning);
2968 Fn.getContext().diagnose(DI: BadLDSDecl);
2969
2970 // We currently don't have a way to correctly allocate LDS objects that
2971 // aren't directly associated with a kernel. We do force inlining of
2972 // functions that use local objects. However, if these dead functions are
2973 // not eliminated, we don't want a compile time error. Just emit a warning
2974 // and a trap, since there should be no callable path here.
2975 B.buildTrap();
2976 B.buildUndef(Res: DstReg);
2977 MI.eraseFromParent();
2978 return true;
2979 }
2980
2981 // TODO: We could emit code to handle the initialization somewhere.
2982 // We ignore the initializer for now and legalize it to allow selection.
2983 // Any initializer will be diagnosed as an error during assembly emission anyway.
2984 const SITargetLowering *TLI = ST.getTargetLowering();
2985 if (!TLI->shouldUseLDSConstAddress(GV)) {
2986 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2987 return true; // Leave in place;
2988 }
2989
2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991 Type *Ty = GV->getValueType();
2992 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2993 // zero-sized type in other languages, to declare dynamic shared memory
2994 // whose size is not known at compile time. It is allocated by the
2995 // runtime and placed directly after the statically allocated LDS, so
2996 // all such variables share the same offset.
2997 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2998 // Adjust alignment for that dynamic shared memory array.
2999 MFI->setDynLDSAlign(F: MF.getFunction(), GV: *cast<GlobalVariable>(Val: GV));
3000 LLT S32 = LLT::scalar(SizeInBits: 32);
3001 auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
3002 B.buildIntToPtr(Dst: DstReg, Src: Sz);
3003 MI.eraseFromParent();
3004 return true;
3005 }
3006 }
3007
3008 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(),
3009 GV: *cast<GlobalVariable>(Val: GV)));
3010 MI.eraseFromParent();
3011 return true;
3012 }
3013
3014 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3015 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
3016 MI.eraseFromParent();
3017 return true;
3018 }
3019
3020 const SITargetLowering *TLI = ST.getTargetLowering();
3021
3022 if (TLI->shouldEmitFixup(GV)) {
3023 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
3024 MI.eraseFromParent();
3025 return true;
3026 }
3027
3028 if (TLI->shouldEmitPCReloc(GV)) {
3029 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
3030 MI.eraseFromParent();
3031 return true;
3032 }
3033
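// Otherwise go through the GOT: materialize the address of the GOT entry
// pc-relatively, then load the real 64-bit address from it. A rough sketch of
// the final code (the load itself is only selected later):
//   s_getpc_b64    s[0:1]
//   s_add_u32      s0, s0, gv@gotpcrel32@lo
//   s_addc_u32     s1, s1, gv@gotpcrel32@hi
//   s_load_dwordx2 s[0:1], s[0:1], 0x0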
3034 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3035 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
3036
3037 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3038 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3039 PtrInfo: MachinePointerInfo::getGOT(MF),
3040 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3041 MachineMemOperand::MOInvariant,
3042 MemTy: LoadTy, base_alignment: Align(8));
3043
3044 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3045
3046 if (Ty.getSizeInBits() == 32) {
3047 // Truncate if this is a 32-bit constant address.
3048 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3049 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3050 } else
3051 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3052
3053 MI.eraseFromParent();
3054 return true;
3055}
3056
3057static LLT widenToNextPowerOf2(LLT Ty) {
3058 if (Ty.isVector())
3059 return Ty.changeElementCount(
3060 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3061 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3062}
3063
3064bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3065 MachineInstr &MI) const {
3066 MachineIRBuilder &B = Helper.MIRBuilder;
3067 MachineRegisterInfo &MRI = *B.getMRI();
3068 GISelChangeObserver &Observer = Helper.Observer;
3069
3070 Register PtrReg = MI.getOperand(i: 1).getReg();
3071 LLT PtrTy = MRI.getType(Reg: PtrReg);
3072 unsigned AddrSpace = PtrTy.getAddressSpace();
3073
3074 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3075 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3076 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3077 Observer.changingInstr(MI);
3078 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3079 Observer.changedInstr(MI);
3080 return true;
3081 }
3082
3083 if (MI.getOpcode() != AMDGPU::G_LOAD)
3084 return false;
3085
3086 Register ValReg = MI.getOperand(i: 0).getReg();
3087 LLT ValTy = MRI.getType(Reg: ValReg);
3088
3089 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3090 Observer.changingInstr(MI);
3091 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3092 Observer.changedInstr(MI);
3093 return true;
3094 }
3095
3096 MachineMemOperand *MMO = *MI.memoperands_begin();
3097 const unsigned ValSize = ValTy.getSizeInBits();
3098 const LLT MemTy = MMO->getMemoryType();
3099 const Align MemAlign = MMO->getAlign();
3100 const unsigned MemSize = MemTy.getSizeInBits();
3101 const uint64_t AlignInBits = 8 * MemAlign.value();
3102
3103 // Widen non-power-of-2 loads to the alignment if needed
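// e.g. (a sketch) a sufficiently aligned s96 load is widened to an s128 load,
// and the extra bits are dropped again below with G_TRUNC / G_EXTRACT /
// trailing-element deletion depending on the type.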
3104 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3105 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3106
3107 // This was already the correct extending load result type, so just adjust
3108 // the memory type.
3109 if (WideMemSize == ValSize) {
3110 MachineFunction &MF = B.getMF();
3111
3112 MachineMemOperand *WideMMO =
3113 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3114 Observer.changingInstr(MI);
3115 MI.setMemRefs(MF, MemRefs: {WideMMO});
3116 Observer.changedInstr(MI);
3117 return true;
3118 }
3119
3120 // Don't bother handling an edge case that should probably never be produced.
3121 if (ValSize > WideMemSize)
3122 return false;
3123
3124 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3125
3126 Register WideLoad;
3127 if (!WideTy.isVector()) {
3128 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3129 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3130 } else {
3131 // Extract the subvector.
3132
3133 if (isRegisterType(Ty: ValTy)) {
3134 // If this is a case where G_EXTRACT is legal, use it.
3135 // (e.g. <3 x s32> -> <4 x s32>)
3136 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3137 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3138 } else {
3139 // For cases where the widened type isn't a nice register value, unmerge
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3141 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3142 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3143 }
3144 }
3145
3146 MI.eraseFromParent();
3147 return true;
3148 }
3149
3150 return false;
3151}
3152
3153bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3154 MachineInstr &MI) const {
3155 MachineIRBuilder &B = Helper.MIRBuilder;
3156 MachineRegisterInfo &MRI = *B.getMRI();
3157 GISelChangeObserver &Observer = Helper.Observer;
3158
3159 Register DataReg = MI.getOperand(i: 0).getReg();
3160 LLT DataTy = MRI.getType(Reg: DataReg);
3161
3162 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3163 Observer.changingInstr(MI);
3164 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3165 Observer.changedInstr(MI);
3166 return true;
3167 }
3168 return false;
3169}
3170
3171bool AMDGPULegalizerInfo::legalizeFMad(
3172 MachineInstr &MI, MachineRegisterInfo &MRI,
3173 MachineIRBuilder &B) const {
3174 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3175 assert(Ty.isScalar());
3176
3177 MachineFunction &MF = B.getMF();
3178 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3179
3180 // TODO: Always legal with future ftz flag.
3181 // FIXME: Do we need just output?
3182 if (Ty == LLT::float32() &&
3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3184 return true;
3185 if (Ty == LLT::float16() &&
3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3187 return true;
3188
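// Otherwise denormals must be honored and the mad instructions flush them, so
// use the generic expansion (a sketch):
//   fmad x, y, z  ->  fadd (fmul x, y), z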
3189 MachineIRBuilder HelperBuilder(MI);
3190 GISelObserverWrapper DummyObserver;
3191 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3192 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3193}
3194
3195bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3197 Register DstReg = MI.getOperand(i: 0).getReg();
3198 Register PtrReg = MI.getOperand(i: 1).getReg();
3199 Register CmpVal = MI.getOperand(i: 2).getReg();
3200 Register NewVal = MI.getOperand(i: 3).getReg();
3201
3202 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3203 "this should not have been custom lowered");
3204
3205 LLT ValTy = MRI.getType(Reg: CmpVal);
3206 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3207
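// The target cmpxchg pseudo takes its two data operands packed into a single
// vector register, with the new value in element 0 and the compare value in
// element 1, e.g. for 32-bit operands (a sketch):
//   %packed:_(<2 x s32>) = G_BUILD_VECTOR %newval, %cmpval
//   %dst:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %packed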
3208 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3209
3210 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3211 .addDef(RegNo: DstReg)
3212 .addUse(RegNo: PtrReg)
3213 .addUse(RegNo: PackedVal)
3214 .setMemRefs(MI.memoperands());
3215
3216 MI.eraseFromParent();
3217 return true;
3218}
3219
3220/// Return true if it's known that \p Src can never be an f32 denormal value.
3221static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3222 Register Src) {
3223 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3224 switch (DefMI->getOpcode()) {
3225 case TargetOpcode::G_INTRINSIC: {
3226 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3227 case Intrinsic::amdgcn_frexp_mant:
3228 return true;
3229 default:
3230 break;
3231 }
3232
3233 break;
3234 }
3235 case TargetOpcode::G_FFREXP: {
3236 if (DefMI->getOperand(i: 0).getReg() == Src)
3237 return true;
3238 break;
3239 }
3240 case TargetOpcode::G_FPEXT: {
3241 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3242 }
3243 default:
3244 return false;
3245 }
3246
3247 return false;
3248}
3249
3250static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3251 if (Flags & MachineInstr::FmAfn)
3252 return true;
3253 const auto &Options = MF.getTarget().Options;
3254 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3255}
3256
3257static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3258 unsigned Flags) {
3259 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3260 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3261 DenormalMode::PreserveSign;
3262}
3263
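// If the f32 input may be a denormal, pre-scale it into the normal range and
// also return the comparison so the caller can undo the scale on the result
// (a sketch):
//   scaled = x * (x < 0x1.0p-126 ? 0x1.0p+32 : 1.0)
// Returns an empty pair if no scaling is needed.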
3264std::pair<Register, Register>
3265AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3266 unsigned Flags) const {
3267 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3268 return {};
3269
3270 const LLT F32 = LLT::scalar(SizeInBits: 32);
3271 auto SmallestNormal = B.buildFConstant(
3272 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3273 auto IsLtSmallestNormal =
3274 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3275
3276 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3277 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3278 auto ScaleFactor =
3279 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3280 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3281
3282 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3283}
3284
3285bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3286 MachineIRBuilder &B) const {
3287 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3288 // If we have to handle denormals, scale up the input and adjust the result.
3289
3290 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3292
3293 Register Dst = MI.getOperand(i: 0).getReg();
3294 Register Src = MI.getOperand(i: 1).getReg();
3295 LLT Ty = B.getMRI()->getType(Reg: Dst);
3296 unsigned Flags = MI.getFlags();
3297
3298 if (Ty == LLT::scalar(SizeInBits: 16)) {
3299 const LLT F32 = LLT::scalar(SizeInBits: 32);
3300 // Nothing in half is a denormal when promoted to f32.
3301 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3302 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
3303 .addUse(RegNo: Ext.getReg(Idx: 0))
3304 .setMIFlags(Flags);
3305 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3306 MI.eraseFromParent();
3307 return true;
3308 }
3309
3310 assert(Ty == LLT::scalar(32));
3311
3312 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3313 if (!ScaledInput) {
3314 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
3315 .addUse(RegNo: Src)
3316 .setMIFlags(Flags);
3317 MI.eraseFromParent();
3318 return true;
3319 }
3320
3321 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3322 .addUse(RegNo: ScaledInput)
3323 .setMIFlags(Flags);
3324
3325 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3326 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3327 auto ResultOffset =
3328 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3329 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3330
3331 MI.eraseFromParent();
3332 return true;
3333}
3334
3335static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3336 Register Z, unsigned Flags) {
3337 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3338 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3339}
3340
3341bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3342 MachineIRBuilder &B) const {
3343 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3344 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3345
3346 MachineRegisterInfo &MRI = *B.getMRI();
3347 Register Dst = MI.getOperand(i: 0).getReg();
3348 Register X = MI.getOperand(i: 1).getReg();
3349 unsigned Flags = MI.getFlags();
3350 const LLT Ty = MRI.getType(Reg: X);
3351 MachineFunction &MF = B.getMF();
3352
3353 const LLT F32 = LLT::scalar(SizeInBits: 32);
3354 const LLT F16 = LLT::scalar(SizeInBits: 16);
3355
3356 const AMDGPUTargetMachine &TM =
3357 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3358
3359 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn) ||
3360 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3361 if (Ty == F16 && !ST.has16BitInsts()) {
3362 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3363 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3364 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3365 B.buildFPTrunc(Res: Dst, Op: LogVal);
3366 } else {
3367 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3368 }
3369
3370 MI.eraseFromParent();
3371 return true;
3372 }
3373
3374 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3375 if (ScaledInput)
3376 X = ScaledInput;
3377
3378 auto Y =
3379 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);
3380
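// R approximates Y * ln(2) (for log) or Y * log10(2) (for log10), where Y is
// the hardware log2 of the possibly scaled input, computed in a split
// high/low form so the f32 product keeps extra precision.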
3381 Register R;
3382 if (ST.hasFastFMAF32()) {
3383 // c+cc are ln(2)/ln(10) to more than 49 bits
3384 const float c_log10 = 0x1.344134p-2f;
3385 const float cc_log10 = 0x1.09f79ep-26f;
3386
3387 // c + cc is ln(2) to more than 49 bits
3388 const float c_log = 0x1.62e42ep-1f;
3389 const float cc_log = 0x1.efa39ep-25f;
3390
3391 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3392 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3393
3394 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags).getReg(Idx: 0);
3395 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags);
3396 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags);
3397 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags);
3398 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags).getReg(Idx: 0);
3399 } else {
3400 // ch+ct is ln(2)/ln(10) to more than 36 bits
3401 const float ch_log10 = 0x1.344000p-2f;
3402 const float ct_log10 = 0x1.3509f6p-18f;
3403
3404 // ch + ct is ln(2) to more than 36 bits
3405 const float ch_log = 0x1.62e000p-1f;
3406 const float ct_log = 0x1.0bfbe8p-15f;
3407
3408 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3409 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3410
3411 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3412 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3413 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3414 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags);
3415
3416 Register Mad0 =
3417 getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags);
3418 Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags);
3419 R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags);
3420 }
3421
3422 const bool IsFiniteOnly =
3423 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3424 (MI.getFlag(Flag: MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3425
3426 if (!IsFiniteOnly) {
3427 // Expand isfinite(x) => fabs(x) < inf
3428 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3429 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3430 auto IsFinite =
3431 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3432 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
3433 }
3434
3435 if (ScaledInput) {
3436 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3437 auto ShiftK =
3438 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3439 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3440 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3441 } else {
3442 B.buildCopy(Res: Dst, Op: R);
3443 }
3444
3445 MI.eraseFromParent();
3446 return true;
3447}
3448
3449bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3450 Register Src, bool IsLog10,
3451 unsigned Flags) const {
3452 const double Log2BaseInverted =
3453 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3454
3455 LLT Ty = B.getMRI()->getType(Reg: Dst);
3456
3457 if (Ty == LLT::scalar(SizeInBits: 32)) {
3458 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3459 if (ScaledInput) {
3460 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3461 .addUse(RegNo: Src)
3462 .setMIFlags(Flags);
3463 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3464 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3465 auto ResultOffset =
3466 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3467 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3468
3469 if (ST.hasFastFMAF32())
3470 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3471 else {
3472 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3473 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3474 }
3475
3476 return true;
3477 }
3478 }
3479
3480 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3481 ? B.buildFLog2(Dst: Ty, Src, Flags)
3482 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3483 .addUse(RegNo: Src)
3484 .setMIFlags(Flags);
3485 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3486 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3487 return true;
3488}
3489
3490bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3491 MachineIRBuilder &B) const {
3492 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3493 // If we have to handle denormals, scale up the input and adjust the result.
3494
3495 Register Dst = MI.getOperand(i: 0).getReg();
3496 Register Src = MI.getOperand(i: 1).getReg();
3497 unsigned Flags = MI.getFlags();
3498 LLT Ty = B.getMRI()->getType(Reg: Dst);
3499 const LLT F16 = LLT::scalar(SizeInBits: 16);
3500 const LLT F32 = LLT::scalar(SizeInBits: 32);
3501
3502 if (Ty == F16) {
3503 // Nothing in half is a denormal when promoted to f32.
3504 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3505 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
3506 .addUse(RegNo: Ext.getReg(Idx: 0))
3507 .setMIFlags(Flags);
3508 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3509 MI.eraseFromParent();
3510 return true;
3511 }
3512
3513 assert(Ty == F32);
3514
3515 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3516 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3517 .addUse(RegNo: Src)
3518 .setMIFlags(Flags);
3519 MI.eraseFromParent();
3520 return true;
3521 }
3522
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3525
3526 // -nextafter(128.0, -1)
3527 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3528 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3529 Op1: RangeCheckConst, Flags);
3530
3531 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3532 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3533 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3534 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3535
3536 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3537 .addUse(RegNo: AddInput.getReg(Idx: 0))
3538 .setMIFlags(Flags);
3539
3540 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3541 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3542 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3543 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3544 MI.eraseFromParent();
3545 return true;
3546}
3547
3548bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3549 Register X, unsigned Flags) const {
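// Fast expansion: exp(x) ~= exp2(x * log2(e)). When f32 denormal results must
// be preserved, inputs whose result would be denormal (roughly x < ln(2^-126))
// are shifted up by 64 first and the result is rescaled afterwards (a sketch):
//   exp(x) = exp2((x + 64) * log2(e)) * 0x1.969d48p-93f   // ~= exp(-64)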
3550 LLT Ty = B.getMRI()->getType(Reg: Dst);
3551 LLT F32 = LLT::scalar(SizeInBits: 32);
3552
3553 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3554 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3555 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Log2E, Flags);
3556
3557 if (Ty == F32) {
3558 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3559 .addUse(RegNo: Mul.getReg(Idx: 0))
3560 .setMIFlags(Flags);
3561 } else {
3562 B.buildFExp2(Dst, Src: Mul.getReg(Idx: 0), Flags);
3563 }
3564
3565 return true;
3566 }
3567
3568 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3569 auto NeedsScaling =
3570 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3571 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3572 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3573 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3574
3575 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3576 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3577
3578 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3579 .addUse(RegNo: ExpInput.getReg(Idx: 0))
3580 .setMIFlags(Flags);
3581
3582 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3583 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3584 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3585 return true;
3586}
3587
3588bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3589 MachineIRBuilder &B) const {
3590 Register Dst = MI.getOperand(i: 0).getReg();
3591 Register X = MI.getOperand(i: 1).getReg();
3592 const unsigned Flags = MI.getFlags();
3593 MachineFunction &MF = B.getMF();
3594 MachineRegisterInfo &MRI = *B.getMRI();
3595 LLT Ty = MRI.getType(Reg: Dst);
3596 const LLT F16 = LLT::scalar(SizeInBits: 16);
3597 const LLT F32 = LLT::scalar(SizeInBits: 32);
3598 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3599
3600 if (Ty == F16) {
3601 // v_exp_f16 (fmul x, log2e)
3602 if (allowApproxFunc(MF, Flags)) {
3603 // TODO: Does this really require fast?
3604 legalizeFExpUnsafe(B, Dst, X, Flags);
3605 MI.eraseFromParent();
3606 return true;
3607 }
3608
3609 // exp(f16 x) ->
3610 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3611
3612 // Nothing in half is a denormal when promoted to f32.
3613 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3614 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3615 legalizeFExpUnsafe(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags);
3616 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3617 MI.eraseFromParent();
3618 return true;
3619 }
3620
3621 assert(Ty == F32);
3622
3623 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3624 // library behavior. Also, is known-not-daz source sufficient?
3625 if (allowApproxFunc(MF, Flags)) {
3626 legalizeFExpUnsafe(B, Dst, X, Flags);
3627 MI.eraseFromParent();
3628 return true;
3629 }
3630
3631 // Algorithm:
3632 //
3633 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3634 //
3635 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3636 // n = 64*m + j, 0 <= j < 64
3637 //
3638 // e^x = 2^((64*m + j + f)/64)
3639 // = (2^m) * (2^(j/64)) * 2^(f/64)
3640 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3641 //
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3644 //
3645 // e^x = (2^m) * (2^(j/64)) * e^r
3646 //
3647 // (2^(j/64)) is precomputed
3648 //
3649 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3650 // e^r = 1 + q
3651 //
3652 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3653 //
3654 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3655 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3656 Register PH, PL;
3657
3658 if (ST.hasFastFMAF32()) {
3659 const float c_exp = numbers::log2ef;
3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3661 const float c_exp10 = 0x1.a934f0p+1f;
3662 const float cc_exp10 = 0x1.2f346ep-24f;
3663
3664 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3665 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3666 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3667 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3668
3669 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3670 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3671 } else {
3672 const float ch_exp = 0x1.714000p+0f;
3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3674
3675 const float ch_exp10 = 0x1.a92000p+1f;
3676 const float cl_exp10 = 0x1.4f0978p-11f;
3677
3678 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3679 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3680 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3681
3682 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3683 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3684
3685 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3686 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3687
3688 Register Mad0 =
3689 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3690 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3691 }
3692
3693 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3694
3695 // It is unsafe to contract this fsub into the PH multiply.
3696 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3697 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3698 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3699
3700 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3701 .addUse(RegNo: A.getReg(Idx: 0))
3702 .setMIFlags(Flags);
3703 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3704
3705 auto UnderflowCheckConst =
3706 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3707 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3708 auto Underflow =
3709 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3710
3711 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3712
3713 const auto &Options = MF.getTarget().Options;
3714
3715 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3716 auto OverflowCheckConst =
3717 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3718
3719 auto Overflow =
3720 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3721 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3722 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3723 }
3724
3725 B.buildCopy(Res: Dst, Op: R);
3726 MI.eraseFromParent();
3727 return true;
3728}
3729
3730bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3731 MachineIRBuilder &B) const {
3732 Register Dst = MI.getOperand(i: 0).getReg();
3733 Register Src0 = MI.getOperand(i: 1).getReg();
3734 Register Src1 = MI.getOperand(i: 2).getReg();
3735 unsigned Flags = MI.getFlags();
3736 LLT Ty = B.getMRI()->getType(Reg: Dst);
3737 const LLT F16 = LLT::float16();
3738 const LLT F32 = LLT::float32();
3739
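// pow is expanded as exp2(y * log2(x)). The legacy multiply, which defines
// 0 * anything = 0, keeps pow(x, 0) == 1 even when log2(x) is an infinity or
// NaN (a sketch):
//   pow(x, y) = exp2(fmul_legacy(log2(x), y))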
3740 if (Ty == F32) {
3741 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
3742 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3743 .addUse(RegNo: Log.getReg(Idx: 0))
3744 .addUse(RegNo: Src1)
3745 .setMIFlags(Flags);
3746 B.buildFExp2(Dst, Src: Mul, Flags);
3747 } else if (Ty == F16) {
3748 // There's no f16 fmul_legacy, so we need to convert for it.
3749 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
3750 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
3751 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
3752 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3753 .addUse(RegNo: Ext0.getReg(Idx: 0))
3754 .addUse(RegNo: Ext1.getReg(Idx: 0))
3755 .setMIFlags(Flags);
3756 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
3757 } else
3758 return false;
3759
3760 MI.eraseFromParent();
3761 return true;
3762}
3763
3764// Find a source register, ignoring any possible source modifiers.
3765static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3766 Register ModSrc = OrigSrc;
3767 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
3768 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
3769 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3770 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3771 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3772 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3773 return ModSrc;
3774}
3775
3776bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3777 MachineRegisterInfo &MRI,
3778 MachineIRBuilder &B) const {
3779
3780 const LLT S1 = LLT::scalar(SizeInBits: 1);
3781 const LLT F64 = LLT::float64();
3782 Register Dst = MI.getOperand(i: 0).getReg();
3783 Register OrigSrc = MI.getOperand(i: 1).getReg();
3784 unsigned Flags = MI.getFlags();
3785 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3786 "this should not have been custom lowered");
3787
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3789 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3790 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3791 // V_FRACT bug is:
3792 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3793 //
3794 // Convert floor(x) to (x - fract(x))
3795
3796 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
3797 .addUse(RegNo: OrigSrc)
3798 .setMIFlags(Flags);
3799
3800 // Give source modifier matching some assistance before obscuring a foldable
3801 // pattern.
3802
3803 // TODO: We can avoid the neg on the fract? The input sign to fract
3804 // shouldn't matter?
3805 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3806
3807 auto Const =
3808 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
3809
3810 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
3811
3812 // We don't need to concern ourselves with the snan handling difference, so
3813 // use the one which will directly select.
3814 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3815 if (MFI->getMode().IEEE)
3816 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
3817 else
3818 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
3819
3820 Register CorrectedFract = Min;
3821 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
3822 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
3823 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
3824 }
3825
3826 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
3827 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
3828
3829 MI.eraseFromParent();
3830 return true;
3831}
3832
3833// Turn an illegal packed v2s16 build vector into bit operations.
3834// TODO: This should probably be a bitcast action in LegalizerHelper.
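// e.g. (a sketch)
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// is rewritten as
//   %m:_(s32)       = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)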
3835bool AMDGPULegalizerInfo::legalizeBuildVector(
3836 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3837 Register Dst = MI.getOperand(i: 0).getReg();
3838 const LLT S32 = LLT::scalar(SizeInBits: 32);
3839 const LLT S16 = LLT::scalar(SizeInBits: 16);
3840 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3841
3842 Register Src0 = MI.getOperand(i: 1).getReg();
3843 Register Src1 = MI.getOperand(i: 2).getReg();
3844
3845 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3846 assert(MRI.getType(Src0) == S32);
3847 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
3848 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
3849 }
3850
3851 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
3852 B.buildBitcast(Dst, Src: Merge);
3853
3854 MI.eraseFromParent();
3855 return true;
3856}
3857
3858// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3859//
3860// Source and accumulation registers must all be 32 bits wide.
3861//
3862// TODO: When the multiply is uniform, we should produce a code sequence
3863// that is better suited to instruction selection on the SALU. Instead of
3864// the outer loop going over parts of the result, the outer loop should go
3865// over parts of one of the factors. This should result in instruction
3866// selection that makes full use of S_ADDC_U32 instructions.
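// For example, a 64 x 64 -> 64 bit multiply (two 32-bit parts per operand)
// accumulates its partial products roughly as follows, with carries beyond the
// truncated result discarded (a sketch):
//   Accum[0] = lo(Src0[0]*Src1[0])
//   Accum[1] = hi(Src0[0]*Src1[0]) + lo(Src0[0]*Src1[1]) + lo(Src0[1]*Src1[0])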
3867void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3868 MutableArrayRef<Register> Accum,
3869 ArrayRef<Register> Src0,
3870 ArrayRef<Register> Src1,
3871 bool UsePartialMad64_32,
3872 bool SeparateOddAlignedProducts) const {
3873 // Use (possibly empty) vectors of S1 registers to represent the set of
3874 // carries from one pair of positions to the next.
3875 using Carry = SmallVector<Register, 2>;
3876
3877 MachineIRBuilder &B = Helper.MIRBuilder;
3878 GISelKnownBits &KB = *Helper.getKnownBits();
3879
3880 const LLT S1 = LLT::scalar(SizeInBits: 1);
3881 const LLT S32 = LLT::scalar(SizeInBits: 32);
3882 const LLT S64 = LLT::scalar(SizeInBits: 64);
3883
3884 Register Zero32;
3885 Register Zero64;
3886
3887 auto getZero32 = [&]() -> Register {
3888 if (!Zero32)
3889 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
3890 return Zero32;
3891 };
3892 auto getZero64 = [&]() -> Register {
3893 if (!Zero64)
3894 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
3895 return Zero64;
3896 };
3897
3898 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3899 for (unsigned i = 0; i < Src0.size(); ++i) {
3900 Src0KnownZeros.push_back(Elt: KB.getKnownBits(R: Src0[i]).isZero());
3901 Src1KnownZeros.push_back(Elt: KB.getKnownBits(R: Src1[i]).isZero());
3902 }
3903
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3905 // in-place.
3906 //
3907 // Returns the carry-out, which is a single S1 register or null.
3908 auto mergeCarry =
3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3910 if (CarryIn.empty())
3911 return Register();
3912
3913 bool HaveCarryOut = true;
3914 Register CarryAccum;
3915 if (CarryIn.size() == 1) {
3916 if (!LocalAccum) {
3917 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3918 return Register();
3919 }
3920
3921 CarryAccum = getZero32();
3922 } else {
3923 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3924 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3925 CarryAccum =
3926 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
3927 .getReg(Idx: 0);
3928 }
3929
3930 if (!LocalAccum) {
3931 LocalAccum = getZero32();
3932 HaveCarryOut = false;
3933 }
3934 }
3935
3936 auto Add =
3937 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
3938 LocalAccum = Add.getReg(Idx: 0);
3939 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
3940 };
3941
3942 // Build a multiply-add chain to compute
3943 //
3944 // LocalAccum + (partial products at DstIndex)
3945 // + (opportunistic subset of CarryIn)
3946 //
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3949 //
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3952 auto buildMadChain =
3953 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3954 -> Carry {
3955 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3956 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3957
3958 Carry CarryOut;
3959 unsigned j0 = 0;
3960
3961 // Use plain 32-bit multiplication for the most significant part of the
3962 // result by default.
3963 if (LocalAccum.size() == 1 &&
3964 (!UsePartialMad64_32 || !CarryIn.empty())) {
3965 do {
3966 // Skip multiplication if one of the operands is 0
3967 unsigned j1 = DstIndex - j0;
3968 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3969 ++j0;
3970 continue;
3971 }
3972 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
3973 if (!LocalAccum[0] || KB.getKnownBits(R: LocalAccum[0]).isZero()) {
3974 LocalAccum[0] = Mul.getReg(Idx: 0);
3975 } else {
3976 if (CarryIn.empty()) {
3977 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
3978 } else {
3979 LocalAccum[0] =
3980 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
3981 .getReg(Idx: 0);
3982 CarryIn.pop_back();
3983 }
3984 }
3985 ++j0;
3986 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3987 }
3988
3989 // Build full 64-bit multiplies.
3990 if (j0 <= DstIndex) {
3991 bool HaveSmallAccum = false;
3992 Register Tmp;
3993
3994 if (LocalAccum[0]) {
3995 if (LocalAccum.size() == 1) {
3996 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
3997 HaveSmallAccum = true;
3998 } else if (LocalAccum[1]) {
3999 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
4000 HaveSmallAccum = false;
4001 } else {
4002 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4003 HaveSmallAccum = true;
4004 }
4005 } else {
4006 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4007 Tmp = getZero64();
4008 HaveSmallAccum = true;
4009 }
4010
4011 do {
4012 unsigned j1 = DstIndex - j0;
4013 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4014 ++j0;
4015 continue;
4016 }
4017 auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
4018 SrcOps: {Src0[j0], Src1[j1], Tmp});
4019 Tmp = Mad.getReg(Idx: 0);
4020 if (!HaveSmallAccum)
4021 CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
4022 HaveSmallAccum = false;
4023
4024 ++j0;
4025 } while (j0 <= DstIndex);
4026
4027 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
4028 LocalAccum[0] = Unmerge.getReg(Idx: 0);
4029 if (LocalAccum.size() > 1)
4030 LocalAccum[1] = Unmerge.getReg(Idx: 1);
4031 }
4032
4033 return CarryOut;
4034 };
4035
4036 // Outer multiply loop, iterating over destination parts from least
4037 // significant to most significant parts.
4038 //
4039 // The columns of the following diagram correspond to the destination parts
4040 // affected by one iteration of the outer loop (ignoring boundary
4041 // conditions).
4042 //
4043 // Dest index relative to 2 * i: 1 0 -1
4044 // ------
4045 // Carries from previous iteration: e o
4046 // Even-aligned partial product sum: E E .
4047 // Odd-aligned partial product sum: O O
4048 //
4049 // 'o' is OddCarry, 'e' is EvenCarry.
4050 // EE and OO are computed from partial products via buildMadChain and use
4051 // accumulation where possible and appropriate.
4052 //
4053 Register SeparateOddCarry;
4054 Carry EvenCarry;
4055 Carry OddCarry;
4056
4057 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4058 Carry OddCarryIn = std::move(OddCarry);
4059 Carry EvenCarryIn = std::move(EvenCarry);
4060 OddCarry.clear();
4061 EvenCarry.clear();
4062
4063 // Partial products at offset 2 * i.
4064 if (2 * i < Accum.size()) {
4065 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4066 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4067 }
4068
4069 // Partial products at offset 2 * i - 1.
4070 if (i > 0) {
4071 if (!SeparateOddAlignedProducts) {
4072 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4074 } else {
4075 bool IsHighest = 2 * i >= Accum.size();
4076 Register SeparateOddOut[2];
4077 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4078 .take_front(N: IsHighest ? 1 : 2);
4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4080
4081 MachineInstr *Lo;
4082
4083 if (i == 1) {
4084 if (!IsHighest)
4085 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4086 else
4087 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4088 } else {
4089 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4090 CarryIn: SeparateOddCarry);
4091 }
4092 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4093
4094 if (!IsHighest) {
4095 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4096 CarryIn: Lo->getOperand(i: 1).getReg());
4097 Accum[2 * i] = Hi.getReg(Idx: 0);
4098 SeparateOddCarry = Hi.getReg(Idx: 1);
4099 }
4100 }
4101 }
4102
4103 // Add in the carries from the previous iteration
4104 if (i > 0) {
4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4106 EvenCarryIn.push_back(Elt: CarryOut);
4107
4108 if (2 * i < Accum.size()) {
4109 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4110 OddCarry.push_back(Elt: CarryOut);
4111 }
4112 }
4113 }
4114}
4115
4116// Custom narrowing of wide multiplies using wide multiply-add instructions.
4117//
4118// TODO: If the multiply is followed by an addition, we should attempt to
4119// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4120bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4121 MachineInstr &MI) const {
4122 assert(ST.hasMad64_32());
4123 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4124
4125 MachineIRBuilder &B = Helper.MIRBuilder;
4126 MachineRegisterInfo &MRI = *B.getMRI();
4127
4128 Register DstReg = MI.getOperand(i: 0).getReg();
4129 Register Src0 = MI.getOperand(i: 1).getReg();
4130 Register Src1 = MI.getOperand(i: 2).getReg();
4131
4132 LLT Ty = MRI.getType(Reg: DstReg);
4133 assert(Ty.isScalar());
4134
4135 unsigned Size = Ty.getSizeInBits();
4136 unsigned NumParts = Size / 32;
4137 assert((Size % 32) == 0);
4138 assert(NumParts >= 2);
4139
4140 // Whether to use MAD_64_32 for partial products whose high half is
4141 // discarded. This avoids some ADD instructions but risks false dependency
4142 // stalls on some subtargets in some cases.
4143 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4144
4145 // Whether to compute odd-aligned partial products separately. This is
4146 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4147 // in an even-aligned VGPR.
4148 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4149
4150 LLT S32 = LLT::scalar(SizeInBits: 32);
4151 SmallVector<Register, 2> Src0Parts, Src1Parts;
4152 for (unsigned i = 0; i < NumParts; ++i) {
4153 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4154 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4155 }
4156 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4157 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4158
4159 SmallVector<Register, 2> AccumRegs(NumParts);
4160 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4161 SeparateOddAlignedProducts);
4162
4163 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4164 MI.eraseFromParent();
4165 return true;
4166}
4167
4168// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170// case with a single min instruction instead of a compare+select.
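// e.g. (a sketch) for a 32-bit G_CTLZ:
//   %t:_(s32) = G_AMDGPU_FFBH_U32 %src   ; yields -1 (all ones) for a zero input
//   %d:_(s32) = G_UMIN %t, 32            ; so the zero case becomes 32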
4171bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4172 MachineRegisterInfo &MRI,
4173 MachineIRBuilder &B) const {
4174 Register Dst = MI.getOperand(i: 0).getReg();
4175 Register Src = MI.getOperand(i: 1).getReg();
4176 LLT DstTy = MRI.getType(Reg: Dst);
4177 LLT SrcTy = MRI.getType(Reg: Src);
4178
4179 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4180 ? AMDGPU::G_AMDGPU_FFBH_U32
4181 : AMDGPU::G_AMDGPU_FFBL_B32;
4182 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4183 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4184
4185 MI.eraseFromParent();
4186 return true;
4187}
4188
4189bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4190 MachineRegisterInfo &MRI,
4191 MachineIRBuilder &B) const {
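// For sub-32-bit sources, count on a 32-bit register with the value shifted
// into the high bits so the anyext'd garbage cannot affect the count
// (a sketch, for s16):
//   ctlz_zero_undef(x) = trunc(ffbh_u32(anyext(x) << 16))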
4192 Register Dst = MI.getOperand(i: 0).getReg();
4193 Register Src = MI.getOperand(i: 1).getReg();
4194 LLT SrcTy = MRI.getType(Reg: Src);
4195 TypeSize NumBits = SrcTy.getSizeInBits();
4196
4197 assert(NumBits < 32u);
4198
4199 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4200 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4201 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4202 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4203 B.buildTrunc(Res: Dst, Op: Ctlz);
4204 MI.eraseFromParent();
4205 return true;
4206}
4207
4208// Check that this is a G_XOR x, -1
4209static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4210 if (MI.getOpcode() != TargetOpcode::G_XOR)
4211 return false;
4212 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4213 return ConstVal && *ConstVal == -1;
4214}
4215
4216// Return the use branch instruction, or null if the usage is invalid.
4217static MachineInstr *
4218verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4219 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4220 Register CondDef = MI.getOperand(i: 0).getReg();
4221 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4222 return nullptr;
4223
4224 MachineBasicBlock *Parent = MI.getParent();
4225 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4226
4227 if (isNot(MRI, MI: *UseMI)) {
4228 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4229 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4230 return nullptr;
4231
4232 // We're deleting the def of this value, so we need to remove it.
4233 eraseInstr(MI&: *UseMI, MRI);
4234
4235 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4236 Negated = true;
4237 }
4238
4239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4240 return nullptr;
4241
4242 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4244 if (Next == Parent->end()) {
4245 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4247 return nullptr;
4248 UncondBrTarget = &*NextMBB;
4249 } else {
4250 if (Next->getOpcode() != AMDGPU::G_BR)
4251 return nullptr;
4252 Br = &*Next;
4253 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4254 }
4255
4256 return UseMI;
4257}
4258
4259bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4260 const ArgDescriptor *Arg,
4261 const TargetRegisterClass *ArgRC,
4262 LLT ArgTy) const {
4263 MCRegister SrcReg = Arg->getRegister();
4264 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4265 assert(DstReg.isVirtual() && "Virtual register expected");
4266
4267 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4268 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4269 if (Arg->isMasked()) {
4270 // TODO: Should we try to emit this once in the entry block?
4271 const LLT S32 = LLT::scalar(SizeInBits: 32);
4272 const unsigned Mask = Arg->getMask();
4273 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4274
4275 Register AndMaskSrc = LiveIn;
4276
4277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4278 // 0.
4279 if (Shift != 0) {
4280 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4281 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4282 }
4283
4284 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4285 } else {
4286 B.buildCopy(Res: DstReg, Op: LiveIn);
4287 }
4288
4289 return true;
4290}
4291
4292bool AMDGPULegalizerInfo::loadInputValue(
4293 Register DstReg, MachineIRBuilder &B,
4294 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4295 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4296 const ArgDescriptor *Arg = nullptr;
4297 const TargetRegisterClass *ArgRC;
4298 LLT ArgTy;
4299
4300 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4301 const ArgDescriptor WorkGroupIDX =
4302 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
4303 // If GridZ is not programmed in an entry function then the hardware will set
4304 // it to all zeros, so there is no need to mask the GridY value in the low
4305 // order bits.
4306 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4307 Reg: AMDGPU::TTMP7,
4308 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4309 const ArgDescriptor WorkGroupIDZ =
4310 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
4311 if (ST.hasArchitectedSGPRs() &&
4312 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4313 switch (ArgType) {
4314 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4315 Arg = &WorkGroupIDX;
4316 ArgRC = &AMDGPU::SReg_32RegClass;
4317 ArgTy = LLT::scalar(SizeInBits: 32);
4318 break;
4319 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4320 Arg = &WorkGroupIDY;
4321 ArgRC = &AMDGPU::SReg_32RegClass;
4322 ArgTy = LLT::scalar(SizeInBits: 32);
4323 break;
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4325 Arg = &WorkGroupIDZ;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4327 ArgTy = LLT::scalar(SizeInBits: 32);
4328 break;
4329 default:
4330 break;
4331 }
4332 }
4333
4334 if (!Arg)
4335 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4336
4337 if (!Arg) {
4338 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4339 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4340 // case the pointer argument may be missing and we use null.
4341 B.buildConstant(Res: DstReg, Val: 0);
4342 return true;
4343 }
4344
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4346 // attributes uses the corresponding intrinsic.
4347 B.buildUndef(Res: DstReg);
4348 return true;
4349 }
4350
4351 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4352 return false; // TODO: Handle these
4353 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4354}
4355
4356bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4357 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4358 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4359 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4360 return false;
4361
4362 MI.eraseFromParent();
4363 return true;
4364}
4365
4366static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4367 int64_t C) {
4368 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4369 MI.eraseFromParent();
4370 return true;
4371}
4372
4373bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4374 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4375 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4376 unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
4377 if (MaxID == 0)
4378 return replaceWithConstant(B, MI, C: 0);
4379
4380 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4381 const ArgDescriptor *Arg;
4382 const TargetRegisterClass *ArgRC;
4383 LLT ArgTy;
4384 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4385
4386 Register DstReg = MI.getOperand(i: 0).getReg();
4387 if (!Arg) {
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4389 // attributes uses the corresponding intrinsic.
4390 B.buildUndef(Res: DstReg);
4391 MI.eraseFromParent();
4392 return true;
4393 }
4394
4395 if (Arg->isMasked()) {
4396 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4397 // masking operations anyway.
4398 //
4399 // TODO: We could assert the top bit is 0 for the source copy.
4400 if (!loadInputValue(DstReg, B, ArgType))
4401 return false;
4402 } else {
4403 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4404 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4405 return false;
4406 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4407 }
4408
4409 MI.eraseFromParent();
4410 return true;
4411}
4412
4413Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4414 int64_t Offset) const {
4415 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4417
4418 // TODO: If we passed in the base kernel offset we could have a better
4419 // alignment than 4, but we don't really need it.
4420 if (!loadInputValue(DstReg: KernArgReg, B,
4421 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4422 llvm_unreachable("failed to find kernarg segment ptr");
4423
4424 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4425 // TODO: Should get nuw
4426 return B.buildPtrAdd(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4427}
4428
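// A rough sketch of what the helpers above and below combine to emit for a
// 32-bit parameter at byte offset `Offset` (register names are illustrative):
//   %kernarg:_(p4) = <live-in kernarg segment pointer>
//   %addr:_(p4)    = G_PTR_ADD %kernarg, Offset
//   %val:_(s32)    = G_LOAD %addr   ; dereferenceable, invariant, align 4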
4429/// Legalize a value that's loaded from kernel arguments. This is only used by
4430/// legacy intrinsics.
4431bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4432 MachineIRBuilder &B,
4433 uint64_t Offset,
4434 Align Alignment) const {
4435 Register DstReg = MI.getOperand(i: 0).getReg();
4436
4437 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4438 "unexpected kernarg parameter type");
4439
4440 Register Ptr = getKernargParameterPtr(B, Offset);
4441 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4442 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo, Alignment: Align(4),
4443 MMOFlags: MachineMemOperand::MODereferenceable |
4444 MachineMemOperand::MOInvariant);
4445 MI.eraseFromParent();
4446 return true;
4447}
4448
4449bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4450 MachineRegisterInfo &MRI,
4451 MachineIRBuilder &B) const {
4452 Register Dst = MI.getOperand(i: 0).getReg();
4453 LLT DstTy = MRI.getType(Reg: Dst);
4454 LLT S16 = LLT::scalar(SizeInBits: 16);
4455 LLT S32 = LLT::scalar(SizeInBits: 32);
4456 LLT S64 = LLT::scalar(SizeInBits: 64);
4457
4458 if (DstTy == S16)
4459 return legalizeFDIV16(MI, MRI, B);
4460 if (DstTy == S32)
4461 return legalizeFDIV32(MI, MRI, B);
4462 if (DstTy == S64)
4463 return legalizeFDIV64(MI, MRI, B);
4464
4465 return false;
4466}
4467
4468void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4469 Register DstDivReg,
4470 Register DstRemReg,
4471 Register X,
4472 Register Y) const {
4473 const LLT S1 = LLT::scalar(SizeInBits: 1);
4474 const LLT S32 = LLT::scalar(SizeInBits: 32);
4475
4476 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4477 // algorithm used here.
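  // In outline (a sketch of the math, not the exact instruction sequence):
  //   z ~= 2^32 / y                      ; from v_rcp_iflag_f32 plus scaling
  //   z += umulh(z, z * -y)              ; one Newton-Raphson style refinement
  //   q  = umulh(x, z); r = x - q * y    ; initial quotient/remainder estimate
  //   if (r >= y) { q += 1; r -= y; }    ; applied twice below, via selects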
4478
4479 // Initial estimate of inv(y).
4480 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4481 auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
4482 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4483 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4484 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4485
4486 // One round of UNR (unsigned Newton-Raphson) refinement.
4487 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4488 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4489 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4490
4491 // Quotient/remainder estimate.
4492 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4493 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4494
4495 // First quotient/remainder refinement.
4496 auto One = B.buildConstant(Res: S32, Val: 1);
4497 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4498 if (DstDivReg)
4499 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4500 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4501
4502 // Second quotient/remainder refinement.
4503 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4504 if (DstDivReg)
4505 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4506
4507 if (DstRemReg)
4508 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4509}
4510
4511// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4512//
4513// Return lo, hi of result
4514//
4515// %cvt.lo = G_UITOFP Val.lo
4516// %cvt.hi = G_UITOFP Val.hi
4517// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4518// %rcp = G_AMDGPU_RCP_IFLAG %mad
4519// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4520// %mul2 = G_FMUL %mul1, 2**(-32)
4521// %trunc = G_INTRINSIC_TRUNC %mul2
4522// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4523// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4524static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4525 Register Val) {
4526 const LLT S32 = LLT::scalar(SizeInBits: 32);
4527 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4528
4529 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4530 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4531
4532 auto Mad = B.buildFMAD(
4533 Dst: S32, Src0: CvtHi, // 2**32
4534 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4535
4536 auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
4537 auto Mul1 = B.buildFMul(
4538 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4539
4540 // 2**(-32)
4541 auto Mul2 = B.buildFMul(
4542 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4543 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4544
4545 // -(2**32)
4546 auto Mad2 = B.buildFMAD(
4547 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4548 Src2: Mul1);
4549
4550 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4551 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4552
4553 return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
4554}
4555
4556void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4557 Register DstDivReg,
4558 Register DstRemReg,
4559 Register Numer,
4560 Register Denom) const {
4561 const LLT S32 = LLT::scalar(SizeInBits: 32);
4562 const LLT S64 = LLT::scalar(SizeInBits: 64);
4563 const LLT S1 = LLT::scalar(SizeInBits: 1);
4564 Register RcpLo, RcpHi;
4565
4566 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4567
4568 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4569
4570 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4571 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4572
4573 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4574 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4575
4576 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4577 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4578 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4579
4580 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4581 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4582 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4583
4584 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4585 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
4586 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
4587 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
4588 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
4589
4590 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
4591 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
4592 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
4593 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
4594
4595 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
4596 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
4597 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
4598
4599 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
4600 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
4601 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
4602 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
4603 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
4604 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
4605 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4606 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
4607 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
4608
4609 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
4610 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
4611 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
4612
4613 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4614 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
4615
4616 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
4617 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
4618
4619 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4620 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
4621
4622 // TODO: Here and below, portions of the code could be enclosed in if/endif
4623 // blocks. Currently the control flow is unconditional and we use 4 selects
4624 // after the potential endif in place of PHIs.
4625
4626 // if C3 != 0 ...
4627 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
4628 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4629 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
4630 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
4631
4632 auto One64 = B.buildConstant(Res: S64, Val: 1);
4633 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
4634
4635 auto C4 =
4636 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
4637 auto C5 =
4638 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
4639 auto C6 = B.buildSelect(
4640 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
4641
4642 // if (C6 != 0)
4643 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
4644 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
4645
4646 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
4647 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
4648 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
4649
4650 // endif C6
4651 // endif C3
4652
4653 if (DstDivReg) {
4654 auto Sel1 = B.buildSelect(
4655 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
4656 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4657 Op0: Sel1, Op1: MulHi3);
4658 }
4659
4660 if (DstRemReg) {
4661 auto Sel2 = B.buildSelect(
4662 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
4663 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4664 Op0: Sel2, Op1: Sub1);
4665 }
4666}
4667
4668bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4669 MachineRegisterInfo &MRI,
4670 MachineIRBuilder &B) const {
4671 Register DstDivReg, DstRemReg;
4672 switch (MI.getOpcode()) {
4673 default:
4674 llvm_unreachable("Unexpected opcode!");
4675 case AMDGPU::G_UDIV: {
4676 DstDivReg = MI.getOperand(i: 0).getReg();
4677 break;
4678 }
4679 case AMDGPU::G_UREM: {
4680 DstRemReg = MI.getOperand(i: 0).getReg();
4681 break;
4682 }
4683 case AMDGPU::G_UDIVREM: {
4684 DstDivReg = MI.getOperand(i: 0).getReg();
4685 DstRemReg = MI.getOperand(i: 1).getReg();
4686 break;
4687 }
4688 }
4689
4690 const LLT S64 = LLT::scalar(SizeInBits: 64);
4691 const LLT S32 = LLT::scalar(SizeInBits: 32);
4692 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4693 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
4694 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4695 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4696
4697 if (Ty == S32)
4698 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
4699 else if (Ty == S64)
4700 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
4701 else
4702 return false;
4703
4704 MI.eraseFromParent();
4705 return true;
4706}
4707
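// Signed div/rem is reduced to the unsigned expansion above. A sketch of the
// identity used (illustrative, for two's complement values):
//   sign_x = x >> (bits - 1)             ; all-ones if negative, else zero
//   |x|    = (x + sign_x) ^ sign_x
//   q      = negate(|x| u/ |y|) iff sign_x != sign_y
//   r      = negate(|x| u% |y|) iff sign_x != 0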
4708bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4709 MachineRegisterInfo &MRI,
4710 MachineIRBuilder &B) const {
4711 const LLT S64 = LLT::scalar(SizeInBits: 64);
4712 const LLT S32 = LLT::scalar(SizeInBits: 32);
4713
4714 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4715 if (Ty != S32 && Ty != S64)
4716 return false;
4717
4718 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4719 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
4720 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4721
4722 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
4723 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
4724 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
4725
4726 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4727 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4728
4729 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4730 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4731
4732 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4733 switch (MI.getOpcode()) {
4734 default:
4735 llvm_unreachable("Unexpected opcode!");
4736 case AMDGPU::G_SDIV: {
4737 DstDivReg = MI.getOperand(i: 0).getReg();
4738 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4739 break;
4740 }
4741 case AMDGPU::G_SREM: {
4742 DstRemReg = MI.getOperand(i: 0).getReg();
4743 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4744 break;
4745 }
4746 case AMDGPU::G_SDIVREM: {
4747 DstDivReg = MI.getOperand(i: 0).getReg();
4748 DstRemReg = MI.getOperand(i: 1).getReg();
4749 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4750 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4751 break;
4752 }
4753 }
4754
4755 if (Ty == S32)
4756 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
4757 else
4758 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
4759
4760 if (DstDivReg) {
4761 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
4762 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
4763 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
4764 }
4765
4766 if (DstRemReg) {
4767 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
4768 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
4769 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
4770 }
4771
4772 MI.eraseFromParent();
4773 return true;
4774}
4775
4776bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4777 MachineRegisterInfo &MRI,
4778 MachineIRBuilder &B) const {
4779 Register Res = MI.getOperand(i: 0).getReg();
4780 Register LHS = MI.getOperand(i: 1).getReg();
4781 Register RHS = MI.getOperand(i: 2).getReg();
4782 uint16_t Flags = MI.getFlags();
4783 LLT ResTy = MRI.getType(Reg: Res);
4784
4785 const MachineFunction &MF = B.getMF();
4786 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn) ||
4787 MF.getTarget().Options.UnsafeFPMath;
4788
4789 if (auto CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
4790 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
4791 return false;
4792
4793 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4794 // the CI documentation, have a worst-case error of 1 ulp.
4795 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4796 // use it as long as we aren't trying to use denormals.
4797 //
4798 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
4799
4800 // 1 / x -> RCP(x)
4801 if (CLHS->isExactlyValue(V: 1.0)) {
4802 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4803 .addUse(RegNo: RHS)
4804 .setMIFlags(Flags);
4805
4806 MI.eraseFromParent();
4807 return true;
4808 }
4809
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS->isExactlyValue(V: -1.0)) {
4812 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
4813 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4814 .addUse(RegNo: FNeg.getReg(Idx: 0))
4815 .setMIFlags(Flags);
4816
4817 MI.eraseFromParent();
4818 return true;
4819 }
4820 }
4821
4822 // For f16 require afn or arcp.
4823 // For f32 require afn.
4824 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
4825 !MI.getFlag(Flag: MachineInstr::FmArcp)))
4826 return false;
4827
4828 // x / y -> x * (1.0 / y)
4829 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4830 .addUse(RegNo: RHS)
4831 .setMIFlags(Flags);
4832 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
4833
4834 MI.eraseFromParent();
4835 return true;
4836}
4837
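// Fast f64 path, used only when afn (or UnsafeFPMath) allows it: compute a
// reciprocal of y, refine it with two Newton-Raphson iterations, and apply a
// final residual correction. A sketch of the math (not the exact MIR):
//   r = rcp(y)
//   r = r + r * (1 - y*r)      ; iteration 1, in fma form
//   r = r + r * (1 - y*r)      ; iteration 2
//   q = x * r
//   result = q + r * (x - y*q)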
4838bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4839 MachineRegisterInfo &MRI,
4840 MachineIRBuilder &B) const {
4841 Register Res = MI.getOperand(i: 0).getReg();
4842 Register X = MI.getOperand(i: 1).getReg();
4843 Register Y = MI.getOperand(i: 2).getReg();
4844 uint16_t Flags = MI.getFlags();
4845 LLT ResTy = MRI.getType(Reg: Res);
4846
4847 const MachineFunction &MF = B.getMF();
4848 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4849 MI.getFlag(Flag: MachineInstr::FmAfn);
4850
4851 if (!AllowInaccurateRcp)
4852 return false;
4853
4854 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
4855 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
4856
4857 auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4858 .addUse(RegNo: Y)
4859 .setMIFlags(Flags);
4860
4861 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4862 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
4863
4864 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4865 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
4866
4867 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
4868 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
4869
4870 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
4871 MI.eraseFromParent();
4872 return true;
4873}
4874
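// f16 division (when the fast path above does not apply) is performed in f32:
// extend both operands, multiply the numerator by an f32 reciprocal of the
// denominator, truncate back to f16, and post-process with div_fixup. A sketch
// in intrinsic form (names are illustrative):
//   %q   = fptrunc(fpext(lhs) * llvm.amdgcn.rcp(fpext(rhs)))
//   %res = llvm.amdgcn.div.fixup(%q, rhs, lhs)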
4875bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4876 MachineRegisterInfo &MRI,
4877 MachineIRBuilder &B) const {
4878 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4879 return true;
4880
4881 Register Res = MI.getOperand(i: 0).getReg();
4882 Register LHS = MI.getOperand(i: 1).getReg();
4883 Register RHS = MI.getOperand(i: 2).getReg();
4884
4885 uint16_t Flags = MI.getFlags();
4886
4887 LLT S16 = LLT::scalar(SizeInBits: 16);
4888 LLT S32 = LLT::scalar(SizeInBits: 32);
4889
4890 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
4891 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
4892
4893 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
4894 .addUse(RegNo: RHSExt.getReg(Idx: 0))
4895 .setMIFlags(Flags);
4896
4897 auto QUOT = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: RCP, Flags);
4898 auto RDst = B.buildFPTrunc(Res: S16, Op: QUOT, Flags);
4899
4900 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
4901 .addUse(RegNo: RDst.getReg(Idx: 0))
4902 .addUse(RegNo: RHS)
4903 .addUse(RegNo: LHS)
4904 .setMIFlags(Flags);
4905
4906 MI.eraseFromParent();
4907 return true;
4908}
4909
4910static constexpr unsigned SPDenormModeBitField =
4911 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
4912
4913// Enable or disable FP32 denorm mode. When 'Enable' is true, emit
4914// instructions to enable denorm mode; otherwise emit instructions to disable it.
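// Note (an interpretation of the two paths below, not verified against every
// subtarget): with S_DENORM_MODE the FP32 field (bits [1:0] of the immediate)
// and the default FP64/FP16 field (bits [3:2]) are written together in one
// instruction, while older targets update only the 2-bit FP32 slice of the
// MODE register via S_SETREG using SPDenormModeBitField.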
4915static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4916 const GCNSubtarget &ST,
4917 SIModeRegisterDefaults Mode) {
4918 // Set SP denorm mode to this value.
4919 unsigned SPDenormMode =
4920 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4921
4922 if (ST.hasDenormModeInst()) {
4923 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4924 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4925
4926 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4927 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
4928 .addImm(Val: NewDenormModeValue);
4929
4930 } else {
4931 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
4932 .addImm(Val: SPDenormMode)
4933 .addImm(Val: SPDenormModeBitField);
4934 }
4935}
4936
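// Full-precision f32 division. A sketch of the overall shape (the exact
// operand order and flags are what the code below builds):
//   (num_s, den_s) = div_scale(lhs, rhs)   ; scale away extreme exponents
//   r   = rcp(den_s), refined by a short FMA chain
//   q   = div_fmas(...)                    ; combines the refined pieces
//   res = div_fixup(q, rhs, lhs)           ; handles the special cases
// FP32 denormals are temporarily enabled around the FMA chain when the
// function's float mode would otherwise flush them.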
4937bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4938 MachineRegisterInfo &MRI,
4939 MachineIRBuilder &B) const {
4940 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4941 return true;
4942
4943 Register Res = MI.getOperand(i: 0).getReg();
4944 Register LHS = MI.getOperand(i: 1).getReg();
4945 Register RHS = MI.getOperand(i: 2).getReg();
4946 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4947 SIModeRegisterDefaults Mode = MFI->getMode();
4948
4949 uint16_t Flags = MI.getFlags();
4950
4951 LLT S32 = LLT::scalar(SizeInBits: 32);
4952 LLT S1 = LLT::scalar(SizeInBits: 1);
4953
4954 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
4955
4956 auto DenominatorScaled =
4957 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
4958 .addUse(RegNo: LHS)
4959 .addUse(RegNo: RHS)
4960 .addImm(Val: 0)
4961 .setMIFlags(Flags);
4962 auto NumeratorScaled =
4963 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
4964 .addUse(RegNo: LHS)
4965 .addUse(RegNo: RHS)
4966 .addImm(Val: 1)
4967 .setMIFlags(Flags);
4968
4969 auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
4970 .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
4971 .setMIFlags(Flags);
4972 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
4973
4974 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4975 const bool HasDynamicDenormals =
4976 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4977 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4978
4979 Register SavedSPDenormMode;
4980 if (!PreservesDenormals) {
4981 if (HasDynamicDenormals) {
4982 SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4983 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
4984 .addDef(RegNo: SavedSPDenormMode)
4985 .addImm(Val: SPDenormModeBitField);
4986 }
4987 toggleSPDenormMode(Enable: true, B, ST, Mode);
4988 }
4989
4990 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
4991 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
4992 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
4993 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
4994 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
4995 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
4996
4997 if (!PreservesDenormals) {
4998 if (HasDynamicDenormals) {
4999 assert(SavedSPDenormMode);
5000 B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
5001 .addReg(RegNo: SavedSPDenormMode)
5002 .addImm(Val: SPDenormModeBitField);
5003 } else
5004 toggleSPDenormMode(Enable: false, B, ST, Mode);
5005 }
5006
5007 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
5008 .addUse(RegNo: Fma4.getReg(Idx: 0))
5009 .addUse(RegNo: Fma1.getReg(Idx: 0))
5010 .addUse(RegNo: Fma3.getReg(Idx: 0))
5011 .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
5012 .setMIFlags(Flags);
5013
5014 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5015 .addUse(RegNo: Fmas.getReg(Idx: 0))
5016 .addUse(RegNo: RHS)
5017 .addUse(RegNo: LHS)
5018 .setMIFlags(Flags);
5019
5020 MI.eraseFromParent();
5021 return true;
5022}
5023
5024bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5025 MachineRegisterInfo &MRI,
5026 MachineIRBuilder &B) const {
5027 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5028 return true;
5029
5030 Register Res = MI.getOperand(i: 0).getReg();
5031 Register LHS = MI.getOperand(i: 1).getReg();
5032 Register RHS = MI.getOperand(i: 2).getReg();
5033
5034 uint16_t Flags = MI.getFlags();
5035
5036 LLT S64 = LLT::scalar(SizeInBits: 64);
5037 LLT S1 = LLT::scalar(SizeInBits: 1);
5038
5039 auto One = B.buildFConstant(Res: S64, Val: 1.0);
5040
5041 auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5042 .addUse(RegNo: LHS)
5043 .addUse(RegNo: RHS)
5044 .addImm(Val: 0)
5045 .setMIFlags(Flags);
5046
5047 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);
5048
5049 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
5050 .addUse(RegNo: DivScale0.getReg(Idx: 0))
5051 .setMIFlags(Flags);
5052
5053 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
5054 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
5055 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
5056
5057 auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5058 .addUse(RegNo: LHS)
5059 .addUse(RegNo: RHS)
5060 .addImm(Val: 1)
5061 .setMIFlags(Flags);
5062
5063 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5064 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
5065 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);
5066
5067 Register Scale;
5068 if (!ST.hasUsableDivScaleConditionOutput()) {
5069 // Work around a hardware bug on SI where the condition output from div_scale
5070 // is not usable.
5071
5072 LLT S32 = LLT::scalar(SizeInBits: 32);
5073
5074 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5075 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5076 auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
5077 auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);
5078
5079 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5080 Op1: Scale1Unmerge.getReg(Idx: 1));
5081 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5082 Op1: Scale0Unmerge.getReg(Idx: 1));
5083 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
5084 } else {
5085 Scale = DivScale1.getReg(Idx: 1);
5086 }
5087
5088 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
5089 .addUse(RegNo: Fma4.getReg(Idx: 0))
5090 .addUse(RegNo: Fma3.getReg(Idx: 0))
5091 .addUse(RegNo: Mul.getReg(Idx: 0))
5092 .addUse(RegNo: Scale)
5093 .setMIFlags(Flags);
5094
5095 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
5096 .addUse(RegNo: Fmas.getReg(Idx: 0))
5097 .addUse(RegNo: RHS)
5098 .addUse(RegNo: LHS)
5099 .setMIFlags(Flags);
5100
5101 MI.eraseFromParent();
5102 return true;
5103}
5104
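// frexp is lowered to the frexp_mant / frexp_exp intrinsics. On subtargets
// with the fract bug, non-finite inputs are special-cased below: the exponent
// is forced to 0 and the mantissa to the original value (a description of the
// selects emitted below, not of any additional handling).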
5105bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5106 MachineRegisterInfo &MRI,
5107 MachineIRBuilder &B) const {
5108 Register Res0 = MI.getOperand(i: 0).getReg();
5109 Register Res1 = MI.getOperand(i: 1).getReg();
5110 Register Val = MI.getOperand(i: 2).getReg();
5111 uint16_t Flags = MI.getFlags();
5112
5113 LLT Ty = MRI.getType(Reg: Res0);
5114 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5115
5116 auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
5117 .addUse(RegNo: Val)
5118 .setMIFlags(Flags);
5119 auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
5120 .addUse(RegNo: Val)
5121 .setMIFlags(Flags);
5122
5123 if (ST.hasFractBug()) {
5124 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5125 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5126 auto IsFinite =
5127 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5128 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5129 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5130 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5131 }
5132
5133 B.buildCopy(Res: Res0, Op: Mant);
5134 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5135
5136 MI.eraseFromParent();
5137 return true;
5138}
5139
5140bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5141 MachineRegisterInfo &MRI,
5142 MachineIRBuilder &B) const {
5143 Register Res = MI.getOperand(i: 0).getReg();
5144 Register LHS = MI.getOperand(i: 2).getReg();
5145 Register RHS = MI.getOperand(i: 3).getReg();
5146 uint16_t Flags = MI.getFlags();
5147
5148 LLT S32 = LLT::scalar(SizeInBits: 32);
5149 LLT S1 = LLT::scalar(SizeInBits: 1);
5150
5151 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5152 const APFloat C0Val(1.0f);
5153
5154 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5155 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5156 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5157
5158 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5159 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5160
5161 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5162
5163 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5164 .addUse(RegNo: Mul0.getReg(Idx: 0))
5165 .setMIFlags(Flags);
5166
5167 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5168
5169 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5170
5171 MI.eraseFromParent();
5172 return true;
5173}
5174
5175bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5176 MachineRegisterInfo &MRI,
5177 MachineIRBuilder &B) const {
5178 // Bypass the correct expansion that a standard promotion through G_FSQRT
5179 // would get. The f32 op is accurate enough for the f16 case.
5180 unsigned Flags = MI.getFlags();
5181 assert(!ST.has16BitInsts());
5182 const LLT F32 = LLT::scalar(SizeInBits: 32);
5183 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5184 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5185 .addUse(RegNo: Ext.getReg(Idx: 0))
5186 .setMIFlags(Flags);
5187 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5188 MI.eraseFromParent();
5189 return true;
5190}
5191
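// f32 sqrt expansion, in outline (a sketch of the structure below, not extra
// behavior): very small inputs are pre-scaled by 2^32 and the result rescaled
// by 2^-16; when denormal handling is needed, the hardware sqrt result is
// nudged by +/-1 ulp based on FMA residuals, otherwise an rsq-based
// Goldschmidt-style refinement is used; finally +/-0 and +inf are passed
// through unchanged.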
5192bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5193 MachineRegisterInfo &MRI,
5194 MachineIRBuilder &B) const {
5195 MachineFunction &MF = B.getMF();
5196 Register Dst = MI.getOperand(i: 0).getReg();
5197 Register X = MI.getOperand(i: 1).getReg();
5198 const unsigned Flags = MI.getFlags();
5199 const LLT S1 = LLT::scalar(SizeInBits: 1);
5200 const LLT F32 = LLT::scalar(SizeInBits: 32);
5201 const LLT I32 = LLT::scalar(SizeInBits: 32);
5202
5203 if (allowApproxFunc(MF, Flags)) {
5204 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
5205 .addUse(RegNo: X)
5206 .setMIFlags(Flags);
5207 MI.eraseFromParent();
5208 return true;
5209 }
5210
5211 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5212 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5213 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5214 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5215 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5216
5217 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
5218 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5219 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
5220 .addUse(RegNo: SqrtX.getReg(Idx: 0))
5221 .setMIFlags(Flags);
5222
5223 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5224 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5225
5226 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5227 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5228
5229 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5230 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5231
5232 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5233 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5234
5235 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5236 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5237
5238 SqrtS =
5239 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5240
5241 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5242 SqrtS =
5243 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5244 } else {
5245 auto SqrtR =
5246 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5247 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5248
5249 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5250 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5251 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5252 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5253 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5254 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
5255 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5256 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5257 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
5258 }
5259
5260 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5261
5262 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5263
5264 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5265
5266 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5267 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5268
5269 MI.eraseFromParent();
5270 return true;
5271}
5272
5273bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5274 MachineRegisterInfo &MRI,
5275 MachineIRBuilder &B) const {
5276 // For double type, the SQRT and RSQ instructions don't have the required
5277 // precision, so we apply Goldschmidt's algorithm to improve the result:
5278 //
5279 // y0 = rsq(x)
5280 // g0 = x * y0
5281 // h0 = 0.5 * y0
5282 //
5283 // r0 = 0.5 - h0 * g0
5284 // g1 = g0 * r0 + g0
5285 // h1 = h0 * r0 + h0
5286 //
5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5288 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5289 // h2 = h1 * r1 + h1
5290 //
5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5292 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5293 //
5294 // sqrt(x) = g3
5295
5296 const LLT S1 = LLT::scalar(SizeInBits: 1);
5297 const LLT S32 = LLT::scalar(SizeInBits: 32);
5298 const LLT F64 = LLT::scalar(SizeInBits: 64);
5299
5300 Register Dst = MI.getOperand(i: 0).getReg();
5301 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5302
5303 Register X = MI.getOperand(i: 1).getReg();
5304 unsigned Flags = MI.getFlags();
5305
5306 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5307
5308 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5309 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5310
5311 // Scale up input if it is too small.
5312 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5313 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5314 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
5315
5316 auto SqrtY =
5317 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5318
5319 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5320 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5321 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5322
5323 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5324 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5325
5326 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5327 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5328
5329 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5330 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5331
5332 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5333
5334 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5335 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5336
5337 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5338
5339 // Scale down the result.
5340 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5341 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5342 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5343
5344 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5345 // with finite only or nsz because rsq(+/-0) = +/-inf
5346
5347 // TODO: Check for DAZ and expand to subnormals
5348 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5349
5350 // If x is +INF, +0, or -0, use its original value
5351 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5352
5353 MI.eraseFromParent();
5354 return true;
5355}
5356
5357bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5358 MachineRegisterInfo &MRI,
5359 MachineIRBuilder &B) const {
5360 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5361 if (Ty == LLT::scalar(SizeInBits: 32))
5362 return legalizeFSQRTF32(MI, MRI, B);
5363 if (Ty == LLT::scalar(SizeInBits: 64))
5364 return legalizeFSQRTF64(MI, MRI, B);
5365 if (Ty == LLT::scalar(SizeInBits: 16))
5366 return legalizeFSQRTF16(MI, MRI, B);
5367 return false;
5368}
5369
5370// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5371// FIXME: Why do we handle this one but not other removed instructions?
5372//
5373// Reciprocal square root. The clamp prevents infinite results, clamping
5374// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5375// +-max_float.
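// A sketch of the expansion built below (names are illustrative):
//   %rsq = llvm.amdgcn.rsq(%src)
//   %dst = fmaxnum(fminnum(%rsq, +max_float), -max_float)
// The IEEE variants of min/max are used when the function's IEEE mode bit is
// set.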
5376bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5377 MachineRegisterInfo &MRI,
5378 MachineIRBuilder &B) const {
5379 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5380 return true;
5381
5382 Register Dst = MI.getOperand(i: 0).getReg();
5383 Register Src = MI.getOperand(i: 2).getReg();
5384 auto Flags = MI.getFlags();
5385
5386 LLT Ty = MRI.getType(Reg: Dst);
5387
5388 const fltSemantics *FltSemantics;
5389 if (Ty == LLT::scalar(SizeInBits: 32))
5390 FltSemantics = &APFloat::IEEEsingle();
5391 else if (Ty == LLT::scalar(SizeInBits: 64))
5392 FltSemantics = &APFloat::IEEEdouble();
5393 else
5394 return false;
5395
5396 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
5397 .addUse(RegNo: Src)
5398 .setMIFlags(Flags);
5399
5400 // We don't need to concern ourselves with the snan handling difference:
5401 // whether or not the rsq quieted it, use the min/max form that selects directly.
5402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5403 const bool UseIEEE = MFI->getMode().IEEE;
5404
5405 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5406 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5407 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5408
5409 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5410
5411 if (UseIEEE)
5412 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5413 else
5414 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5415 MI.eraseFromParent();
5416 return true;
5417}
5418
5419// TODO: Fix pointer type handling
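// Lane intrinsics natively operate on 32-bit values. Narrower results are
// any-extended to 32 bits and truncated back; wider results whose size is a
// multiple of 32 bits are unmerged into 32-bit (or <2 x s16>) pieces, the
// intrinsic is applied per piece, and the pieces are re-merged. A sketch for a
// 64-bit readlane (names are illustrative):
//   %lo, %hi = G_UNMERGE_VALUES %src:_(s64)
//   %rlo = readlane(%lo, %lane)
//   %rhi = readlane(%hi, %lane)
//   %dst:_(s64) = G_MERGE_VALUES %rlo, %rhi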
5420bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421 MachineInstr &MI,
5422 Intrinsic::ID IID) const {
5423
5424 MachineIRBuilder &B = Helper.MIRBuilder;
5425 MachineRegisterInfo &MRI = *B.getMRI();
5426
5427 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428 IID == Intrinsic::amdgcn_permlanex16;
5429
5430 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431 Register Src2, LLT VT) -> Register {
5432 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
5433 switch (IID) {
5434 case Intrinsic::amdgcn_readfirstlane:
5435 case Intrinsic::amdgcn_permlane64:
5436 return LaneOp.getReg(Idx: 0);
5437 case Intrinsic::amdgcn_readlane:
5438 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
5439 case Intrinsic::amdgcn_writelane:
5440 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
5441 case Intrinsic::amdgcn_permlane16:
5442 case Intrinsic::amdgcn_permlanex16: {
5443 Register Src3 = MI.getOperand(i: 5).getReg();
5444 int64_t Src4 = MI.getOperand(i: 6).getImm();
5445 int64_t Src5 = MI.getOperand(i: 7).getImm();
5446 return LaneOp.addUse(RegNo: Src1)
5447 .addUse(RegNo: Src2)
5448 .addUse(RegNo: Src3)
5449 .addImm(Val: Src4)
5450 .addImm(Val: Src5)
5451 .getReg(Idx: 0);
5452 }
5453 default:
5454 llvm_unreachable("unhandled lane op");
5455 }
5456 };
5457
5458 Register DstReg = MI.getOperand(i: 0).getReg();
5459 Register Src0 = MI.getOperand(i: 2).getReg();
5460 Register Src1, Src2;
5461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462 IsPermLane16) {
5463 Src1 = MI.getOperand(i: 3).getReg();
5464 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465 Src2 = MI.getOperand(i: 4).getReg();
5466 }
5467 }
5468
5469 LLT Ty = MRI.getType(Reg: DstReg);
5470 unsigned Size = Ty.getSizeInBits();
5471
5472 if (Size == 32) {
5473 // Already legal
5474 return true;
5475 }
5476
5477 if (Size < 32) {
5478 Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);
5479
5480 if (IsPermLane16)
5481 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
5482
5483 if (IID == Intrinsic::amdgcn_writelane)
5484 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
5485
5486 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
5488 MI.eraseFromParent();
5489 return true;
5490 }
5491
5492 if (Size % 32 != 0)
5493 return false;
5494
5495 LLT PartialResTy = S32;
5496 if (Ty.isVector()) {
5497 LLT EltTy = Ty.getElementType();
5498 switch (EltTy.getSizeInBits()) {
5499 case 16:
5500 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: 2));
5501 break;
5502 case 32:
5503 PartialResTy = EltTy;
5504 break;
5505 default:
5506 // Handle all other cases via S32 pieces.
5507 break;
5508 }
5509 }
5510
5511 SmallVector<Register, 2> PartialRes;
5512 unsigned NumParts = Size / 32;
5513 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
5514 MachineInstrBuilder Src1Parts, Src2Parts;
5515
5516 if (IsPermLane16)
5517 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
5518
5519 if (IID == Intrinsic::amdgcn_writelane)
5520 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
5521
5522 for (unsigned i = 0; i < NumParts; ++i) {
5523 Src0 = Src0Parts.getReg(Idx: i);
5524
5525 if (IsPermLane16)
5526 Src1 = Src1Parts.getReg(Idx: i);
5527
5528 if (IID == Intrinsic::amdgcn_writelane)
5529 Src2 = Src2Parts.getReg(Idx: i);
5530
5531 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
5532 }
5533
5534 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
5535 MI.eraseFromParent();
5536 return true;
5537}
5538
5539bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5540 MachineRegisterInfo &MRI,
5541 MachineIRBuilder &B) const {
5542 uint64_t Offset =
5543 ST.getTargetLowering()->getImplicitParameterOffset(
5544 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
5545 LLT DstTy = MRI.getType(Reg: DstReg);
5546 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
5547
5548 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
5549 if (!loadInputValue(DstReg: KernargPtrReg, B,
5550 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5551 return false;
5552
5553 // FIXME: This should be nuw
5554 B.buildPtrAdd(Res: DstReg, Op0: KernargPtrReg, Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
5555 return true;
5556}
5557
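// A sketch of the descriptor words produced for make.buffer.rsrc by the
// function below (field layout as implied by the code, not a full rsrc spec):
//   word0 = pointer[31:0]
//   word1 = (pointer[63:32] & 0xffff) | (stride << 16)
//   word2 = numRecords
//   word3 = flags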
5558/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5559/// bits of the pointer and replace them with the stride argument, then
5560/// merge_values everything together. In the common case of a raw buffer (the
5561/// stride component is 0), we can just AND off the upper half.
5562bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5563 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5564 Register Result = MI.getOperand(i: 0).getReg();
5565 Register Pointer = MI.getOperand(i: 2).getReg();
5566 Register Stride = MI.getOperand(i: 3).getReg();
5567 Register NumRecords = MI.getOperand(i: 4).getReg();
5568 Register Flags = MI.getOperand(i: 5).getReg();
5569
5570 LLT S32 = LLT::scalar(SizeInBits: 32);
5571
5572 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5573 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
5574 Register LowHalf = Unmerge.getReg(Idx: 0);
5575 Register HighHalf = Unmerge.getReg(Idx: 1);
5576
5577 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
5578 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
5579
5580 MachineInstrBuilder NewHighHalf = Masked;
5581 std::optional<ValueAndVReg> StrideConst =
5582 getIConstantVRegValWithLookThrough(VReg: Stride, MRI);
5583 if (!StrideConst || !StrideConst->Value.isZero()) {
5584 MachineInstrBuilder ShiftedStride;
5585 if (StrideConst) {
5586 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5587 uint32_t ShiftedStrideVal = StrideVal << 16;
5588 ShiftedStride = B.buildConstant(Res: S32, Val: ShiftedStrideVal);
5589 } else {
5590 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
5591 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
5592 ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
5593 }
5594 NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
5595 }
5596 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
5597 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
5598 MI.eraseFromParent();
5599 return true;
5600}
5601
5602bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5603 MachineRegisterInfo &MRI,
5604 MachineIRBuilder &B) const {
5605 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5606 if (!MFI->isEntryFunction()) {
5607 return legalizePreloadedArgIntrin(MI, MRI, B,
5608 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5609 }
5610
5611 Register DstReg = MI.getOperand(i: 0).getReg();
5612 if (!getImplicitArgPtr(DstReg, MRI, B))
5613 return false;
5614
5615 MI.eraseFromParent();
5616 return true;
5617}
5618
5619bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5620 MachineRegisterInfo &MRI,
5621 MachineIRBuilder &B) const {
5622 Function &F = B.getMF().getFunction();
5623 std::optional<uint32_t> KnownSize =
5624 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5625 if (KnownSize.has_value())
5626 B.buildConstant(Res: DstReg, Val: *KnownSize);
5627 return false;
5628}
5629
5630bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631 MachineRegisterInfo &MRI,
5632 MachineIRBuilder &B) const {
5633
5634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635 if (!MFI->isEntryFunction()) {
5636 return legalizePreloadedArgIntrin(MI, MRI, B,
5637 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5638 }
5639
5640 Register DstReg = MI.getOperand(i: 0).getReg();
5641 if (!getLDSKernelId(DstReg, MRI, B))
5642 return false;
5643
5644 MI.eraseFromParent();
5645 return true;
5646}
5647
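// amdgcn.is.shared / amdgcn.is.private style queries are answered by comparing
// the high 32 bits of the flat pointer against the corresponding segment
// aperture (see getSegmentAperture), which is what the helper below builds.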
5648bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5649 MachineRegisterInfo &MRI,
5650 MachineIRBuilder &B,
5651 unsigned AddrSpace) const {
5652 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
5653 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: MI.getOperand(i: 2).getReg());
5654 Register Hi32 = Unmerge.getReg(Idx: 1);
5655
5656 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
5657 MI.eraseFromParent();
5658 return true;
5659}
5660
5661// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5662// offset (the offset that is included in bounds checking and swizzling, to be
5663// split between the instruction's voffset and immoffset fields) and soffset
5664// (the offset that is excluded from bounds checking and swizzling, to go in
5665// the instruction's soffset field). This function takes the first kind of
5666// offset and figures out how to split it between voffset and immoffset.
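// A worked example (assuming a 12-bit immoffset field, i.e. MaxImm == 4095):
// an incoming constant offset of 4100 is split into immoffset = 4 and a
// voffset contribution of 4096, so the large power-of-two part has a better
// chance of being CSEd across neighbouring accesses.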
5667std::pair<Register, unsigned>
5668AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5669 Register OrigOffset) const {
5670 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5671 Register BaseReg;
5672 unsigned ImmOffset;
5673 const LLT S32 = LLT::scalar(SizeInBits: 32);
5674 MachineRegisterInfo &MRI = *B.getMRI();
5675
5676 std::tie(args&: BaseReg, args&: ImmOffset) =
5677 AMDGPU::getBaseWithConstantOffset(MRI, Reg: OrigOffset);
5678
5679 // If BaseReg is a pointer, convert it to int.
5680 if (MRI.getType(Reg: BaseReg).isPointer())
5681 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
5682
5683 // If the immediate value is too big for the immoffset field, keep only the
5684 // bits that fit in the immoffset field. The remaining value that is
5685 // copied/added for the voffset field is a large power of 2, and it stands
5686 // a better chance of being CSE'd with the copy/add for another similar
5687 // load/store.
5688 // However, do not do that rounding down if the part moved to the voffset
5689 // would be a negative number, as it appears to be illegal to have a negative
5690 // offset in the vgpr, even if adding the immediate offset makes it positive.
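// For example (illustrative): a constant offset of -8 is not rounded down at
// all; immoffset stays 0 and the full -8 is materialized in the voffset
// register instead.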
5691 unsigned Overflow = ImmOffset & ~MaxImm;
5692 ImmOffset -= Overflow;
5693 if ((int32_t)Overflow < 0) {
5694 Overflow += ImmOffset;
5695 ImmOffset = 0;
5696 }
5697
5698 if (Overflow != 0) {
5699 if (!BaseReg) {
5700 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
5701 } else {
5702 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
5703 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
5704 }
5705 }
5706
5707 if (!BaseReg)
5708 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5709
5710 return std::pair(BaseReg, ImmOffset);
5711}
5712
5713/// Handle register layout difference for f16 images for some subtargets.
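/// For example (an illustrative sketch): on subtargets with unpacked D16
/// memory instructions, <4 x s16> data is any-extended element by element into
/// <4 x s32>; on subtargets with the image-store D16 bug, <3 x s16> data is
/// padded to <6 x s16> with undef and bitcast to <3 x s32>.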
5714Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5715 MachineRegisterInfo &MRI,
5716 Register Reg,
5717 bool ImageStore) const {
5718 const LLT S16 = LLT::scalar(SizeInBits: 16);
5719 const LLT S32 = LLT::scalar(SizeInBits: 32);
5720 LLT StoreVT = MRI.getType(Reg);
5721 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5722
5723 if (ST.hasUnpackedD16VMem()) {
5724 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5725
5726 SmallVector<Register, 4> WideRegs;
5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5728 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5729
5730 int NumElts = StoreVT.getNumElements();
5731
5732 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
5733 .getReg(Idx: 0);
5734 }
5735
5736 if (ImageStore && ST.hasImageStoreD16Bug()) {
5737 if (StoreVT.getNumElements() == 2) {
5738 SmallVector<Register, 4> PackedRegs;
5739 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
5740 PackedRegs.push_back(Elt: Reg);
5741 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5742 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
5743 .getReg(Idx: 0);
5744 }
5745
5746 if (StoreVT.getNumElements() == 3) {
5747 SmallVector<Register, 4> PackedRegs;
5748 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5750 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5751 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
5752 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
5753 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5754 }
5755
5756 if (StoreVT.getNumElements() == 4) {
5757 SmallVector<Register, 4> PackedRegs;
5758 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5759 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5761 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5762 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5763 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
5764 .getReg(Idx: 0);
5765 }
5766
5767 llvm_unreachable("invalid data type");
5768 }
5769
5770 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
5771 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
5772 .getReg(Idx: 0);
5773 }
5774 return Reg;
5775}
5776
5777Register AMDGPULegalizerInfo::fixStoreSourceType(
5778 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5779 MachineRegisterInfo *MRI = B.getMRI();
5780 LLT Ty = MRI->getType(Reg: VData);
5781
5782 const LLT S16 = LLT::scalar(SizeInBits: 16);
5783
5784 // Fixup buffer resources themselves needing to be v4i32.
5785 if (hasBufferRsrcWorkaround(Ty))
5786 return castBufferRsrcToV4I32(Pointer: VData, B);
5787
5788 // Fixup illegal register types for i8 and i16 stores.
5789 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
5790 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
5791 return AnyExt;
5792 }
5793
5794 if (Ty.isVector()) {
5795 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5796 if (IsFormat)
5797 return handleD16VData(B, MRI&: *MRI, Reg: VData);
5798 }
5799 }
5800
5801 return VData;
5802}
5803
5804bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5805 MachineRegisterInfo &MRI,
5806 MachineIRBuilder &B,
5807 bool IsTyped,
5808 bool IsFormat) const {
5809 Register VData = MI.getOperand(i: 1).getReg();
5810 LLT Ty = MRI.getType(Reg: VData);
5811 LLT EltTy = Ty.getScalarType();
5812 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5813 const LLT S32 = LLT::scalar(SizeInBits: 32);
5814
5815 VData = fixStoreSourceType(B, VData, IsFormat);
5816 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
5817 Register RSrc = MI.getOperand(i: 2).getReg();
5818
5819 MachineMemOperand *MMO = *MI.memoperands_begin();
5820 const int MemSize = MMO->getSize().getValue();
5821
5822 unsigned ImmOffset;
5823
5824 // The typed intrinsics add an immediate after the registers.
5825 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5826
5827 // The struct intrinsic variants add one additional operand over raw.
5828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5829 Register VIndex;
5830 int OpOffset = 0;
5831 if (HasVIndex) {
5832 VIndex = MI.getOperand(i: 3).getReg();
5833 OpOffset = 1;
5834 } else {
5835 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5836 }
5837
5838 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5839 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5840
5841 unsigned Format = 0;
5842 if (IsTyped) {
5843 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5844 ++OpOffset;
5845 }
5846
5847 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5848
5849 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5850
5851 unsigned Opc;
5852 if (IsTyped) {
5853 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5854 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5855 } else if (IsFormat) {
5856 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5857 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5858 } else {
5859 switch (MemSize) {
5860 case 1:
5861 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5862 break;
5863 case 2:
5864 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5865 break;
5866 default:
5867 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5868 break;
5869 }
5870 }
5871
5872 auto MIB = B.buildInstr(Opcode: Opc)
5873 .addUse(RegNo: VData) // vdata
5874 .addUse(RegNo: RSrc) // rsrc
5875 .addUse(RegNo: VIndex) // vindex
5876 .addUse(RegNo: VOffset) // voffset
5877 .addUse(RegNo: SOffset) // soffset
5878 .addImm(Val: ImmOffset); // offset(imm)
5879
5880 if (IsTyped)
5881 MIB.addImm(Val: Format);
5882
5883 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5884 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5885 .addMemOperand(MMO);
5886
5887 MI.eraseFromParent();
5888 return true;
5889}
5890
5891static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5892 Register VIndex, Register VOffset, Register SOffset,
5893 unsigned ImmOffset, unsigned Format,
5894 unsigned AuxiliaryData, MachineMemOperand *MMO,
5895 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5896 auto MIB = B.buildInstr(Opcode: Opc)
5897 .addDef(RegNo: LoadDstReg) // vdata
5898 .addUse(RegNo: RSrc) // rsrc
5899 .addUse(RegNo: VIndex) // vindex
5900 .addUse(RegNo: VOffset) // voffset
5901 .addUse(RegNo: SOffset) // soffset
5902 .addImm(Val: ImmOffset); // offset(imm)
5903
5904 if (IsTyped)
5905 MIB.addImm(Val: Format);
5906
5907 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5908 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5909 .addMemOperand(MMO);
5910}
5911
5912bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5913 MachineRegisterInfo &MRI,
5914 MachineIRBuilder &B,
5915 bool IsFormat,
5916 bool IsTyped) const {
5917 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5918 MachineMemOperand *MMO = *MI.memoperands_begin();
5919 const LLT MemTy = MMO->getMemoryType();
5920 const LLT S32 = LLT::scalar(SizeInBits: 32);
5921
5922 Register Dst = MI.getOperand(i: 0).getReg();
5923
5924 Register StatusDst;
5925 int OpOffset = 0;
5926 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5927 bool IsTFE = MI.getNumExplicitDefs() == 2;
5928 if (IsTFE) {
5929 StatusDst = MI.getOperand(i: 1).getReg();
5930 ++OpOffset;
5931 }
5932
5933 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
5934 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
5935
5936 // The typed intrinsics add an immediate after the registers.
5937 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5938
5939 // The struct intrinsic variants add one additional operand over raw.
5940 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5941 Register VIndex;
5942 if (HasVIndex) {
5943 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
5944 ++OpOffset;
5945 } else {
5946 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5947 }
5948
5949 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5950 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5951
5952 unsigned Format = 0;
5953 if (IsTyped) {
5954 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5955 ++OpOffset;
5956 }
5957
5958 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5959 unsigned ImmOffset;
5960
5961 LLT Ty = MRI.getType(Reg: Dst);
5962 // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
5963 // logic doesn't have to handle that case.
5964 if (hasBufferRsrcWorkaround(Ty)) {
5965 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
5966 Dst = MI.getOperand(i: 0).getReg();
5967 }
5968 LLT EltTy = Ty.getScalarType();
5969 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5970 const bool Unpacked = ST.hasUnpackedD16VMem();
5971
5972 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5973
5974 unsigned Opc;
5975
5976 // TODO: Support TFE for typed and narrow loads.
5977 if (IsTyped) {
5978 if (IsTFE)
5979 return false;
5980 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5981 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5982 } else if (IsFormat) {
5983 if (IsD16) {
5984 if (IsTFE)
5985 return false;
5986 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5987 } else {
5988 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5989 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5990 }
5991 } else {
5992 switch (MemTy.getSizeInBits()) {
5993 case 8:
5994 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5995 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5996 break;
5997 case 16:
5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6000 break;
6001 default:
6002 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6003 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6004 break;
6005 }
6006 }
6007
6008 if (IsTFE) {
6009 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
6010 unsigned NumLoadDWords = NumValueDWords + 1;
6011 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
6013 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6014 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6015 if (MemTy.getSizeInBits() < 32) {
6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6017 B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
6018 B.buildTrunc(Res: Dst, Op: ExtDst);
6019 } else if (NumValueDWords == 1) {
6020 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
6021 } else {
6022 SmallVector<Register, 5> LoadElts;
6023 for (unsigned I = 0; I != NumValueDWords; ++I)
6024 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
6025 LoadElts.push_back(Elt: StatusDst);
6026 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
6027 LoadElts.truncate(N: NumValueDWords);
6028 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
6029 }
6030 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6031 (IsD16 && !Ty.isVector())) {
6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6033 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6034 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6035 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6036 B.buildTrunc(Res: Dst, Op: LoadDstReg);
6037 } else if (Unpacked && IsD16 && Ty.isVector()) {
6038 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
6040 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6041 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6042 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6043 // FIXME: G_TRUNC should work, but legalization currently fails
6044 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
6045 SmallVector<Register, 4> Repack;
6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6047 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6048 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
6049 } else {
6050 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6051 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6052 }
6053
6054 MI.eraseFromParent();
6055 return true;
6056}
6057
6058static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6059 switch (IntrID) {
6060 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6062 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6065 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6066 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6067 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6070 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6141 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6143 default:
6144 llvm_unreachable("unhandled atomic opcode");
6145 }
6146}
6147
6148bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6149 MachineIRBuilder &B,
6150 Intrinsic::ID IID) const {
6151 const bool IsCmpSwap =
6152 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6153 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6154 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6155 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6156
6157 Register Dst = MI.getOperand(i: 0).getReg();
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159 // p8 arguments to the atomic itself.
6160 Register VData = MI.getOperand(i: 2).getReg();
6161
6162 Register CmpVal;
6163 int OpOffset = 0;
6164
6165 if (IsCmpSwap) {
6166 CmpVal = MI.getOperand(i: 3).getReg();
6167 ++OpOffset;
6168 }
6169
6170 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6171 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6172 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6173
6174 // The struct intrinsic variants add one additional operand over raw.
6175 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6176 Register VIndex;
6177 if (HasVIndex) {
6178 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6179 ++OpOffset;
6180 } else {
6181 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6182 }
6183
6184 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6185 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6186 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6187
6188 MachineMemOperand *MMO = *MI.memoperands_begin();
6189
6190 unsigned ImmOffset;
6191 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6192
6193 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6194 .addDef(RegNo: Dst)
6195 .addUse(RegNo: VData); // vdata
6196
6197 if (IsCmpSwap)
6198 MIB.addReg(RegNo: CmpVal);
6199
6200 MIB.addUse(RegNo: RSrc) // rsrc
6201 .addUse(RegNo: VIndex) // vindex
6202 .addUse(RegNo: VOffset) // voffset
6203 .addUse(RegNo: SOffset) // soffset
6204 .addImm(Val: ImmOffset) // offset(imm)
6205 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6206 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6207 .addMemOperand(MMO);
6208
6209 MI.eraseFromParent();
6210 return true;
6211}
6212
6213/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6214/// vector with s16 typed elements.
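/// For example (an illustrative sketch): a 3-D set of s16 coordinates
/// (x, y, z) becomes two v2s16 registers, (x, y) and (z, undef); gradient
/// pairs are packed the same way, with a trailing odd element padded by undef.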
6215static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6216 SmallVectorImpl<Register> &PackedAddrs,
6217 unsigned ArgOffset,
6218 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6219 bool IsA16, bool IsG16) {
6220 const LLT S16 = LLT::scalar(SizeInBits: 16);
6221 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6222 auto EndIdx = Intr->VAddrEnd;
6223
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6225 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6226 if (!SrcOp.isReg())
6227 continue; // _L to _LZ may have eliminated this.
6228
6229 Register AddrReg = SrcOp.getReg();
6230
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6237 // Special handling of bias when A16 is on. Bias is of type half but
6238 // occupies a full 32 bits.
6239 PackedAddrs.push_back(
6240 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6241 .getReg(Idx: 0));
6242 } else {
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6244 "Bias needs to be converted to 16 bit in A16 mode");
6245 // Handle any gradient or coordinate operands that should not be packed
6246 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6247 PackedAddrs.push_back(Elt: AddrReg);
6248 }
6249 } else {
6250 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6251 // derivatives dx/dh and dx/dv are packed with undef.
6252 if (((I + 1) >= EndIdx) ||
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6258 // Check for _L to _LZ optimization
6259 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6260 PackedAddrs.push_back(
6261 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6262 .getReg(Idx: 0));
6263 } else {
6264 PackedAddrs.push_back(
6265 Elt: B.buildBuildVector(
6266 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6267 .getReg(Idx: 0));
6268 ++I;
6269 }
6270 }
6271 }
6272}
6273
6274/// Convert from separate vaddr components to a single vector address register,
6275/// and replace the remaining operands with $noreg.
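/// For example (an illustrative sketch): three s32 vaddr operands become one
/// <3 x s32> G_BUILD_VECTOR placed in the first vaddr slot, and the remaining
/// two operands are replaced with $noreg.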
6276static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6277 int DimIdx, int NumVAddrs) {
6278 const LLT S32 = LLT::scalar(SizeInBits: 32);
6279 (void)S32;
6280 SmallVector<Register, 8> AddrRegs;
6281 for (int I = 0; I != NumVAddrs; ++I) {
6282 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6283 if (SrcOp.isReg()) {
6284 AddrRegs.push_back(Elt: SrcOp.getReg());
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6286 }
6287 }
6288
6289 int NumAddrRegs = AddrRegs.size();
6290 if (NumAddrRegs != 1) {
6291 auto VAddr =
6292 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6293 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6294 }
6295
6296 for (int I = 1; I != NumVAddrs; ++I) {
6297 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6298 if (SrcOp.isReg())
6299 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
6300 }
6301}
6302
6303/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6304///
6305/// Depending on the subtarget, load/store with 16-bit element data need to be
6306/// rewritten to use the low half of 32-bit registers, or directly use a packed
6307/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6308/// registers.
6309///
6310/// We don't want to directly select image instructions just yet, but also want
6311/// to expose all register repacking to the legalizer/combiners. We also don't
6312/// want a selected instruction entering RegBankSelect. In order to avoid
6313/// defining a multitude of intermediate image instructions, directly hack on
6314/// the intrinsic's arguments. In cases like a16 addresses, this requires
6315/// padding now-unnecessary arguments with $noreg.
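///
/// For example (an illustrative sketch): with A16 enabled, a 2-D sample's s16
/// coordinates (u, v) are packed into a single v2s16 vaddr operand, the
/// now-unused vaddr operand is replaced with $noreg, and the A16/G16 flags are
/// appended as a trailing immediate.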
6316bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6317 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6318 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6319
6320 const MachineFunction &MF = *MI.getMF();
6321 const unsigned NumDefs = MI.getNumExplicitDefs();
6322 const unsigned ArgOffset = NumDefs + 1;
6323 bool IsTFE = NumDefs == 2;
6324 // We are only processing the operands of d16 image operations on subtargets
6325 // that use the unpacked register layout, or need to repack the TFE result.
6326
6327 // TODO: Do we need to guard against already legalized intrinsics?
6328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6330
6331 MachineRegisterInfo *MRI = B.getMRI();
6332 const LLT S32 = LLT::scalar(SizeInBits: 32);
6333 const LLT S16 = LLT::scalar(SizeInBits: 16);
6334 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6335
6336 unsigned DMask = 0;
6337 Register VData;
6338 LLT Ty;
6339
6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6341 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6342 Ty = MRI->getType(Reg: VData);
6343 }
6344
6345 const bool IsAtomicPacked16Bit =
6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6348
6349 // Check for 16 bit addresses and pack if true.
6350 LLT GradTy =
6351 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6352 LLT AddrTy =
6353 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6354 const bool IsG16 =
6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6356 const bool IsA16 = AddrTy == S16;
6357 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6358
6359 int DMaskLanes = 0;
6360 if (!BaseOpcode->Atomic) {
6361 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6362 if (BaseOpcode->Gather4) {
6363 DMaskLanes = 4;
6364 } else if (DMask != 0) {
6365 DMaskLanes = llvm::popcount(Value: DMask);
6366 } else if (!IsTFE && !BaseOpcode->Store) {
6367 // If dmask is 0, this is a no-op load. This can be eliminated.
6368 B.buildUndef(Res: MI.getOperand(i: 0));
6369 MI.eraseFromParent();
6370 return true;
6371 }
6372 }
6373
6374 Observer.changingInstr(MI);
6375 auto ChangedInstr = make_scope_exit(F: [&] { Observer.changedInstr(MI); });
6376
6377 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6378 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6379 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6380 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6381 unsigned NewOpcode = LoadOpcode;
6382 if (BaseOpcode->Store)
6383 NewOpcode = StoreOpcode;
6384 else if (BaseOpcode->NoReturn)
6385 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6386
6387 // Track that we legalized this
6388 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6389
6390 // We expect to get an error flag since TFE is on and dmask is 0. Force
6391 // dmask to be at least 1, otherwise the instruction will fail.
6392 if (IsTFE && DMask == 0) {
6393 DMask = 0x1;
6394 DMaskLanes = 1;
6395 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6396 }
6397
6398 if (BaseOpcode->Atomic) {
6399 Register VData0 = MI.getOperand(i: 2).getReg();
6400 LLT Ty = MRI->getType(Reg: VData0);
6401
6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6403 if (Ty.isVector() && !IsAtomicPacked16Bit)
6404 return false;
6405
6406 if (BaseOpcode->AtomicX2) {
6407 Register VData1 = MI.getOperand(i: 3).getReg();
6408 // The two values are packed in one register.
6409 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6410 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6411 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6412 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
6413 }
6414 }
6415
6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6417
6418 // Rewrite the addressing register layout before doing anything else.
6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6420 // 16 bit gradients are supported, but are tied to the A16 control
6421 // so both gradients and addresses must be 16 bit
6422 return false;
6423 }
6424
6425 if (IsA16 && !ST.hasA16()) {
6426 // A16 not supported
6427 return false;
6428 }
6429
6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6431 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6432
6433 if (IsA16 || IsG16) {
6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6435 // instructions expect VGPR_32
6436 SmallVector<Register, 4> PackedRegs;
6437
6438 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6439
6440 // See also below in the non-a16 branch
6441 const bool UseNSA = ST.hasNSAEncoding() &&
6442 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6443 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6444 const bool UsePartialNSA =
6445 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6446
6447 if (UsePartialNSA) {
6448 // Pack registers that would go over NSAMaxSize into last VAddr register
6449 LLT PackedAddrTy =
6450 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6451 auto Concat = B.buildConcatVectors(
6452 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6454 PackedRegs.resize(N: NSAMaxSize);
6455 } else if (!UseNSA && PackedRegs.size() > 1) {
6456 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6457 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6458 PackedRegs[0] = Concat.getReg(Idx: 0);
6459 PackedRegs.resize(N: 1);
6460 }
6461
6462 const unsigned NumPacked = PackedRegs.size();
6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6464 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6465 if (!SrcOp.isReg()) {
6466 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6467 continue;
6468 }
6469
6470 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6471
6472 if (I - Intr->VAddrStart < NumPacked)
6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6474 else
6475 SrcOp.setReg(AMDGPU::NoRegister);
6476 }
6477 } else {
6478 // If the register allocator cannot place the address registers contiguously
6479 // without introducing moves, then using the non-sequential address encoding
6480 // is always preferable, since it saves VALU instructions and is usually a
6481 // wash in terms of code size or even better.
6482 //
6483 // However, we currently have no way of hinting to the register allocator
6484 // that MIMG addresses should be placed contiguously when it is possible to
6485 // do so, so force non-NSA for the common 2-address case as a heuristic.
6486 //
6487 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6488 // allocation when possible.
6489 //
6490 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6491 // set of the remaining addresses.
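// For example (an illustrative sketch): with an NSA limit of 5 and 7 address
// operands, the last 3 addresses are packed into one <3 x s32> register,
// leaving 5 vaddr operands in total.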
6492 const bool UseNSA = ST.hasNSAEncoding() &&
6493 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6494 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6495 const bool UsePartialNSA =
6496 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6497
6498 if (UsePartialNSA) {
6499 convertImageAddrToPacked(B, MI,
6500 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
6502 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
6504 NumVAddrs: Intr->NumVAddrs);
6505 }
6506 }
6507
6508 int Flags = 0;
6509 if (IsA16)
6510 Flags |= 1;
6511 if (IsG16)
6512 Flags |= 2;
6513 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
6514
6515 if (BaseOpcode->NoReturn) { // No TFE for stores?
6516 // TODO: Handle dmask trim
6517 if (!Ty.isVector() || !IsD16)
6518 return true;
6519
6520 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
6521 if (RepackedReg != VData) {
6522 MI.getOperand(i: 1).setReg(RepackedReg);
6523 }
6524
6525 return true;
6526 }
6527
6528 Register DstReg = MI.getOperand(i: 0).getReg();
6529 const LLT EltTy = Ty.getScalarType();
6530 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6531
6532 // Confirm that the return type is large enough for the dmask specified
6533 if (NumElts < DMaskLanes)
6534 return false;
6535
6536 if (NumElts > 4 || DMaskLanes > 4)
6537 return false;
6538
6539 // Image atomic instructions use DMask to specify how many bits the
6540 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6541 // DMaskLanes defaults to 0 for image atomics.
6542 // We must make sure that atomic variants (especially packed ones) are not
6543 // truncated from v2s16 or v4s16 to s16.
6544 //
6545 // ChangeElementCount will be needed for image load where Ty is always scalar.
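//
// For example (an illustrative sketch): a non-atomic load with dmask 0b0101
// has DMaskLanes == 2, so a declared <4 x s32> result is loaded as <2 x s32>
// and padded back out with undef elements below.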
6546 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6547 const LLT AdjustedTy =
6548 DMaskLanes == 0
6549 ? Ty
6550 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
6551
6552 // The raw dword-aligned data component of the load. The only legal cases
6553 // where this matters should be when using the packed D16 format, for
6554 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6555 LLT RoundedTy;
6556
6557 // S32 vector to cover all data, plus TFE result element.
6558 LLT TFETy;
6559
6560 // Register type to use for each loaded component. Will be S32 or V2S16.
6561 LLT RegTy;
6562
6563 if (IsD16 && ST.hasUnpackedD16VMem()) {
6564 RoundedTy =
6565 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
6566 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
6567 RegTy = S32;
6568 } else {
6569 unsigned EltSize = EltTy.getSizeInBits();
6570 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6571 unsigned RoundedSize = 32 * RoundedElts;
6572 RoundedTy = LLT::scalarOrVector(
6573 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
6574 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
6575 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6576 }
6577
6578 // The return type does not need adjustment.
6579 // TODO: Should we change s16 case to s32 or <2 x s16>?
6580 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6581 return true;
6582
6583 Register Dst1Reg;
6584
6585 // Insert after the instruction.
6586 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
6587
6588 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6589 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6590 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6591 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6592
6593 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
6594
6595 MI.getOperand(i: 0).setReg(NewResultReg);
6596
6597 // In the IR, TFE is supposed to be used with a 2 element struct return
6598 // type. The instruction really returns these two values in one contiguous
6599 // register, with one additional dword beyond the loaded data. Rewrite the
6600 // return type to use a single register result.
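//
// For example (an illustrative sketch): a TFE load whose data part is
// <2 x s32> is rewritten to define one <3 x s32> register, which is then
// unmerged below into the two data dwords and the status dword.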
6601
6602 if (IsTFE) {
6603 Dst1Reg = MI.getOperand(i: 1).getReg();
6604 if (MRI->getType(Reg: Dst1Reg) != S32)
6605 return false;
6606
6607 // TODO: Make sure the TFE operand bit is set.
6608 MI.removeOperand(OpNo: 1);
6609
6610 // Handle the easy case that requires no repack instructions.
6611 if (Ty == S32) {
6612 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
6613 return true;
6614 }
6615 }
6616
6617 // Now figure out how to copy the new result register back into the old
6618 // result.
6619 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6620
6621 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6622
6623 if (ResultNumRegs == 1) {
6624 assert(!IsTFE);
6625 ResultRegs[0] = NewResultReg;
6626 } else {
6627 // We have to repack into a new vector of some kind.
6628 for (int I = 0; I != NumDataRegs; ++I)
6629 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
6630 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
6631
6632 // Drop the final TFE element to get the data part. The TFE result is
6633 // directly written to the right place already.
6634 if (IsTFE)
6635 ResultRegs.resize(N: NumDataRegs);
6636 }
6637
6638 // For an s16 scalar result, we form an s32 result with a truncate regardless
6639 // of packed vs. unpacked.
6640 if (IsD16 && !Ty.isVector()) {
6641 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
6642 return true;
6643 }
6644
6645 // Avoid a build/concat_vector of 1 entry.
6646 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6647 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
6648 return true;
6649 }
6650
6651 assert(Ty.isVector());
6652
6653 if (IsD16) {
6654 // For packed D16 results with TFE enabled, all the data components are
6655 // S32. Cast back to the expected type.
6656 //
6657 // TODO: We don't really need to use load s32 elements. We would only need one
6658 // cast for the TFE result if a multiple of v2s16 was used.
6659 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6660 for (Register &Reg : ResultRegs)
6661 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
6662 } else if (ST.hasUnpackedD16VMem()) {
6663 for (Register &Reg : ResultRegs)
6664 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
6665 }
6666 }
6667
6668 auto padWithUndef = [&](LLT Ty, int NumElts) {
6669 if (NumElts == 0)
6670 return;
6671 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
6672 for (int I = 0; I != NumElts; ++I)
6673 ResultRegs.push_back(Elt: Undef);
6674 };
6675
6676 // Pad out any elements eliminated due to the dmask.
6677 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
6678 if (!ResTy.isVector()) {
6679 padWithUndef(ResTy, NumElts - ResultRegs.size());
6680 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
6681 return true;
6682 }
6683
6684 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6685 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6686
6687 // Deal with the one annoying legal case.
6688 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
6689 if (Ty == V3S16) {
6690 if (IsTFE) {
6691 if (ResultRegs.size() == 1) {
6692 NewResultReg = ResultRegs[0];
6693 } else if (ResultRegs.size() == 2) {
6694 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
6695 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
6696 } else {
6697 return false;
6698 }
6699 }
6700
6701 if (MRI->getType(Reg: DstReg).getNumElements() <
6702 MRI->getType(Reg: NewResultReg).getNumElements()) {
6703 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
6704 } else {
6705 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
6706 }
6707 return true;
6708 }
6709
6710 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6711 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
6712 return true;
6713}
6714
6715bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6716 MachineInstr &MI) const {
6717 MachineIRBuilder &B = Helper.MIRBuilder;
6718 GISelChangeObserver &Observer = Helper.Observer;
6719
6720 Register OrigDst = MI.getOperand(i: 0).getReg();
6721 Register Dst;
6722 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
6723 unsigned Size = Ty.getSizeInBits();
6724 MachineFunction &MF = B.getMF();
6725 unsigned Opc = 0;
6726 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6727 assert(Size == 8 || Size == 16);
6728 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6729 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6730 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6731 // destination register.
6732 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
6733 } else {
6734 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6735 Dst = OrigDst;
6736 }
6737
6738 Observer.changingInstr(MI);
6739
6740 // Handle needing to s.buffer.load() a p8 value.
6741 if (hasBufferRsrcWorkaround(Ty)) {
6742 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
6743 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6744 }
6745 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
6746 Ty = getBitcastRegisterType(Ty);
6747 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6748 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6749 }
6750
6751 // FIXME: We don't really need this intermediate instruction. The intrinsic
6752 // should be fixed to have a memory operand. Since it's readnone, we're not
6753 // allowed to add one.
6754 MI.setDesc(B.getTII().get(Opcode: Opc));
6755 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
6756
6757 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6758 const unsigned MemSize = (Size + 7) / 8;
6759 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6760 Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
6761 MachineMemOperand *MMO = MF.getMachineMemOperand(
6762 PtrInfo: MachinePointerInfo(),
6763 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6764 MachineMemOperand::MOInvariant,
6765 Size: MemSize, BaseAlignment: MemAlign);
6766 MI.addMemOperand(MF, MO: MMO);
6767 if (Dst != OrigDst) {
6768 MI.getOperand(i: 0).setReg(Dst);
6769 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6770 B.buildTrunc(Res: OrigDst, Op: Dst);
6771 }
6772
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6775 // out this needs to be converted to a vector load during RegBankSelect.
6776 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6777 if (Ty.isVector())
6778 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
6779 else
6780 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
6781 }
6782
6783 Observer.changedInstr(MI);
6784 return true;
6785}
6786
6787// TODO: Move to selection
6788bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6789 MachineRegisterInfo &MRI,
6790 MachineIRBuilder &B) const {
6791 if (!ST.isTrapHandlerEnabled() ||
6792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6793 return legalizeTrapEndpgm(MI, MRI, B);
6794
6795 return ST.supportsGetDoorbellID() ?
6796 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6797}
6798
6799bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6801 const DebugLoc &DL = MI.getDebugLoc();
6802 MachineBasicBlock &BB = B.getMBB();
6803 MachineFunction *MF = BB.getParent();
6804
6805 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
6806 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6807 .addImm(Val: 0);
6808 MI.eraseFromParent();
6809 return true;
6810 }
6811
6812 // We need a block split to make the real endpgm a terminator. We also don't
6813 // want to break phis in successor blocks, so we can't just delete to the
6814 // end of the block.
6815 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6816 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817 MF->push_back(MBB: TrapBB);
6818 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6819 .addImm(Val: 0);
6820 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6821 .addMBB(MBB: TrapBB);
6822
6823 BB.addSuccessor(Succ: TrapBB);
6824 MI.eraseFromParent();
6825 return true;
6826}
6827
6828bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6829 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6830 MachineFunction &MF = B.getMF();
6831 const LLT S64 = LLT::scalar(SizeInBits: 64);
6832
6833 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6834 // For code object version 5, queue_ptr is passed through implicit kernarg.
6835 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
6836 AMDGPU::AMDHSA_COV5) {
6837 AMDGPUTargetLowering::ImplicitParameter Param =
6838 AMDGPUTargetLowering::QUEUE_PTR;
6839 uint64_t Offset =
6840 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
6841
6842 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6843 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6844
6845 if (!loadInputValue(DstReg: KernargPtrReg, B,
6846 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6847 return false;
6848
6849 // TODO: can we be smarter about machine pointer info?
6850 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6851 MachineMemOperand *MMO = MF.getMachineMemOperand(
6852 PtrInfo,
6853 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6854 MachineMemOperand::MOInvariant,
6855 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
6856
6857 // Pointer address
6858 Register LoadAddr = MRI.createGenericVirtualRegister(
6859 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6860 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
6861 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
6862 // Load address
6863 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
6864 B.buildCopy(Res: SGPR01, Op: Temp);
6865 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6866 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6867 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
6868 MI.eraseFromParent();
6869 return true;
6870 }
6871
6872 // Pass queue pointer to trap handler as input, and insert trap instruction
6873 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6874 Register LiveIn =
6875 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6876 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
6877 return false;
6878
6879 B.buildCopy(Res: SGPR01, Op: LiveIn);
6880 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6881 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6882 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
6883
6884 MI.eraseFromParent();
6885 return true;
6886}
6887
6888bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6889 MachineRegisterInfo &MRI,
6890 MachineIRBuilder &B) const {
6891 // We need to simulate the 's_trap 2' instruction on targets that run in
6892 // PRIV=1 (where it is treated as a nop).
6893 if (ST.hasPrivEnabledTrap2NopBug()) {
6894 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
6895 DL: MI.getDebugLoc());
6896 MI.eraseFromParent();
6897 return true;
6898 }
6899
6900 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6901 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6902 MI.eraseFromParent();
6903 return true;
6904}
6905
6906bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6907 MachineRegisterInfo &MRI,
6908 MachineIRBuilder &B) const {
6909 // If this is a non-HSA path or the trap handler is disabled, report a
6910 // warning accordingly.
6911 if (!ST.isTrapHandlerEnabled() ||
6912 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6913 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6914 "debugtrap handler not supported",
6915 MI.getDebugLoc(), DS_Warning);
6916 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6917 Ctx.diagnose(DI: NoTrap);
6918 } else {
6919 // Insert debug-trap instruction
6920 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6921 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6922 }
6923
6924 MI.eraseFromParent();
6925 return true;
6926}
6927
6928bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6929 MachineIRBuilder &B) const {
6930 MachineRegisterInfo &MRI = *B.getMRI();
6931 const LLT S16 = LLT::scalar(SizeInBits: 16);
6932 const LLT S32 = LLT::scalar(SizeInBits: 32);
6933 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6934 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
6935
6936 Register DstReg = MI.getOperand(i: 0).getReg();
6937 Register NodePtr = MI.getOperand(i: 2).getReg();
6938 Register RayExtent = MI.getOperand(i: 3).getReg();
6939 Register RayOrigin = MI.getOperand(i: 4).getReg();
6940 Register RayDir = MI.getOperand(i: 5).getReg();
6941 Register RayInvDir = MI.getOperand(i: 6).getReg();
6942 Register TDescr = MI.getOperand(i: 7).getReg();
6943
6944 if (!ST.hasGFX10_AEncoding()) {
6945 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6946 "intrinsic not supported on subtarget",
6947 MI.getDebugLoc());
6948 B.getMF().getFunction().getContext().diagnose(DI: BadIntrin);
6949 return false;
6950 }
6951
6952 const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
6953 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
6954 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
6955 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
6956 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
6957 const unsigned NumVDataDwords = 4;
6958 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6959 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6960 const bool UseNSA =
6961 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6962
6963 const unsigned BaseOpcodes[2][2] = {
6964 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6965 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6966 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6967 int Opcode;
6968 if (UseNSA) {
6969 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
6970 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6971 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6972 : AMDGPU::MIMGEncGfx10NSA,
6973 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
6974 } else {
6975 assert(!IsGFX12Plus);
6976 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
6977 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6978 : AMDGPU::MIMGEncGfx10Default,
6979 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
6980 }
6981 assert(Opcode != -1);
6982
6983 SmallVector<Register, 12> Ops;
6984 if (UseNSA && IsGFX11Plus) {
6985 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6986 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
6987 auto Merged = B.buildMergeLikeInstr(
6988 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
6989 Ops.push_back(Elt: Merged.getReg(Idx: 0));
6990 };
6991
6992 Ops.push_back(Elt: NodePtr);
6993 Ops.push_back(Elt: RayExtent);
6994 packLanes(RayOrigin);
6995
6996 if (IsA16) {
6997 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
6998 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
6999 auto MergedDir = B.buildMergeLikeInstr(
7000 Res: V3S32,
7001 Ops: {B.buildBitcast(
7002 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
7003 UnmergeRayDir.getReg(Idx: 0)}))
7004 .getReg(Idx: 0),
7005 B.buildBitcast(
7006 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
7007 UnmergeRayDir.getReg(Idx: 1)}))
7008 .getReg(Idx: 0),
7009 B.buildBitcast(
7010 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
7011 UnmergeRayDir.getReg(Idx: 2)}))
7012 .getReg(Idx: 0)});
7013 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
7014 } else {
7015 packLanes(RayDir);
7016 packLanes(RayInvDir);
7017 }
7018 } else {
7019 if (Is64) {
7020 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
7021 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7022 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7023 } else {
7024 Ops.push_back(Elt: NodePtr);
7025 }
7026 Ops.push_back(Elt: RayExtent);
7027
7028 auto packLanes = [&Ops, &S32, &B](Register Src) {
7029 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7030 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7031 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7032 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
7033 };
7034
7035 packLanes(RayOrigin);
7036 if (IsA16) {
7037 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7038 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7039 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
7040 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
7041 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
7042 B.buildMergeLikeInstr(Res: R1,
7043 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
7044 B.buildMergeLikeInstr(
7045 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
7046 B.buildMergeLikeInstr(
7047 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
7048 Ops.push_back(Elt: R1);
7049 Ops.push_back(Elt: R2);
7050 Ops.push_back(Elt: R3);
7051 } else {
7052 packLanes(RayDir);
7053 packLanes(RayInvDir);
7054 }
7055 }
7056
7057 if (!UseNSA) {
7058 // Build a single vector containing all the operands so far prepared.
7059 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
7060 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
7061 Ops.clear();
7062 Ops.push_back(Elt: MergedOps);
7063 }
7064
7065 auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7066 .addDef(RegNo: DstReg)
7067 .addImm(Val: Opcode);
7068
7069 for (Register R : Ops) {
7070 MIB.addUse(RegNo: R);
7071 }
7072
7073 MIB.addUse(RegNo: TDescr)
7074 .addImm(Val: IsA16 ? 1 : 0)
7075 .cloneMemRefs(OtherMI: MI);
7076
7077 MI.eraseFromParent();
7078 return true;
7079}
7080
7081bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7082 MachineIRBuilder &B) const {
7083 unsigned Opc;
7084 int RoundMode = MI.getOperand(i: 2).getImm();
7085
7086 if (RoundMode == (int)RoundingMode::TowardPositive)
7087 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7088 else if (RoundMode == (int)RoundingMode::TowardNegative)
7089 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7090 else
7091 return false;
7092
7093 B.buildInstr(Opcode: Opc)
7094 .addDef(RegNo: MI.getOperand(i: 0).getReg())
7095 .addUse(RegNo: MI.getOperand(i: 1).getReg());
7096
7097 MI.eraseFromParent();
7098
7099 return true;
7100}
7101
7102bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7103 MachineIRBuilder &B) const {
7104 const SITargetLowering *TLI = ST.getTargetLowering();
7105 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7106 Register DstReg = MI.getOperand(i: 0).getReg();
7107 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7108 MI.eraseFromParent();
7109 return true;
7110}
7111
7112bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7113 MachineIRBuilder &B) const {
7114 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
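// That is (illustrative): waveID = (TTMP8 >> 25) & 0x1f, which the G_UBFX
// built below computes.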
7115 if (!ST.hasArchitectedSGPRs())
7116 return false;
7117 LLT S32 = LLT::scalar(SizeInBits: 32);
7118 Register DstReg = MI.getOperand(i: 0).getReg();
7119 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7120 auto LSB = B.buildConstant(Res: S32, Val: 25);
7121 auto Width = B.buildConstant(Res: S32, Val: 5);
7122 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7123 MI.eraseFromParent();
7124 return true;
7125}
7126
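// Hardware register bitfield descriptors (register ID, offset, width) for the
// slices of the MODE and TRAPSTS registers that the FP-environment lowerings
// below read and write.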
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);

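/// Lower an FP-environment read: fetch the MODE and TRAPSTS slices with
/// llvm.amdgcn.s.getreg and merge the two 32-bit values into the 64-bit
/// result register.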
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}

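/// Lower an FP-environment write: split the 64-bit value into two 32-bit
/// halves and write them to the MODE and TRAPSTS slices with
/// llvm.amdgcn.s.setreg.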
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvModeBitField))
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}

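/// Custom legalization entry point for target intrinsics. Control-flow
/// intrinsics are rewritten into SI control-flow pseudos, most others are
/// dispatched to dedicated helpers, image intrinsics are handled in the
/// default case at the bottom of the switch, and intrinsics with no special
/// handling are reported legal as-is.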
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // For the control-flow intrinsics, replace the G_BRCOND that uses the
  // intrinsic's result with the exec-manipulation and branch pseudos.
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
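    // llvm.amdgcn.if/else is always paired with a G_BRCOND on its boolean
    // result; fold the pair into a single SI_IF/SI_ELSE pseudo and retarget
    // the branches, swapping the destinations if the condition was negated.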
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
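    // llvm.amdgcn.loop is likewise paired with a G_BRCOND; lower the pair to
    // SI_LOOP and rewire the branch targets the same way as for if/else above.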
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
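    // The index operand (operand 5 here, operand 7 for the integer variants
    // below) must be at least 32 bits wide; any-extend narrower indices.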
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
    return legalizeLaneOp(Helper, MI, IntrID);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
