AMDGPUInstCombineIntrinsic.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp]

1	//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// \file
10	// This file implements a TargetTransformInfo analysis pass specific to the
11	// AMDGPU target machine. It uses the target's detailed information to provide
12	// more precise answers to certain TTI queries, while letting the target
13	// independent and default TTI implementations handle the rest.
14	//
15	//===----------------------------------------------------------------------===//
16
17	#include "AMDGPUInstrInfo.h"
18	#include "AMDGPUTargetTransformInfo.h"
19	#include "GCNSubtarget.h"
20	#include "SIDefines.h"
21	#include "llvm/ADT/FloatingPointMode.h"
22	#include "llvm/ADT/STLExtras.h"
23	#include "llvm/ADT/Sequence.h"
24	#include "llvm/Analysis/ConstantFolding.h"
25	#include "llvm/Analysis/ValueTracking.h"
26	#include "llvm/IR/Constants.h"
27	#include "llvm/IR/Dominators.h"
28	#include "llvm/IR/IntrinsicsAMDGPU.h"
29	#include "llvm/Support/MathExtras.h"
30	#include "llvm/Transforms/InstCombine/InstCombiner.h"
31	#include <optional>
32
33	using namespace llvm;
34	using namespace llvm::PatternMatch;
35
36	#define DEBUG_TYPE "AMDGPUtti"
37
38	namespace {
39
40	struct AMDGPUImageDMaskIntrinsic {
41	unsigned Intr;
42	};
43
44	#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45	#include "AMDGPUGenSearchableTables.inc"
46
47	} // end anonymous namespace
48
49	// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50	//
51	// A single NaN input is folded to minnum, so we rely on that folding for
52	// handling NaNs.
53	static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54	const APFloat &Src2) {
55	assert(!Src0.isNaN() && !Src1.isNaN() && !Src2.isNaN() &&
56	"nans handled separately");
57	APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
58
59	if (Max3.bitwiseIsEqual(RHS: Src0))
60	return maxnum(A: Src1, B: Src2);
61
62	if (Max3.bitwiseIsEqual(RHS: Src1))
63	return maxnum(A: Src0, B: Src2);
64
65	return maxnum(A: Src0, B: Src1);
66	}
67
68	// Check if a value can be converted to a 16-bit value without losing precision.
69	// The value is expected to be either a float (IsFloat = true) or an unsigned
70	// integer (IsFloat = false). When AllowI16SExt is set, a sext from i16 is also
71	// accepted: for unsigned addresses sext and zext only differ for a negative
72	// i16, which is out of bounds anyway (see caller).
73	static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat,
74	bool AllowI16SExt = false) {
75	Type *VTy = V.getType();
76	if (VTy->isHalfTy() \|\| VTy->isIntegerTy(BitWidth: `16`)) {
77	// The value is already 16-bit, so we don't want to convert to 16-bit again!
78	return false;
79	}
80	if (IsFloat) {
81	if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
82	// We need to check that if we cast the index down to a half, we do not
83	// lose precision.
84	APFloat FloatValue(ConstFloat->getValueAPF());
85	bool LosesInfo = true;
86	FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
87	losesInfo: &LosesInfo);
88	return !LosesInfo;
89	}
90	} else {
91	if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
92	// We need to check that if we cast the index down to an i16, we do not
93	// lose precision.
94	APInt IntValue(ConstInt->getValue());
95	return IntValue.getActiveBits() <= `16`;
96	}
97	}
98
99	// Coordinates may arrive as extractelement((s\|z\|fp)ext Vec), Idx. The
100	// widening cast has one use per lane, so it is never sunk into the extract;
101	// strip the extract here so the cast check below is common to scalar and
102	// vector coords.
103	Value *CastCandidate;
104	if (!match(V: &V, P: m_ExtractElt(Val: m_Value(V&: CastCandidate), Idx: m_Value())))
105	CastCandidate = &V;
106
107	Value *CastSrc;
108	bool IsExt = IsFloat ? match(V: CastCandidate, P: m_FPExt(Op: m_Value(V&: CastSrc)))
109	: match(V: CastCandidate, P: m_ZExt(Op: m_Value(V&: CastSrc)));
110	if (!IsExt && !IsFloat && AllowI16SExt)
111	IsExt = match(V: CastCandidate, P: m_SExt(Op: m_Value(V&: CastSrc)));
112	if (IsExt) {
113	Type *CastSrcTy = CastSrc->getType()->getScalarType();
114	if (CastSrcTy->isHalfTy() \|\| CastSrcTy->isIntegerTy(BitWidth: `16`))
115	return true;
116	}
117
118	return false;
119	}
120
121	// Convert a value to 16-bit.
122	static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
123	Type *VTy = V.getType();
124	if (isa<FPExtInst, SExtInst, ZExtInst>(Val: &V))
125	return cast<Instruction>(Val: &V)->getOperand(i: `0`);
126	// Vector form: extractelement((s\|z\|fp)ext Vec), Idx -> extractelement(Vec,
127	// Idx), taking the narrow lane directly so the widening cast can be removed.
128	Instruction *VecCast;
129	Value *Idx;
130	if (match(V: &V, P: m_ExtractElt(Val: m_Instruction(I&: VecCast), Idx: m_Value(V&: Idx))) &&
131	isa<FPExtInst, SExtInst, ZExtInst>(Val: VecCast))
132	return Builder.CreateExtractElement(Vec: VecCast->getOperand(i: `0`), Idx);
133	if (VTy->isIntegerTy())
134	return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
135	if (VTy->isFloatingPointTy())
136	return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
137
138	llvm_unreachable("Should never be called!");
139	}
140
141	/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
142	/// modified arguments (based on OldIntr) and replaces InstToReplace with
143	/// this newly created intrinsic call.
144	static std::optional<Instruction *> modifyIntrinsicCall(
145	IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
146	InstCombiner &IC,
147	std::function<void(SmallVectorImpl<Value > &, SmallVectorImpl<Type > &)>
148	Func) {
149	SmallVector<Type *, `4`> OverloadTys;
150	if (!Intrinsic::isSignatureValid(F: OldIntr.getCalledFunction(), OverloadTys))
151	return std::nullopt;
152
153	SmallVector<Value *, `8`> Args(OldIntr.args());
154
155	// Modify arguments and types
156	Func (Args, OverloadTys);
157
158	CallInst *NewCall =
159	IC.Builder.CreateIntrinsicWithoutFolding(ID: NewIntr, OverloadTypes: OverloadTys, Args);
160	NewCall->takeName(V: &OldIntr);
161	NewCall->copyMetadata(SrcInst: OldIntr);
162	if (isa<FPMathOperator>(Val: NewCall))
163	NewCall->copyFastMathFlags(I: &OldIntr);
164	// Copy attributes
165	AttributeList OldAttrList = OldIntr.getAttributes();
166	NewCall->setAttributes(OldAttrList);
167
168	// Erase and replace uses
169	if (!InstToReplace.getType()->isVoidTy())
170	IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
171
172	bool RemoveOldIntr = &OldIntr != &InstToReplace;
173
174	auto *RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
175	if (RemoveOldIntr)
176	IC.eraseInstFromFunction(I&: OldIntr);
177
178	return RetValue;
179	}
180
181	static std::optional<Instruction *>
182	simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
183	const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
184	IntrinsicInst &II, InstCombiner &IC) {
185	// Optimize _L to _LZ when _L is zero
186	if (const auto *LZMappingInfo =
187	AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
188	if (auto *ConstantLod =
189	dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
190	if (ConstantLod->isZero() \|\| ConstantLod->isNegative()) {
191	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
192	AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
193	Dim: ImageDimIntr->Dim);
194	return modifyIntrinsicCall(
195	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
196	Args.erase(Args.begin() + ImageDimIntr->LodIndex);
197	});
198	}
199	}
200	}
201
202	// Optimize _mip away, when 'lod' is zero
203	if (const auto *MIPMappingInfo =
204	AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
205	if (auto *ConstantMip =
206	dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
207	if (ConstantMip->isZero()) {
208	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
209	AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
210	Dim: ImageDimIntr->Dim);
211	return modifyIntrinsicCall(
212	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
213	Args.erase(Args.begin() + ImageDimIntr->MipIndex);
214	});
215	}
216	}
217	}
218
219	// Optimize _bias away when 'bias' is zero
220	if (const auto *BiasMappingInfo =
221	AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
222	if (auto *ConstantBias =
223	dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
224	if (ConstantBias->isZero()) {
225	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
226	AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
227	Dim: ImageDimIntr->Dim);
228	return modifyIntrinsicCall(
229	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
230	Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
231	ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
232	});
233	}
234	}
235	}
236
237	// Optimize _offset away when 'offset' is zero
238	if (const auto *OffsetMappingInfo =
239	AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
240	if (auto *ConstantOffset =
241	dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
242	if (ConstantOffset->isZero()) {
243	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
244	AMDGPU::getImageDimIntrinsicByBaseOpcode(
245	BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
246	return modifyIntrinsicCall(
247	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
248	Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
249	});
250	}
251	}
252	}
253
254	// Try to use D16
255	if (ST->hasD16Images()) {
256
257	const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
258	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
259
260	if (BaseOpcode->HasD16) {
261
262	// If the only use of image intrinsic is a fptrunc (with conversion to
263	// half) then both fptrunc and image intrinsic will be replaced with image
264	// intrinsic with D16 flag.
265	if (II.hasOneUse()) {
266	Instruction *User = II.user_back();
267
268	if (User->getOpcode() == Instruction::FPTrunc &&
269	User->getType()->getScalarType()->isHalfTy()) {
270
271	return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
272	Func: [&](auto &Args, auto &ArgTys) {
273	// Change return type of image intrinsic.
274	// Set it to return type of fptrunc.
275	ArgTys[`0`] = User->getType();
276	});
277	}
278	}
279
280	// Only perform D16 folding if every user of the image sample is
281	// an ExtractElementInst immediately followed by an FPTrunc to half.
282	SmallVector<std::pair<ExtractElementInst , FPTruncInst >, `4`>
283	ExtractTruncPairs;
284	bool AllHalfExtracts = true;
285
286	for (User *U : II.users()) {
287	auto *Ext = dyn_cast<ExtractElementInst>(Val: U);
288	if (!Ext \|\| !Ext->hasOneUse()) {
289	AllHalfExtracts = false;
290	break;
291	}
292
293	auto Tr = dyn_cast<FPTruncInst>(Val: Ext->user_begin());
294	if (!Tr \|\| !Tr->getType()->isHalfTy()) {
295	AllHalfExtracts = false;
296	break;
297	}
298
299	ExtractTruncPairs.emplace_back(Args&: Ext, Args&: Tr);
300	}
301
302	if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
303	auto *VecTy = cast<VectorType>(Val: II.getType());
304	Type *HalfVecTy =
305	VecTy->getWithNewType(EltTy: Type::getHalfTy(C&: II.getContext()));
306
307	// Obtain the original image sample intrinsic's signature
308	// and replace its return type with the half-vector for D16 folding
309	SmallVector<Type *, `8`> OverloadTys;
310	if (!Intrinsic::isSignatureValid(F: II.getCalledFunction(), OverloadTys))
311	return std::nullopt;
312
313	OverloadTys [`0`] = HalfVecTy;
314	Module *M = II.getModule();
315	Function *HalfDecl = Intrinsic::getOrInsertDeclaration(
316	M, id: ImageDimIntr->Intr, OverloadTys);
317
318	II.mutateType(Ty: HalfVecTy);
319	II.setCalledFunction(HalfDecl);
320
321	IRBuilder<> Builder(II.getContext());
322	for (auto &[Ext, Tr] : ExtractTruncPairs) {
323	Value *Idx = Ext->getIndexOperand();
324
325	Builder.SetInsertPoint(Tr);
326
327	Value *HalfExtract = Builder.CreateExtractElement(Vec: &II, Idx);
328	HalfExtract->takeName(V: Tr);
329
330	Tr->replaceAllUsesWith(V: HalfExtract);
331	}
332
333	for (auto &[Ext, Tr] : ExtractTruncPairs) {
334	IC.eraseInstFromFunction(I&: *Tr);
335	IC.eraseInstFromFunction(I&: *Ext);
336	}
337
338	return &II;
339	}
340	}
341	}
342
343	// Try to use A16 or G16
344	if (!ST->hasA16() && !ST->hasG16())
345	return std::nullopt;
346
347	// Address is interpreted as float if the instruction has a sampler or as
348	// unsigned int if there is no sampler.
349	bool HasSampler =
350	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
351	bool FloatCoord = false;
352	// true means derivatives can be converted to 16 bit, coordinates not
353	bool OnlyDerivatives = false;
354
355	// Sampler-less addresses are unsigned, so a sext from i16 folds to a16 like a
356	// zext: they only disagree for a negative i16 (>= 0x8000), which is out of
357	// bounds while the max image dimension is <= 0x8000.
358	bool AllowI16SExt = !HasSampler;
359
360	for (unsigned OperandIndex = ImageDimIntr->GradientStart;
361	OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
362	Value *Coord = II.getOperand(i_nocapture: OperandIndex);
363	// If the values are not derived from 16-bit values, we cannot optimize.
364	if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler, AllowI16SExt)) {
365	if (OperandIndex < ImageDimIntr->CoordStart \|\|
366	ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
367	return std::nullopt;
368	}
369	// All gradients can be converted, so convert only them
370	OnlyDerivatives = true;
371	break;
372	}
373
374	assert(OperandIndex == ImageDimIntr->GradientStart \|\|
375	FloatCoord == Coord->getType()->isFloatingPointTy());
376	FloatCoord = Coord->getType()->isFloatingPointTy();
377	}
378
379	if (!OnlyDerivatives && !ST->hasA16())
380	OnlyDerivatives = true; // Only supports G16
381
382	// Check if there is a bias parameter and if it can be converted to f16
383	if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != `0`) {
384	Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
385	assert(HasSampler &&
386	"Only image instructions with a sampler can have a bias");
387	if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
388	OnlyDerivatives = true;
389	}
390
391	if (OnlyDerivatives && (!ST->hasG16() \|\| ImageDimIntr->GradientStart ==
392	ImageDimIntr->CoordStart))
393	return std::nullopt;
394
395	Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
396	: Type::getInt16Ty(C&: II.getContext());
397
398	return modifyIntrinsicCall(
399	OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
400	ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
401	if (!OnlyDerivatives) {
402	ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
403
404	// Change the bias type
405	if (ImageDimIntr->NumBiasArgs != `0`)
406	ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
407	}
408
409	unsigned EndIndex =
410	OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
411	for (unsigned OperandIndex = ImageDimIntr->GradientStart;
412	OperandIndex < EndIndex; OperandIndex++) {
413	Args[OperandIndex] =
414	convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
415	}
416
417	// Convert the bias
418	if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != `0`) {
419	Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
420	Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
421	}
422	});
423	}
424
425	bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
426	const Value Op0, const* Value *Op1,
427	InstCombiner &IC) const {
428	// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
429	// infinity, gives +0.0. If we can prove we don't have one of the special
430	// cases then we can use a normal multiply instead.
431	// TODO: Create and use isKnownFiniteNonZero instead of just matching
432	// constants here.
433	if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) \|\|
434	match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
435	// One operand is not zero or infinity or NaN.
436	return true;
437	}
438
439	SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
440	if (isKnownNeverInfOrNaN(V: Op0, SQ) && isKnownNeverInfOrNaN(V: Op1, SQ)) {
441	// Neither operand is infinity or NaN.
442	return true;
443	}
444	return false;
445	}
446
447	/// Match an fpext from half to float, or a constant we can convert.
448	static Value matchFPExtFromF16(Value Arg) {
449	Value Src = nullptr*;
450	ConstantFP CFP = nullptr*;
451	if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: Src))))) {
452	if (Src->getType()->isHalfTy())
453	return Src;
454	} else if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
455	bool LosesInfo;
456	APFloat Val(CFP->getValueAPF());
457	Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
458	if (!LosesInfo)
459	return ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
460	}
461	return nullptr;
462	}
463
464	// Trim all zero components from the end of the vector \p UseV and return
465	// an appropriate bitset with known elements.
466	static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
467	Instruction *I) {
468	auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
469	unsigned VWidth = VTy->getNumElements();
470	APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
471
472	for (int i = VWidth - `1`; i > `0`; --i) {
473	auto *Elt = findScalarElement(V: UseV, EltNo: i);
474	if (!Elt)
475	break;
476
477	if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
478	if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
479	break;
480	} else {
481	break;
482	}
483
484	DemandedElts.clearBit(BitPosition: i);
485	}
486
487	return DemandedElts;
488	}
489
490	// Trim elements of the end of the vector \p V, if they are
491	// equal to the first element of the vector.
492	static APInt defaultComponentBroadcast(Value *V) {
493	auto *VTy = cast<FixedVectorType>(Val: V->getType());
494	unsigned VWidth = VTy->getNumElements();
495	APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
496	Value *FirstComponent = findScalarElement(V, EltNo: `0`);
497
498	SmallVector<int> ShuffleMask;
499	if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
500	SVI->getShuffleMask(Result&: ShuffleMask);
501
502	for (int I = VWidth - `1`; I > `0`; --I) {
503	if (ShuffleMask.empty()) {
504	auto *Elt = findScalarElement(V, EltNo: I);
505	if (!Elt \|\| (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
506	break;
507	} else {
508	// Detect identical elements in the shufflevector result, even though
509	// findScalarElement cannot tell us what that element is.
510	if (ShuffleMask [I] != ShuffleMask [`0`] && ShuffleMask [I] != PoisonMaskElem)
511	break;
512	}
513	DemandedElts.clearBit(BitPosition: I);
514	}
515
516	return DemandedElts;
517	}
518
519	static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
520	IntrinsicInst &II,
521	APInt DemandedElts,
522	int DMaskIdx = -`1`,
523	bool IsLoad = true);
524
525	/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
526	static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
527	return (SqrtOp->getType()->isFloatTy() &&
528	(SqrtOp->hasApproxFunc() \|\| SqrtOp->getFPAccuracy() >= `1.0f`)) \|\|
529	SqrtOp->getType()->isHalfTy();
530	}
531
532	/// Return true if we can easily prove that use U is uniform.
533	static bool isTriviallyUniform(const Use &U) {
534	Value *V = U.get();
535	if (isa<Constant>(Val: V))
536	return true;
537	if (const auto *A = dyn_cast<Argument>(Val: V))
538	return AMDGPU::isArgPassedInSGPR(Arg: A);
539	if (const auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
540	if (!AMDGPU::isIntrinsicAlwaysUniform(IntrID: II->getIntrinsicID()))
541	return false;
542	// If II and U are in different blocks then there is a possibility of
543	// temporal divergence.
544	return II->getParent() == cast<Instruction>(Val: U.getUser())->getParent();
545	}
546	return false;
547	}
548
549	/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
550	///
551	/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
552	bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
553	IntrinsicInst &II,
554	unsigned LaneArgIdx) const {
555	unsigned MaskBits = ST->getWavefrontSizeLog2();
556	APInt DemandedMask(`32`, maskTrailingOnes<unsigned>(N: MaskBits));
557
558	KnownBits Known(`32`);
559	if (IC.SimplifyDemandedBits(I: &II, OpNo: LaneArgIdx, DemandedMask, Known))
560	return true;
561
562	if (!Known.isConstant())
563	return false;
564
565	// Out of bounds indexes may appear in wave64 code compiled for wave32.
566	// Unlike the DAG version, SimplifyDemandedBits does not change constants, so
567	// manually fix it up.
568
569	Value *LaneArg = II.getArgOperand(i: LaneArgIdx);
570	Constant *MaskedConst =
571	ConstantInt::get(Ty: LaneArg->getType(), V: Known.getConstant() & DemandedMask);
572	if (MaskedConst != LaneArg) {
573	II.getOperandUse(i: LaneArgIdx).set(MaskedConst);
574	return true;
575	}
576
577	return false;
578	}
579
580	static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
581	Function &NewCallee, ArrayRef<Value *> Ops) {
582	SmallVector<OperandBundleDef, `2`> OpBundles;
583	Old.getOperandBundlesAsDefs(Defs&: OpBundles);
584
585	CallInst *NewCall = B.CreateCall(Callee: &NewCallee, Args: Ops, OpBundles);
586	NewCall->takeName(V: &Old);
587	return NewCall;
588	}
589
590	// Return true for sequences of instructions that effectively assign
591	// each lane to its thread ID
592	static bool isThreadID(const GCNSubtarget &ST, Value *V) {
593	// Case 1:
594	// wave32: mbcnt_lo(-1, 0)
595	// wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
596	auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Ops: m_ConstantInt<-`1`>(),
597	Ops: m_ConstantInt<`0`>());
598	auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
599	Ops: m_ConstantInt<-`1`>(), Ops: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
600	Ops: m_ConstantInt<-`1`>(), Ops: m_ConstantInt<`0`>()));
601	if (ST.isWave32() && match(V, P: W32Pred))
602	return true;
603	if (ST.isWave64() && match(V, P: W64Pred))
604	return true;
605
606	return false;
607	}
608
609	Instruction *
610	GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
611	IntrinsicInst &II) const {
612	const auto IID = II.getIntrinsicID();
613	assert(IID == Intrinsic::amdgcn_readlane \|\|
614	IID == Intrinsic::amdgcn_readfirstlane \|\|
615	IID == Intrinsic::amdgcn_permlane64);
616
617	Instruction *OpInst = dyn_cast<Instruction>(Val: II.getOperand(i_nocapture: `0`));
618
619	// Only do this if both instructions are in the same block
620	// (so the exec mask won't change) and the readlane is the only user of its
621	// operand.
622	if (!OpInst \|\| !OpInst->hasOneUser() \|\| OpInst->getParent() != II.getParent())
623	return nullptr;
624
625	const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
626
627	// If this is a readlane, check that the second operand is a constant, or is
628	// defined before OpInst so we know it's safe to move this intrinsic higher.
629	Value LaneID = nullptr*;
630	if (IsReadLane) {
631	LaneID = II.getOperand(i_nocapture: `1`);
632
633	// readlane take an extra operand for the lane ID, so we must check if that
634	// LaneID value can be used at the point where we want to move the
635	// intrinsic.
636	if (auto *LaneIDInst = dyn_cast<Instruction>(Val: LaneID)) {
637	if (!IC.getDominatorTree().dominates(Def: LaneIDInst, User: OpInst))
638	return nullptr;
639	}
640	}
641
642	// Hoist the intrinsic (II) through OpInst.
643	//
644	// (II (OpInst x)) -> (OpInst (II x))
645	const auto DoIt = [&](unsigned OpIdx,
646	Function NewIntrinsic) -> Instruction {
647	SmallVector<Value *, `2`> Ops{OpInst->getOperand(i: OpIdx)};
648	if (IsReadLane)
649	Ops.push_back(Elt: LaneID);
650
651	// Rewrite the intrinsic call.
652	CallInst NewII = rewriteCall(B&: IC.Builder, Old&: II, NewCallee&: NewIntrinsic, Ops);
653
654	// Rewrite OpInst so it takes the result of the intrinsic now.
655	Instruction &NewOp = *OpInst->clone();
656	NewOp.setOperand(i: OpIdx, Val: NewII);
657	return &NewOp;
658	};
659
660	// TODO(?): Should we do more with permlane64?
661	if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Val: OpInst))
662	return nullptr;
663
664	if (isa<UnaryOperator>(Val: OpInst))
665	return DoIt (`0`, II.getCalledFunction());
666
667	if (isa<CastInst>(Val: OpInst)) {
668	Value *Src = OpInst->getOperand(i: `0`);
669	Type *SrcTy = Src->getType();
670	if (!isTypeLegal(Ty: SrcTy))
671	return nullptr;
672
673	Function *Remangled =
674	Intrinsic::getOrInsertDeclaration(M: II.getModule(), id: IID, OverloadTys: {SrcTy});
675	return DoIt (`0`, Remangled);
676	}
677
678	// We can also hoist through binary operators if the other operand is uniform.
679	if (isa<BinaryOperator>(Val: OpInst)) {
680	// FIXME: If we had access to UniformityInfo here we could just check
681	// if the operand is uniform.
682	if (isTriviallyUniform(U: OpInst->getOperandUse(i: `0`)))
683	return DoIt (`1`, II.getCalledFunction());
684	if (isTriviallyUniform(U: OpInst->getOperandUse(i: `1`)))
685	return DoIt (`0`, II.getCalledFunction());
686	}
687
688	return nullptr;
689	}
690
691	/// Evaluate V as a function of the lane ID and return its value on Lane, or
692	/// std::nullopt if V is not a closed-form expression of the lane ID.
693	static std::optional<unsigned> evalLaneExpr(Value V, unsigned* Lane,
694	const GCNSubtarget &ST,
695	const DataLayout &DL,
696	unsigned Depth = `0`) {
697	if (Depth >= MaxAnalysisRecursionDepth)
698	return std::nullopt;
699
700	// Poison/undef in the index expression: bail and let InstCombine fold the
701	// intrinsic the usual way.
702	if (isa<UndefValue>(Val: V))
703	return std::nullopt;
704
705	if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val: V))
706	return CI->getZExtValue();
707
708	if (isThreadID(ST, V))
709	return Lane;
710
711	const BinaryOperator *BO = dyn_cast<BinaryOperator>(Val: V);
712	if (!BO)
713	return std::nullopt;
714
715	std::optional<unsigned> LHS =
716	evalLaneExpr(V: BO->getOperand(i_nocapture: `0`), Lane, ST, DL, Depth: Depth + `1`);
717	if (!LHS)
718	return std::nullopt;
719	std::optional<unsigned> RHS =
720	evalLaneExpr(V: BO->getOperand(i_nocapture: `1`), Lane, ST, DL, Depth: Depth + `1`);
721	if (!RHS)
722	return std::nullopt;
723
724	Type *Ty = BO->getType();
725	Constant Ops[] = {ConstantInt::get(Ty, V: LHS), ConstantInt::get(Ty, V: *RHS)};
726	auto *CI =
727	dyn_cast_or_null<ConstantInt>(Val: ConstantFoldInstOperands(I: BO, Ops, DL));
728	return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
729	}
730
731	/// Build the per-lane shuffle map by evaluating Index for every lane in the
732	/// wave. Returns false if any lane index is non-constant or out of range.
733	static bool tryBuildShuffleMap(Value Index, const* GCNSubtarget &ST,
734	SmallVectorImpl<uint8_t> &Ids,
735	const DataLayout &DL) {
736	unsigned WaveSize = ST.getWavefrontSize();
737	Ids.resize(N: WaveSize);
738	for (unsigned Lane : seq(Size: WaveSize)) {
739	std::optional<unsigned> Val = evalLaneExpr(V: Index, Lane, ST, DL);
740	if (!Val \|\| *Val >= WaveSize)
741	return false;
742	Ids [Lane] = *Val;
743	}
744	return true;
745	}
746
747	/// Lanes are partitioned into groups of Period; each group is a translated
748	/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
749	template <unsigned Period>
750	static bool hasPeriodicLayout(ArrayRef<uint8_t> Ids) {
751	static_assert(isPowerOf2_32(Value: Period), "Period must be a power of two");
752	for (unsigned I = Period, E = Ids.size(); I < E; ++I)
753	if (Ids [I] != Ids [I % Period] + (I & ~(Period - `1`)))
754	return false;
755	return true;
756	}
757
758	/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
759	/// in the same N-lane row, and the pattern repeats periodically across rows.
760	template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
761	for (unsigned I = `0`; I < N; ++I)
762	if (Ids [I] >= N)
763	return false;
764	return hasPeriodicLayout<N>(Ids);
765	}
766
767	static constexpr auto isQuadPattern = isRowPattern<`4`>;
768	static constexpr auto isHalfRowPattern = isRowPattern<`8`>;
769	static constexpr auto isFullRowPattern = isRowPattern<`16`>;
770
771	/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
772	/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
773	/// [7:6]=Ids[3].
774	static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
775	if (!isQuadPattern(Ids))
776	return std::nullopt;
777	return Ids [`3`] << `6` \| Ids [`2`] << `4` \| Ids [`1`] << `2` \| Ids [`0`];
778	}
779
780	/// Match an N-lane reversal (mirror) pattern.
781	template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
782	if (!isRowPattern<N>(Ids))
783	return false;
784	for (unsigned J = `0`; J < N; ++J)
785	if (Ids [J] != (N - `1`) - J)
786	return false;
787	return true;
788	}
789
790	static constexpr auto matchHalfRowMirrorPattern = matchMirrorPattern<`8`>;
791	static constexpr auto matchFullRowMirrorPattern = matchMirrorPattern<`16`>;
792
793	/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
794	static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
795	if (Ids [`0`] == `0` \|\| !isFullRowPattern(Ids))
796	return std::nullopt;
797	for (unsigned J = `1`; J < `16`; ++J)
798	if (Ids [J] != (Ids [`0`] + J) % `16`)
799	return std::nullopt;
800	return `16u` - Ids [`0`];
801	}
802
803	/// Match a row-share pattern: all 16 lanes of each row read the same source
804	/// lane. Returns the shared source lane index in [0, 16).
805	static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
806	if (!isFullRowPattern(Ids))
807	return std::nullopt;
808	if (!all_equal(Range: Ids.take_front(N: `16`)))
809	return std::nullopt;
810	return Ids [`0`];
811	}
812
813	/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
814	/// with Mask in [1, 15].
815	static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
816	unsigned Mask = Ids [`0`];
817	if (Mask == `0` \|\| !isFullRowPattern(Ids))
818	return std::nullopt;
819	for (unsigned J = `0`; J < `16`; ++J)
820	if (Ids [J] != (Mask ^ J))
821	return std::nullopt;
822	return Mask;
823	}
824
825	/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
826	/// 24-bit selector (three bits per output lane).
827	static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
828	if (!isHalfRowPattern(Ids))
829	return std::nullopt;
830	unsigned Selector = `0`;
831	for (unsigned J = `0`; J < `8`; ++J)
832	Selector \|= Ids [J] << (J * `3`);
833	return Selector;
834	}
835
836	/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
837	/// lane, lane J in bits [J4 + 3 : J4]. The caller splits it into the low and
838	/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
839	static uint64_t computePermlane16Masks(ArrayRef<uint8_t> Ids) {
840	uint64_t Sel = `0`;
841	for (unsigned J = `0`; J < `16`; ++J)
842	Sel \|= static_cast<uint64_t>(Ids [J] & `0xF`) << (J * `4`);
843	return Sel;
844	}
845
846	/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
847	/// wave64 targets.
848	static bool matchHalfWaveSwapPattern(ArrayRef<uint8_t> Ids) {
849	if (Ids.size() != `64`)
850	return false;
851	for (unsigned J = `0`; J < `64`; ++J)
852	if (Ids [J] != (J ^ `32`))
853	return false;
854	return true;
855	}
856
857	/// Match a cross-row permutation suitable for v_permlanex16: every lane in
858	/// the low 16-lane half reads from the high half of its own row, and vice
859	/// versa.
860	static bool isCrossRowPattern(ArrayRef<uint8_t> Ids) {
861	if (!hasPeriodicLayout<`32`>(Ids))
862	return false;
863	for (unsigned J = `0`; J < `16`; ++J) {
864	if (Ids [J] < `16` \|\| Ids [J] >= `32`)
865	return false;
866	if (Ids [J + `16`] != Ids [J] - `16`)
867	return false;
868	}
869	return true;
870	}
871
872	/// Match a DS_SWIZZLE bitmask-mode permutation:
873	/// dst_lane = ((src_lane & AND) \| OR) ^ XOR
874	/// with each mask being five bits. Returns the encoded swizzle immediate.
875	/// The hardware applies the formula independently within each 32-lane group,
876	/// so on wave64 the high group must replicate the low one (translated by 32).
877	static std::optional<unsigned>
878	matchDsSwizzleBitmaskPattern(ArrayRef<uint8_t> Ids) {
879	if (!hasPeriodicLayout<`32`>(Ids))
880	return std::nullopt;
881
882	// The formula is per-bit: output bit B depends only on input bit B. Probe
883	// each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
884	// and XOR[B] carries the constant offset; otherwise it is a constant bit
885	// encoded in OR (with AND[B]=0, XOR[B]=0).
886	unsigned AndMask = `0`, OrMask = `0`, XorMask = `0`;
887	for (unsigned B = `0`; B < `5`; ++B) {
888	unsigned Bit0 = (Ids [`0`] >> B) & `1`;
889	unsigned Bit1 = (Ids [`1u` << B] >> B) & `1`;
890	if (Bit0 != Bit1) {
891	AndMask \|= `1u` << B;
892	XorMask \|= Bit0 << B;
893	} else {
894	OrMask \|= Bit0 << B;
895	}
896	}
897
898	// The per-bit derivation assumes bit independence; verify the masks
899	// actually reproduce every lane in the 32-lane group.
900	for (unsigned I : seq(Size: `32u`)) {
901	unsigned Expected = ((I & AndMask) \| OrMask) ^ XorMask;
902	if (Ids [I] != Expected)
903	return std::nullopt;
904	}
905
906	return AMDGPU::Swizzle::BITMASK_PERM_ENC \|
907	AndMask << AMDGPU::Swizzle::BITMASK_AND_SHIFT \|
908	OrMask << AMDGPU::Swizzle::BITMASK_OR_SHIFT \|
909	XorMask << AMDGPU::Swizzle::BITMASK_XOR_SHIFT;
910	}
911
912	/// Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation
913	/// of all 32 lanes within each 32-lane group by a constant N in [0, 31],
914	/// i.e. dst_lane = (src_lane + N) % 32. On wave64, hasPeriodicLayout<32>
915	/// ensures both 32-lane groups rotate by the same amount.
916	static std::optional<unsigned>
917	matchDsSwizzleRotatePattern(ArrayRef<uint8_t> Ids) {
918	if (!hasPeriodicLayout<`32`>(Ids))
919	return std::nullopt;
920
921	// Determine the rotation amount from lane 0: every lane must read from
922	// lane (I + N) % 32 where N = Ids[0] and 0 <= N <= 31.
923	unsigned N = Ids [`0`];
924	if (N >= `32`)
925	return std::nullopt;
926
927	for (unsigned I = `0`; I < `32`; ++I)
928	if (Ids [I] != (I + N) % `32`)
929	return std::nullopt;
930
931	return AMDGPU::Swizzle::ROTATE_MODE_ENC \|
932	(N << AMDGPU::Swizzle::ROTATE_SIZE_SHIFT);
933	}
934
935	/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
936	/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
937	/// be folded into a consuming VALU op by GCNDPPCombine.
938	static Value createUpdateDpp(IRBuilderBase &B, Value Val, unsigned Ctrl) {
939	Type *Ty = Val->getType();
940	return B.CreateIntrinsic(ID: Intrinsic::amdgcn_update_dpp, OverloadTypes: {Ty},
941	Args: {PoisonValue::get(T: Ty), Val, B.getInt32(C: Ctrl),
942	B.getInt32(C: `0xF`), B.getInt32(C: `0xF`), B.getTrue()});
943	}
944
945	/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
946	static Value createMovDpp8(IRBuilderBase &B, Value Val, unsigned Selector) {
947	return B.CreateIntrinsic(ID: Intrinsic::amdgcn_mov_dpp8, OverloadTypes: {Val->getType()},
948	Args: {Val, B.getInt32(C: Selector)});
949	}
950
951	/// Emit v_permlane16 with the precomputed lane-select halves.
952	static Value createPermlane16(IRBuilderBase &B, Value Val, uint32_t Lo,
953	uint32_t Hi) {
954	Type *Ty = Val->getType();
955	return B.CreateIntrinsic(ID: Intrinsic::amdgcn_permlane16, OverloadTypes: {Ty},
956	Args: {PoisonValue::get(T: Ty), Val, B.getInt32(C: Lo),
957	B.getInt32(C: Hi), B.getFalse(), B.getFalse()});
958	}
959
960	/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
961	/// lane reads from the other 16-lane half of the same row.
962	static Value createPermlaneX16(IRBuilderBase &B, Value Val, uint32_t Lo,
963	uint32_t Hi) {
964	Type *Ty = Val->getType();
965	return B.CreateIntrinsic(ID: Intrinsic::amdgcn_permlanex16, OverloadTypes: {Ty},
966	Args: {PoisonValue::get(T: Ty), Val, B.getInt32(C: Lo),
967	B.getInt32(C: Hi), B.getFalse(), B.getFalse()});
968	}
969
970	/// Emit ds_swizzle with the given immediate, bitcasting/converting between
971	/// pointer/float types and i32 as required by the intrinsic signature.
972	static Value createDsSwizzle(IRBuilderBase &B, Value Val, unsigned Offset,
973	const DataLayout &DL) {
974	Type *OrigTy = Val->getType();
975	assert(DL.getTypeSizeInBits(OrigTy) == `32` &&
976	"ds_swizzle only supports 32-bit operands");
977	IntegerType *I32Ty = B.getInt32Ty();
978	Value *Src = Val;
979	if (OrigTy->isPointerTy())
980	Src = B.CreatePtrToInt(V: Src, DestTy: I32Ty);
981	else if (OrigTy != I32Ty)
982	Src = B.CreateBitCast(V: Src, DestTy: I32Ty);
983	Value *Result = B.CreateIntrinsic(ID: Intrinsic::amdgcn_ds_swizzle, OverloadTypes: {},
984	Args: {Src, B.getInt32(C: Offset)});
985	if (OrigTy->isPointerTy())
986	return B.CreateIntToPtr(V: Result, DestTy: OrigTy);
987	if (OrigTy != I32Ty)
988	return B.CreateBitCast(V: Result, DestTy: OrigTy);
989	return Result;
990	}
991
992	/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
993	static Value createPermlane64(IRBuilderBase &B, Value Val) {
994	return B.CreateIntrinsic(ID: Intrinsic::amdgcn_permlane64, OverloadTypes: {Val->getType()},
995	Args: {Val});
996	}
997
998	/// Given a shuffle map, try to emit the best hardware intrinsic.
999	static Value matchShuffleToHWIntrinsic(IRBuilderBase &B, Value Src,
1000	ArrayRef<uint8_t> Ids,
1001	const GCNSubtarget &ST,
1002	const DataLayout &DL) {
1003	// Identity shuffle (every lane reads itself) folds to the source value.
1004	if (all_of(Range: enumerate(First&: Ids),
1005	P: [](const auto &E) { return E.value() == E.index(); }))
1006	return Src;
1007
1008	// Uniform shuffle (all lanes read the same value) is handled by cheaper
1009	// broadcast/readlane intrinsics.
1010	if (all_equal(Range&: Ids))
1011	return nullptr;
1012
1013	if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
1014	if (ST.hasDPP())
1015	return createUpdateDpp(B, Val: Src, Ctrl: *QP);
1016	return createDsSwizzle(B, Val: Src, Offset: AMDGPU::Swizzle::QUAD_PERM_ENC \| *QP, DL);
1017	}
1018
1019	if (ST.hasDPP()) {
1020	if (matchHalfRowMirrorPattern(Ids))
1021	return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_HALF_MIRROR);
1022	if (matchFullRowMirrorPattern(Ids))
1023	return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_MIRROR);
1024	if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
1025	return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_ROR_FIRST + *Amt - `1`);
1026	}
1027
1028	// row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
1029	if (ST.hasDPPRowShare()) {
1030	if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
1031	return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
1032	}
1033
1034	if (ST.hasDPP() && ST.hasGFX10Insts()) {
1035	if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
1036	return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
1037	}
1038
1039	if (ST.hasDPP8()) {
1040	if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
1041	return createMovDpp8(B, Val: Src, Selector: *Sel);
1042	}
1043
1044	if (ST.hasPermlane16Insts()) {
1045	if (isFullRowPattern(Ids)) {
1046	uint64_t Sel = computePermlane16Masks(Ids);
1047	return createPermlane16(B, Val: Src, Lo: Lo_32(Value: Sel), Hi: Hi_32(Value: Sel));
1048	}
1049	// Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
1050	if (isCrossRowPattern(Ids)) {
1051	uint64_t Sel = computePermlane16Masks(Ids);
1052	return createPermlaneX16(B, Val: Src, Lo: Lo_32(Value: Sel), Hi: Hi_32(Value: Sel));
1053	}
1054	}
1055
1056	// Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1057	// can be expressed as dst = ((src & AND) \| OR) ^ XOR with 5-bit masks. This
1058	// is available on every target that has ds_swizzle.
1059	if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1060	return createDsSwizzle(B, Val: Src, Offset: *Imm, DL);
1061
1062	// DS_SWIZZLE rotate mode (GFX9+): handles cyclic 32-lane rotations that
1063	// bitmask mode cannot express (e.g. +1 mod 32 requires inter-bit carry).
1064	if (ST.hasDsSwizzleRotateMode()) {
1065	if (std::optional<unsigned> Imm = matchDsSwizzleRotatePattern(Ids))
1066	return createDsSwizzle(B, Val: Src, Offset: *Imm, DL);
1067	}
1068
1069	if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1070	return createPermlane64(B, Val: Src);
1071
1072	return nullptr;
1073	}
1074
1075	/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1076	/// function of the lane ID into a hardware-specific lane permutation intrinsic.
1077	static std::optional<Instruction *>
1078	tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II,
1079	const GCNSubtarget &ST) {
1080	const DataLayout &DL = IC.getDataLayout();
1081	if (DL.getTypeSizeInBits(Ty: II.getType()) != `32`)
1082	return std::nullopt;
1083
1084	if (!ST.isWaveSizeKnown())
1085	return std::nullopt;
1086
1087	unsigned WaveSize = ST.getWavefrontSize();
1088	bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
1089	Value *Src = II.getArgOperand(i: IsBpermute ? `1` : `0`);
1090	Value *Index = II.getArgOperand(i: IsBpermute ? `0` : `1`);
1091
1092	SmallVector<uint8_t, `64`> Ids;
1093	if (IsBpermute) {
1094	Ids.resize(N: WaveSize);
1095	for (unsigned Lane : seq(Size: WaveSize)) {
1096	std::optional<unsigned> Val = evalLaneExpr(V: Index, Lane, ST, DL);
1097	if (!Val \|\| (Val & `3`) \|\| (Val >> `2`) >= WaveSize)
1098	return std::nullopt;
1099	Ids [Lane] = *Val >> `2`;
1100	}
1101	} else {
1102	if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1103	return std::nullopt;
1104	}
1105
1106	Value *Result = matchShuffleToHWIntrinsic(B&: IC.Builder, Src, Ids, ST, DL);
1107	if (!Result)
1108	return std::nullopt;
1109
1110	return IC.replaceInstUsesWith(I&: II, V: Result);
1111	}
1112	std::optional<Instruction *>
1113	GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
1114	Intrinsic::ID IID = II.getIntrinsicID();
1115	switch (IID) {
1116	case Intrinsic::amdgcn_implicitarg_ptr: {
1117	if (II.getFunction()->hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr"))
1118	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1119	uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(F: *II.getFunction());
1120
1121	uint64_t CurrentOrNullBytes =
1122	II.getAttributes().getRetDereferenceableOrNullBytes();
1123	if (CurrentOrNullBytes != `0`) {
1124	// Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1125	// into dereferenceable(max(A, B))
1126	uint64_t NewBytes = std::max(a: CurrentOrNullBytes, b: ImplicitArgBytes);
1127	II.addRetAttr(
1128	Attr: Attribute::getWithDereferenceableBytes(Context&: II.getContext(), Bytes: NewBytes));
1129	II.removeRetAttr(Kind: Attribute::DereferenceableOrNull);
1130	return &II;
1131	}
1132
1133	uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1134	uint64_t NewBytes = std::max(a: CurrentBytes, b: ImplicitArgBytes);
1135	if (NewBytes != CurrentBytes) {
1136	II.addRetAttr(
1137	Attr: Attribute::getWithDereferenceableBytes(Context&: II.getContext(), Bytes: NewBytes));
1138	return &II;
1139	}
1140
1141	return std::nullopt;
1142	}
1143	case Intrinsic::amdgcn_rcp: {
1144	Value *Src = II.getArgOperand(i: `0`);
1145	if (isa<PoisonValue>(Val: Src))
1146	return IC.replaceInstUsesWith(I&: II, V: Src);
1147
1148	// TODO: Move to ConstantFolding/InstSimplify?
1149	if (isa<UndefValue>(Val: Src)) {
1150	Type *Ty = II.getType();
1151	auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
1152	return IC.replaceInstUsesWith(I&: II, V: QNaN);
1153	}
1154
1155	if (II.isStrictFP())
1156	break;
1157
1158	if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
1159	const APFloat &ArgVal = C->getValueAPF();
1160	APFloat Val(ArgVal.getSemantics(), `1`);
1161	Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
1162
1163	// This is more precise than the instruction may give.
1164	//
1165	// TODO: The instruction always flushes denormal results (except for f16),
1166	// should this also?
1167	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
1168	}
1169
1170	FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
1171	if (!FMF.allowContract())
1172	break;
1173	auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
1174	if (!SrcCI)
1175	break;
1176
1177	auto IID = SrcCI->getIntrinsicID();
1178	// llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1179	//
1180	// llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1181	// relaxed.
1182	if (IID == Intrinsic::amdgcn_sqrt \|\| IID == Intrinsic::sqrt) {
1183	const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
1184	FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1185	if (!InnerFMF.allowContract() \|\| !SrcCI->hasOneUse())
1186	break;
1187
1188	if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1189	break;
1190
1191	Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1192	M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, OverloadTys: {SrcCI->getType()});
1193
1194	InnerFMF \|= FMF;
1195	II.setFastMathFlags(InnerFMF);
1196
1197	II.setCalledFunction(NewDecl);
1198	return IC.replaceOperand(I&: II, OpNum: `0`, V: SrcCI->getArgOperand(i: `0`));
1199	}
1200
1201	break;
1202	}
1203	case Intrinsic::amdgcn_sqrt:
1204	case Intrinsic::amdgcn_rsq:
1205	case Intrinsic::amdgcn_tanh: {
1206	Value *Src = II.getArgOperand(i: `0`);
1207	if (isa<PoisonValue>(Val: Src))
1208	return IC.replaceInstUsesWith(I&: II, V: Src);
1209
1210	// TODO: Move to ConstantFolding/InstSimplify?
1211	if (isa<UndefValue>(Val: Src)) {
1212	Type *Ty = II.getType();
1213	auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
1214	return IC.replaceInstUsesWith(I&: II, V: QNaN);
1215	}
1216
1217	// f16 amdgcn.sqrt is identical to regular sqrt.
1218	if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1219	Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1220	M: II.getModule(), id: Intrinsic::sqrt, OverloadTys: {II.getType()});
1221	II.setCalledFunction(NewDecl);
1222	return &II;
1223	}
1224
1225	break;
1226	}
1227	case Intrinsic::amdgcn_log:
1228	case Intrinsic::amdgcn_exp2: {
1229	const bool IsLog = IID == Intrinsic::amdgcn_log;
1230	const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1231	Value *Src = II.getArgOperand(i: `0`);
1232	Type *Ty = II.getType();
1233
1234	if (isa<PoisonValue>(Val: Src))
1235	return IC.replaceInstUsesWith(I&: II, V: Src);
1236
1237	if (IC.getSimplifyQuery().isUndefValue(V: Src))
1238	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
1239
1240	if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
1241	if (C->isInfinity()) {
1242	// exp2(+inf) -> +inf
1243	// log2(+inf) -> +inf
1244	if (!C->isNegative())
1245	return IC.replaceInstUsesWith(I&: II, V: C);
1246
1247	// exp2(-inf) -> 0
1248	if (IsExp && C->isNegative())
1249	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
1250	}
1251
1252	if (II.isStrictFP())
1253	break;
1254
1255	if (C->isNaN()) {
1256	Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
1257	return IC.replaceInstUsesWith(I&: II, V: Quieted);
1258	}
1259
1260	// f32 instruction doesn't handle denormals, f16 does.
1261	if (C->isZero() \|\| (C->getValue().isDenormal() && Ty->isFloatTy())) {
1262	Constant FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true*)
1263	: ConstantFP::get(Ty, V: `1.0`);
1264	return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
1265	}
1266
1267	if (IsLog && C->isNegative())
1268	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
1269
1270	// TODO: Full constant folding matching hardware behavior.
1271	}
1272
1273	break;
1274	}
1275	case Intrinsic::amdgcn_frexp_mant:
1276	case Intrinsic::amdgcn_frexp_exp: {
1277	Value *Src = II.getArgOperand(i: `0`);
1278	if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
1279	int Exp;
1280	APFloat Significand =
1281	frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
1282
1283	if (IID == Intrinsic::amdgcn_frexp_mant) {
1284	return IC.replaceInstUsesWith(
1285	I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
1286	}
1287
1288	// Match instruction special case behavior.
1289	if (Exp == APFloat::IEK_NaN \|\| Exp == APFloat::IEK_Inf)
1290	Exp = `0`;
1291
1292	return IC.replaceInstUsesWith(I&: II,
1293	V: ConstantInt::getSigned(Ty: II.getType(), V: Exp));
1294	}
1295
1296	if (isa<PoisonValue>(Val: Src))
1297	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1298
1299	if (isa<UndefValue>(Val: Src)) {
1300	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1301	}
1302
1303	break;
1304	}
1305	case Intrinsic::amdgcn_class: {
1306	Value *Src0 = II.getArgOperand(i: `0`);
1307	Value *Src1 = II.getArgOperand(i: `1`);
1308	const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
1309	if (CMask) {
1310	II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1311	M: II.getModule(), id: Intrinsic::is_fpclass, OverloadTys: Src0->getType()));
1312
1313	// Clamp any excess bits, as they're illegal for the generic intrinsic.
1314	II.setArgOperand(i: `1`, v: ConstantInt::get(Ty: Src1->getType(),
1315	V: CMask->getZExtValue() & fcAllFlags));
1316	return &II;
1317	}
1318
1319	// Propagate poison.
1320	if (isa<PoisonValue>(Val: Src0) \|\| isa<PoisonValue>(Val: Src1))
1321	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1322
1323	// llvm.amdgcn.class(_, undef) -> false
1324	if (IC.getSimplifyQuery().isUndefValue(V: Src1))
1325	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
1326
1327	// llvm.amdgcn.class(undef, mask) -> mask != 0
1328	if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
1329	Value *CmpMask = IC.Builder.CreateICmpNE(
1330	LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
1331	return IC.replaceInstUsesWith(I&: II, V: CmpMask);
1332	}
1333	break;
1334	}
1335	case Intrinsic::amdgcn_cvt_pkrtz: {
1336	auto foldFPTruncToF16RTZ = [](Value Arg) -> Value {
1337	Type *HalfTy = Type::getHalfTy(C&: Arg->getContext());
1338
1339	if (isa<PoisonValue>(Val: Arg))
1340	return PoisonValue::get(T: HalfTy);
1341	if (isa<UndefValue>(Val: Arg))
1342	return UndefValue::get(T: HalfTy);
1343
1344	ConstantFP CFP = nullptr*;
1345	if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
1346	bool LosesInfo;
1347	APFloat Val(CFP->getValueAPF());
1348	Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
1349	return ConstantFP::get(Ty: HalfTy, V: Val);
1350	}
1351
1352	Value Src = nullptr*;
1353	if (match(V: Arg, P: m_FPExt(Op: m_Value(V&: Src)))) {
1354	if (Src->getType()->isHalfTy())
1355	return Src;
1356	}
1357
1358	return nullptr;
1359	};
1360
1361	if (Value *Src0 = foldFPTruncToF16RTZ (II.getArgOperand(i: `0`))) {
1362	if (Value *Src1 = foldFPTruncToF16RTZ (II.getArgOperand(i: `1`))) {
1363	Value *V = PoisonValue::get(T: II.getType());
1364	V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src0, Idx: (uint64_t)`0`);
1365	V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src1, Idx: (uint64_t)`1`);
1366	return IC.replaceInstUsesWith(I&: II, V);
1367	}
1368	}
1369
1370	break;
1371	}
1372	case Intrinsic::amdgcn_cvt_pknorm_i16:
1373	case Intrinsic::amdgcn_cvt_pknorm_u16:
1374	case Intrinsic::amdgcn_cvt_pk_i16:
1375	case Intrinsic::amdgcn_cvt_pk_u16: {
1376	Value *Src0 = II.getArgOperand(i: `0`);
1377	Value *Src1 = II.getArgOperand(i: `1`);
1378
1379	// TODO: Replace call with scalar operation if only one element is poison.
1380	if (isa<PoisonValue>(Val: Src0) && isa<PoisonValue>(Val: Src1))
1381	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1382
1383	if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
1384	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1385	}
1386
1387	break;
1388	}
1389	case Intrinsic::amdgcn_cvt_off_f32_i4: {
1390	Value* Arg = II.getArgOperand(i: `0`);
1391	Type *Ty = II.getType();
1392
1393	if (isa<PoisonValue>(Val: Arg))
1394	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: Ty));
1395
1396	if(IC.getSimplifyQuery().isUndefValue(V: Arg))
1397	return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty));
1398
1399	ConstantInt *CArg = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `0`));
1400	if (!CArg)
1401	break;
1402
1403	// Tabulated 0.0625 (sext (CArg & 0xf)).*
1404	constexpr size_t ResValsSize = `16`;
1405	static constexpr float ResVals[ResValsSize] = {
1406	`0.0`, `0.0625`, `0.125`, `0.1875`, `0.25`, `0.3125`, `0.375`, `0.4375`,
1407	-`0.5`, -`0.4375`, -`0.375`, -`0.3125`, -`0.25`, -`0.1875`, -`0.125`, -`0.0625`};
1408	Constant *Res =
1409	ConstantFP::get(Ty, V: ResVals[CArg->getZExtValue() & (ResValsSize - `1`)]);
1410	return IC.replaceInstUsesWith(I&: II, V: Res);
1411	}
1412	case Intrinsic::amdgcn_ubfe:
1413	case Intrinsic::amdgcn_sbfe: {
1414	// Decompose simple cases into standard shifts.
1415	Value *Src = II.getArgOperand(i: `0`);
1416	if (isa<UndefValue>(Val: Src)) {
1417	return IC.replaceInstUsesWith(I&: II, V: Src);
1418	}
1419
1420	unsigned Width;
1421	Type *Ty = II.getType();
1422	unsigned IntSize = Ty->getIntegerBitWidth();
1423
1424	ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `2`));
1425	if (CWidth) {
1426	Width = CWidth->getZExtValue();
1427	if ((Width & (IntSize - `1`)) == `0`) {
1428	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
1429	}
1430
1431	// Hardware ignores high bits, so remove those.
1432	if (Width >= IntSize) {
1433	return IC.replaceOperand(
1434	I&: II, OpNum: `2`, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - `1`)));
1435	}
1436	}
1437
1438	unsigned Offset;
1439	ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `1`));
1440	if (COffset) {
1441	Offset = COffset->getZExtValue();
1442	if (Offset >= IntSize) {
1443	return IC.replaceOperand(
1444	I&: II, OpNum: `1`,
1445	V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - `1`)));
1446	}
1447	}
1448
1449	bool Signed = IID == Intrinsic::amdgcn_sbfe;
1450
1451	if (!CWidth \|\| !COffset)
1452	break;
1453
1454	// The case of Width == 0 is handled above, which makes this transformation
1455	// safe. If Width == 0, then the ashr and lshr instructions become poison
1456	// value since the shift amount would be equal to the bit size.
1457	assert(Width != `0`);
1458
1459	// TODO: This allows folding to undef when the hardware has specific
1460	// behavior?
1461	if (Offset + Width < IntSize) {
1462	Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
1463	Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
1464	: IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
1465	RightShift->takeName(V: &II);
1466	return IC.replaceInstUsesWith(I&: II, V: RightShift);
1467	}
1468
1469	Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
1470	: IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
1471
1472	RightShift->takeName(V: &II);
1473	return IC.replaceInstUsesWith(I&: II, V: RightShift);
1474	}
1475	case Intrinsic::amdgcn_exp:
1476	case Intrinsic::amdgcn_exp_row:
1477	case Intrinsic::amdgcn_exp_compr: {
1478	ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: `1`));
1479	unsigned EnBits = En->getZExtValue();
1480	if (EnBits == `0xf`)
1481	break; // All inputs enabled.
1482
1483	bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1484	bool Changed = false;
1485	for (int I = `0`; I < (IsCompr ? `2` : `4`); ++I) {
1486	if ((!IsCompr && (EnBits & (`1` << I)) == `0`) \|\|
1487	(IsCompr && ((EnBits & (`0x3` << (`2` * I))) == `0`))) {
1488	Value *Src = II.getArgOperand(i: I + `2`);
1489	if (!isa<PoisonValue>(Val: Src)) {
1490	IC.replaceOperand(I&: II, OpNum: I + `2`, V: PoisonValue::get(T: Src->getType()));
1491	Changed = true;
1492	}
1493	}
1494	}
1495
1496	if (Changed) {
1497	return &II;
1498	}
1499
1500	break;
1501	}
1502	case Intrinsic::amdgcn_fmed3: {
1503	Value *Src0 = II.getArgOperand(i: `0`);
1504	Value *Src1 = II.getArgOperand(i: `1`);
1505	Value *Src2 = II.getArgOperand(i: `2`);
1506
1507	for (Value *Src : {Src0, Src1, Src2}) {
1508	if (isa<PoisonValue>(Val: Src))
1509	return IC.replaceInstUsesWith(I&: II, V: Src);
1510	}
1511
1512	if (II.isStrictFP())
1513	break;
1514
1515	// med3 with a nan input acts like
1516	// v_min_f32(v_min_f32(s0, s1), s2)
1517	//
1518	// Signalingness is ignored with ieee=0, so we fold to
1519	// minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1520	// with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1521	// returned signaling nan will not be quieted.
1522
1523	// ieee=1
1524	// s0 snan: s2
1525	// s1 snan: s2
1526	// s2 snan: qnan
1527
1528	// s0 qnan: min(s1, s2)
1529	// s1 qnan: min(s0, s2)
1530	// s2 qnan: min(s0, s1)
1531
1532	// ieee=0
1533	// s0 _nan: min(s1, s2)
1534	// s1 _nan: min(s0, s2)
1535	// s2 _nan: min(s0, s1)
1536
1537	// med3 behavior with infinity
1538	// s0 +inf: max(s1, s2)
1539	// s1 +inf: max(s0, s2)
1540	// s2 +inf: max(s0, s1)
1541	// s0 -inf: min(s1, s2)
1542	// s1 -inf: min(s0, s2)
1543	// s2 -inf: min(s0, s1)
1544
1545	// Checking for NaN before canonicalization provides better fidelity when
1546	// mapping other operations onto fmed3 since the order of operands is
1547	// unchanged.
1548	Value V = nullptr*;
1549	const APFloat ConstSrc0 = nullptr*;
1550	const APFloat ConstSrc1 = nullptr*;
1551	const APFloat ConstSrc2 = nullptr*;
1552
1553	if ((match(V: Src0, P: m_APFloat(Res&: ConstSrc0)) &&
1554	(ConstSrc0->isNaN() \|\| ConstSrc0->isInfinity())) \|\|
1555	isa<UndefValue>(Val: Src0)) {
1556	const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1557	switch (fpenvIEEEMode(I: II)) {
1558	case KnownIEEEMode::On:
1559	// TODO: If Src2 is snan, does it need quieting?
1560	if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1561	return IC.replaceInstUsesWith(I&: II, V: Src2);
1562
1563	V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src1, RHS: Src2)
1564	: IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
1565	break;
1566	case KnownIEEEMode::Off:
1567	V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src1, RHS: Src2)
1568	: IC.Builder.CreateMinimumNum(LHS: Src1, RHS: Src2);
1569	break;
1570	case KnownIEEEMode::Unknown:
1571	break;
1572	}
1573	} else if ((match(V: Src1, P: m_APFloat(Res&: ConstSrc1)) &&
1574	(ConstSrc1->isNaN() \|\| ConstSrc1->isInfinity())) \|\|
1575	isa<UndefValue>(Val: Src1)) {
1576	const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1577	switch (fpenvIEEEMode(I: II)) {
1578	case KnownIEEEMode::On:
1579	// TODO: If Src2 is snan, does it need quieting?
1580	if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1581	return IC.replaceInstUsesWith(I&: II, V: Src2);
1582
1583	V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src2)
1584	: IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
1585	break;
1586	case KnownIEEEMode::Off:
1587	V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src2)
1588	: IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src2);
1589	break;
1590	case KnownIEEEMode::Unknown:
1591	break;
1592	}
1593	} else if ((match(V: Src2, P: m_APFloat(Res&: ConstSrc2)) &&
1594	(ConstSrc2->isNaN() \|\| ConstSrc2->isInfinity())) \|\|
1595	isa<UndefValue>(Val: Src2)) {
1596	switch (fpenvIEEEMode(I: II)) {
1597	case KnownIEEEMode::On:
1598	if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1599	auto *Quieted = ConstantFP::get(Ty: II.getType(), V: ConstSrc2->makeQuiet());
1600	return IC.replaceInstUsesWith(I&: II, V: Quieted);
1601	}
1602
1603	V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1604	? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1)
1605	: IC.Builder.CreateMinNum(LHS: Src0, RHS: Src1);
1606	break;
1607	case KnownIEEEMode::Off:
1608	V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1609	? IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src1)
1610	: IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src1);
1611	break;
1612	case KnownIEEEMode::Unknown:
1613	break;
1614	}
1615	}
1616
1617	if (V) {
1618	if (auto *CI = dyn_cast<CallInst>(Val: V)) {
1619	CI->copyFastMathFlags(I: &II);
1620	CI->takeName(V: &II);
1621	}
1622	return IC.replaceInstUsesWith(I&: II, V);
1623	}
1624
1625	bool Swap = false;
1626	// Canonicalize constants to RHS operands.
1627	//
1628	// fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1629	if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1630	std::swap(a&: Src0, b&: Src1);
1631	Swap = true;
1632	}
1633
1634	if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
1635	std::swap(a&: Src1, b&: Src2);
1636	Swap = true;
1637	}
1638
1639	if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1640	std::swap(a&: Src0, b&: Src1);
1641	Swap = true;
1642	}
1643
1644	if (Swap) {
1645	II.setArgOperand(i: `0`, v: Src0);
1646	II.setArgOperand(i: `1`, v: Src1);
1647	II.setArgOperand(i: `2`, v: Src2);
1648	return &II;
1649	}
1650
1651	if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
1652	if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
1653	if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
1654	APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
1655	Src2: C2->getValueAPF());
1656	return IC.replaceInstUsesWith(I&: II,
1657	V: ConstantFP::get(Ty: II.getType(), V: Result));
1658	}
1659	}
1660	}
1661
1662	if (!ST->hasMed3_16())
1663	break;
1664
1665	// Repeat floating-point width reduction done for minnum/maxnum.
1666	// fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1667	if (Value *X = matchFPExtFromF16(Arg: Src0)) {
1668	if (Value *Y = matchFPExtFromF16(Arg: Src1)) {
1669	if (Value *Z = matchFPExtFromF16(Arg: Src2)) {
1670	Value *NewCall = IC.Builder.CreateIntrinsic(
1671	ID: IID, OverloadTypes: {X->getType()}, Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
1672	return new FPExtInst (NewCall, II.getType());
1673	}
1674	}
1675	}
1676
1677	break;
1678	}
1679	case Intrinsic::amdgcn_icmp:
1680	case Intrinsic::amdgcn_fcmp: {
1681	const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: `2`));
1682	// Guard against invalid arguments.
1683	int64_t CCVal = CC->getZExtValue();
1684	bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1685	if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE \|\|
1686	CCVal > CmpInst::LAST_ICMP_PREDICATE)) \|\|
1687	(!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE \|\|
1688	CCVal > CmpInst::LAST_FCMP_PREDICATE)))
1689	break;
1690
1691	Value *Src0 = II.getArgOperand(i: `0`);
1692	Value *Src1 = II.getArgOperand(i: `1`);
1693
1694	if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
1695	if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
1696	Constant *CCmp = ConstantFoldCompareInstOperands(
1697	Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
1698	if (CCmp && CCmp->isNullValue()) {
1699	return IC.replaceInstUsesWith(
1700	I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
1701	}
1702
1703	// The result of V_ICMP/V_FCMP assembly instructions (which this
1704	// intrinsic exposes) is one bit per thread, masked with the EXEC
1705	// register (which contains the bitmask of live threads). So a
1706	// comparison that always returns true is the same as a read of the
1707	// EXEC register. ballot(true) reads EXEC at the wave-size width, so
1708	// zext/trunc the result to the intrinsic's return type.
1709	Type *WaveTy = IC.Builder.getIntNTy(N: ST->getWavefrontSize());
1710	Value *Ballot = IC.Builder.CreateIntrinsic(
1711	ID: Intrinsic::amdgcn_ballot, OverloadTypes: WaveTy, Args: IC.Builder.getTrue());
1712	Value *Result = IC.Builder.CreateZExtOrTrunc(V: Ballot, DestTy: II.getType());
1713	return IC.replaceInstUsesWith(I&: II, V: Result);
1714	}
1715
1716	// Canonicalize constants to RHS.
1717	CmpInst::Predicate SwapPred =
1718	CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
1719	II.setArgOperand(i: `0`, v: Src1);
1720	II.setArgOperand(i: `1`, v: Src0);
1721	II.setArgOperand(
1722	i: `2`, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
1723	return &II;
1724	}
1725
1726	if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1727	break;
1728
1729	// Canonicalize compare eq with true value to compare != 0
1730	// llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1731	// -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1732	// llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1733	// -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1734	Value *ExtSrc;
1735	if (CCVal == CmpInst::ICMP_EQ &&
1736	((match(V: Src1, P: PatternMatch::m_One()) &&
1737	match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) \|\|
1738	(match(V: Src1, P: PatternMatch::m_AllOnes()) &&
1739	match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
1740	ExtSrc->getType()->isIntegerTy(BitWidth: `1`)) {
1741	IC.replaceOperand(I&: II, OpNum: `1`, V: ConstantInt::getNullValue(Ty: Src1->getType()));
1742	IC.replaceOperand(I&: II, OpNum: `2`,
1743	V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
1744	return &II;
1745	}
1746
1747	CmpPredicate SrcPred;
1748	Value *SrcLHS;
1749	Value *SrcRHS;
1750
1751	// Fold compare eq/ne with 0 from a compare result as the predicate to the
1752	// intrinsic. The typical use is a wave vote function in the library, which
1753	// will be fed from a user code condition compared with 0. Fold in the
1754	// redundant compare.
1755
1756	// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1757	// -> llvm.amdgcn.[if]cmp(a, b, pred)
1758	//
1759	// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1760	// -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1761	if (match(V: Src1, P: PatternMatch::m_Zero()) &&
1762	match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
1763	Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
1764	R: PatternMatch::m_Value(V&: SrcRHS))))) {
1765	if (CCVal == CmpInst::ICMP_EQ)
1766	SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
1767
1768	Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
1769	? Intrinsic::amdgcn_fcmp
1770	: Intrinsic::amdgcn_icmp;
1771
1772	Type *Ty = SrcLHS->getType();
1773	if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
1774	// Promote to next legal integer type.
1775	unsigned Width = CmpType->getBitWidth();
1776	unsigned NewWidth = Width;
1777
1778	// Don't do anything for i1 comparisons.
1779	if (Width == `1`)
1780	break;
1781
1782	if (Width <= `16`)
1783	NewWidth = `16`;
1784	else if (Width <= `32`)
1785	NewWidth = `32`;
1786	else if (Width <= `64`)
1787	NewWidth = `64`;
1788	else
1789	break; // Can't handle this.
1790
1791	if (Width != NewWidth) {
1792	IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
1793	if (CmpInst::isSigned(Pred: SrcPred)) {
1794	SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
1795	SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
1796	} else {
1797	SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
1798	SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
1799	}
1800	}
1801	} else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1802	break;
1803
1804	Value *Args[] = {SrcLHS, SrcRHS,
1805	ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
1806	Value *NewCall = IC.Builder.CreateIntrinsic(
1807	ID: NewIID, OverloadTypes: {II.getType(), SrcLHS->getType()}, Args);
1808	NewCall->takeName(V: &II);
1809	return IC.replaceInstUsesWith(I&: II, V: NewCall);
1810	}
1811
1812	break;
1813	}
1814	case Intrinsic::amdgcn_mbcnt_hi:
1815	// exec_hi is all 0, so this is just a copy.
1816	if (ST->isWave32())
1817	return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: `1`));
1818	[[fallthrough]];
1819	case Intrinsic::amdgcn_mbcnt_lo: {
1820	ConstantRange AccRange =
1821	computeConstantRange(V: II.getArgOperand(i: `1`),
1822	/ForSigned=/false, SQ: IC.getSimplifyQuery());
1823	if (AccRange.isFullSet())
1824	return nullptr;
1825
1826	// TODO: Can raise lower bound by inspecting first argument.
1827	ConstantRange MbcntRange(APInt (`32`, `0`), APInt (`32`, `32` + `1`));
1828	ConstantRange ComputedRange = AccRange.add(Other: MbcntRange);
1829	if (ComputedRange.isFullSet())
1830	return nullptr;
1831
1832	if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1833	ComputedRange = ComputedRange.intersectWith(CR: *ExistingRange);
1834	if (ComputedRange == *ExistingRange)
1835	return nullptr;
1836	}
1837
1838	II.addRangeRetAttr(CR: ComputedRange);
1839	return nullptr;
1840	}
1841	case Intrinsic::amdgcn_ballot: {
1842	Value *Arg = II.getArgOperand(i: `0`);
1843	if (isa<PoisonValue>(Val: Arg))
1844	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1845
1846	if (auto *Src = dyn_cast<ConstantInt>(Val: Arg)) {
1847	if (Src->isZero()) {
1848	// amdgcn.ballot(i1 0) is zero.
1849	return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1850	}
1851	}
1852	if (ST->isWave32() && II.getType()->getIntegerBitWidth() == `64`) {
1853	// %b64 = call i64 ballot.i64(...)
1854	// =>
1855	// %b32 = call i32 ballot.i32(...)
1856	// %b64 = zext i32 %b32 to i64
1857	Value *Call = IC.Builder.CreateZExt(
1858	V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
1859	OverloadTypes: {IC.Builder.getInt32Ty()},
1860	Args: {II.getArgOperand(i: `0`)}),
1861	DestTy: II.getType());
1862	Call->takeName(V: &II);
1863	return IC.replaceInstUsesWith(I&: II, V: Call);
1864	}
1865	break;
1866	}
1867	case Intrinsic::amdgcn_wavefrontsize: {
1868	if (ST->isWaveSizeKnown())
1869	return IC.replaceInstUsesWith(
1870	I&: II, V: ConstantInt::get(Ty: II.getType(), V: ST->getWavefrontSize()));
1871	break;
1872	}
1873	case Intrinsic::amdgcn_wqm_vote: {
1874	// wqm_vote is identity when the argument is constant.
1875	if (!isa<Constant>(Val: II.getArgOperand(i: `0`)))
1876	break;
1877
1878	return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: `0`));
1879	}
1880	case Intrinsic::amdgcn_kill: {
1881	const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `0`));
1882	if (!C \|\| !C->getZExtValue())
1883	break;
1884
1885	// amdgcn.kill(i1 1) is a no-op
1886	return IC.eraseInstFromFunction(I&: II);
1887	}
1888	case Intrinsic::amdgcn_s_sendmsg:
1889	case Intrinsic::amdgcn_s_sendmsghalt: {
1890	// The second operand is copied to m0, but is only actually used for
1891	// certain message types. For message types that are known to not use m0,
1892	// fold it to poison.
1893	using namespace AMDGPU::SendMsg;
1894
1895	Value *M0Val = II.getArgOperand(i: `1`);
1896	if (isa<PoisonValue>(Val: M0Val))
1897	break;
1898
1899	auto *MsgImm = cast<ConstantInt>(Val: II.getArgOperand(i: `0`));
1900	uint16_t MsgId, OpId, StreamId;
1901	decodeMsg(Val: MsgImm->getZExtValue(), MsgId, OpId, StreamId, STI: *ST);
1902
1903	if (!msgDoesNotUseM0(MsgId, STI: *ST))
1904	break;
1905
1906	// Drop UB-implying attributes since we're replacing with poison.
1907	II.dropUBImplyingAttrsAndMetadata();
1908	IC.replaceOperand(I&: II, OpNum: `1`, V: PoisonValue::get(T: M0Val->getType()));
1909	return nullptr;
1910	}
1911	case Intrinsic::amdgcn_update_dpp: {
1912	Value *Old = II.getArgOperand(i: `0`);
1913
1914	auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: `5`));
1915	auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: `3`));
1916	auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: `4`));
1917	if (BC->isNullValue() \|\| RM->getZExtValue() != `0xF` \|\|
1918	BM->getZExtValue() != `0xF` \|\| isa<PoisonValue>(Val: Old))
1919	break;
1920
1921	// If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1922	return IC.replaceOperand(I&: II, OpNum: `0`, V: PoisonValue::get(T: Old->getType()));
1923	}
1924	case Intrinsic::amdgcn_permlane16:
1925	case Intrinsic::amdgcn_permlane16_var:
1926	case Intrinsic::amdgcn_permlanex16:
1927	case Intrinsic::amdgcn_permlanex16_var: {
1928	// Discard vdst_in if it's not going to be read.
1929	Value *VDstIn = II.getArgOperand(i: `0`);
1930	if (isa<PoisonValue>(Val: VDstIn))
1931	break;
1932
1933	// FetchInvalid operand idx.
1934	unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 \|\|
1935	IID == Intrinsic::amdgcn_permlanex16)
1936	? `4` / for permlane16 and permlanex16 /
1937	: `3`; / for permlane16_var and permlanex16_var /
1938
1939	// BoundCtrl operand idx.
1940	// For permlane16 and permlanex16 it should be 5
1941	// For Permlane16_var and permlanex16_var it should be 4
1942	unsigned int BcIdx = FiIdx + `1`;
1943
1944	ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1945	ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1946	if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1947	break;
1948
1949	return IC.replaceOperand(I&: II, OpNum: `0`, V: PoisonValue::get(T: VDstIn->getType()));
1950	}
1951	case Intrinsic::amdgcn_wave_shuffle:
1952	return tryOptimizeShufflePattern(IC, II, ST: *ST);
1953	case Intrinsic::amdgcn_permlane64:
1954	case Intrinsic::amdgcn_readfirstlane:
1955	case Intrinsic::amdgcn_readlane:
1956	case Intrinsic::amdgcn_ds_bpermute: {
1957	// If the data argument is uniform these intrinsics return it unchanged.
1958	unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? `1` : `0`;
1959	const Use &Src = II.getArgOperandUse(i: SrcIdx);
1960	if (isTriviallyUniform(U: Src))
1961	return IC.replaceInstUsesWith(I&: II, V: Src.get());
1962
1963	if (IID == Intrinsic::amdgcn_readlane &&
1964	simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: `1`))
1965	return &II;
1966
1967	// If the lane argument of bpermute is uniform, change it to readlane. This
1968	// generates better code and can enable further optimizations because
1969	// readlane is AlwaysUniform.
1970	if (IID == Intrinsic::amdgcn_ds_bpermute) {
1971	const Use &Lane = II.getArgOperandUse(i: `0`);
1972	if (isTriviallyUniform(U: Lane)) {
1973	Value *NewLane = IC.Builder.CreateLShr(LHS: Lane, RHS: `2`);
1974	Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1975	M: II.getModule(), id: Intrinsic::amdgcn_readlane, OverloadTys: II.getType());
1976	II.setCalledFunction(NewDecl);
1977	II.setOperand(i_nocapture: `0`, Val_nocapture: Src);
1978	II.setOperand(i_nocapture: `1`, Val_nocapture: NewLane);
1979	return &II;
1980	}
1981	}
1982
1983	if (IID == Intrinsic::amdgcn_ds_bpermute)
1984	return tryOptimizeShufflePattern(IC, II, ST: *ST);
1985
1986	if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1987	return Res;
1988
1989	return std::nullopt;
1990	}
1991	case Intrinsic::amdgcn_writelane: {
1992	// TODO: Fold bitcast like readlane.
1993	if (simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: `1`))
1994	return &II;
1995	return std::nullopt;
1996	}
1997	case Intrinsic::amdgcn_trig_preop: {
1998	// The intrinsic is declared with name mangling, but currently the
1999	// instruction only exists for f64
2000	if (!II.getType()->isDoubleTy())
2001	break;
2002
2003	Value *Src = II.getArgOperand(i: `0`);
2004	Value *Segment = II.getArgOperand(i: `1`);
2005	if (isa<PoisonValue>(Val: Src) \|\| isa<PoisonValue>(Val: Segment))
2006	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
2007
2008	if (isa<UndefValue>(Val: Segment))
2009	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
2010
2011	// Sign bit is not used.
2012	Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Val: Src);
2013	if (StrippedSign != Src)
2014	return IC.replaceOperand(I&: II, OpNum: `0`, V: StrippedSign);
2015
2016	if (II.isStrictFP())
2017	break;
2018
2019	const ConstantFP *CSrc = dyn_cast<ConstantFP>(Val: Src);
2020	if (!CSrc && !isa<UndefValue>(Val: Src))
2021	break;
2022
2023	// The instruction ignores special cases, and literally just extracts the
2024	// exponents. Fold undef to nan, and index the table as normal.
2025	APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
2026	: APFloat::getQNaN(Sem: II.getType()->getFltSemantics())
2027	.bitcastToAPInt();
2028
2029	const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
2030	if (!Cseg) {
2031	if (isa<UndefValue>(Val: Src))
2032	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
2033	break;
2034	}
2035
2036	unsigned Exponent = FSrcInt.extractBitsAsZExtValue(numBits: `11`, bitPosition: `52`);
2037	unsigned SegmentVal = Cseg->getValue().trunc(width: `5`).getZExtValue();
2038	unsigned Shift = SegmentVal * `53`;
2039	if (Exponent > `1077`)
2040	Shift += Exponent - `1077`;
2041
2042	// 2.0/PI table.
2043	static const uint32_t TwoByPi[] = {
2044	`0xa2f9836e`, `0x4e441529`, `0xfc2757d1`, `0xf534ddc0`, `0xdb629599`, `0x3c439041`,
2045	`0xfe5163ab`, `0xdebbc561`, `0xb7246e3a`, `0x424dd2e0`, `0x06492eea`, `0x09d1921c`,
2046	`0xfe1deb1c`, `0xb129a73e`, `0xe88235f5`, `0x2ebb4484`, `0xe99c7026`, `0xb45f7e41`,
2047	`0x3991d639`, `0x835339f4`, `0x9c845f8b`, `0xbdf9283b`, `0x1ff897ff`, `0xde05980f`,
2048	`0xef2f118b`, `0x5a0a6d1f`, `0x6d367ecf`, `0x27cb09b7`, `0x4f463f66`, `0x9e5fea2d`,
2049	`0x7527bac7`, `0xebe5f17b`, `0x3d0739f7`, `0x8a5292ea`, `0x6bfb5fb1`, `0x1f8d5d08`,
2050	`0x56033046`};
2051
2052	// Return 0 for outbound segment (hardware behavior).
2053	unsigned Idx = Shift >> `5`;
2054	if (Idx + `2` >= std::size(TwoByPi)) {
2055	APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
2056	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
2057	}
2058
2059	unsigned BShift = Shift & `0x1f`;
2060	uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + `1`]);
2061	uint64_t Tlo = Make_64(High: TwoByPi[Idx + `2`], Low: `0`);
2062	if (BShift)
2063	Thi = (Thi << BShift) \| (Tlo >> (`64` - BShift));
2064	Thi = Thi >> `11`;
2065	APFloat Result = APFloat ((double)Thi);
2066
2067	int Scale = -`53` - Shift;
2068	if (Exponent >= `1968`)
2069	Scale += `128`;
2070
2071	Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
2072	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
2073	}
2074	case Intrinsic::amdgcn_fmul_legacy: {
2075	Value *Op0 = II.getArgOperand(i: `0`);
2076	Value *Op1 = II.getArgOperand(i: `1`);
2077
2078	for (Value *Src : {Op0, Op1}) {
2079	if (isa<PoisonValue>(Val: Src))
2080	return IC.replaceInstUsesWith(I&: II, V: Src);
2081	}
2082
2083	// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2084	// infinity, gives +0.0.
2085	// TODO: Move to InstSimplify?
2086	if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) \|\|
2087	match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
2088	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
2089
2090	// If we can prove we don't have one of the special cases then we can use a
2091	// normal fmul instruction instead.
2092	if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
2093	auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
2094	FMul->takeName(V: &II);
2095	return IC.replaceInstUsesWith(I&: II, V: FMul);
2096	}
2097	break;
2098	}
2099	case Intrinsic::amdgcn_fma_legacy: {
2100	Value *Op0 = II.getArgOperand(i: `0`);
2101	Value *Op1 = II.getArgOperand(i: `1`);
2102	Value *Op2 = II.getArgOperand(i: `2`);
2103
2104	for (Value *Src : {Op0, Op1, Op2}) {
2105	if (isa<PoisonValue>(Val: Src))
2106	return IC.replaceInstUsesWith(I&: II, V: Src);
2107	}
2108
2109	// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2110	// infinity, gives +0.0.
2111	// TODO: Move to InstSimplify?
2112	if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) \|\|
2113	match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
2114	// It's tempting to just return Op2 here, but that would give the wrong
2115	// result if Op2 was -0.0.
2116	auto *Zero = ConstantFP::getZero(Ty: II.getType());
2117	auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
2118	FAdd->takeName(V: &II);
2119	return IC.replaceInstUsesWith(I&: II, V: FAdd);
2120	}
2121
2122	// If we can prove we don't have one of the special cases then we can use a
2123	// normal fma instead.
2124	if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
2125	II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2126	M: II.getModule(), id: Intrinsic::fma, OverloadTys: II.getType()));
2127	return &II;
2128	}
2129	break;
2130	}
2131	case Intrinsic::amdgcn_is_shared:
2132	case Intrinsic::amdgcn_is_private: {
2133	Value *Src = II.getArgOperand(i: `0`);
2134	if (isa<PoisonValue>(Val: Src))
2135	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
2136	if (isa<UndefValue>(Val: Src))
2137	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
2138
2139	if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: `0`)))
2140	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
2141	break;
2142	}
2143	case Intrinsic::amdgcn_make_buffer_rsrc: {
2144	Value *Src = II.getArgOperand(i: `0`);
2145	if (isa<PoisonValue>(Val: Src))
2146	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
2147	return std::nullopt;
2148	}
2149	case Intrinsic::amdgcn_raw_buffer_store_format:
2150	case Intrinsic::amdgcn_struct_buffer_store_format:
2151	case Intrinsic::amdgcn_raw_tbuffer_store:
2152	case Intrinsic::amdgcn_struct_tbuffer_store:
2153	case Intrinsic::amdgcn_image_store_1d:
2154	case Intrinsic::amdgcn_image_store_1darray:
2155	case Intrinsic::amdgcn_image_store_2d:
2156	case Intrinsic::amdgcn_image_store_2darray:
2157	case Intrinsic::amdgcn_image_store_2darraymsaa:
2158	case Intrinsic::amdgcn_image_store_2dmsaa:
2159	case Intrinsic::amdgcn_image_store_3d:
2160	case Intrinsic::amdgcn_image_store_cube:
2161	case Intrinsic::amdgcn_image_store_mip_1d:
2162	case Intrinsic::amdgcn_image_store_mip_1darray:
2163	case Intrinsic::amdgcn_image_store_mip_2d:
2164	case Intrinsic::amdgcn_image_store_mip_2darray:
2165	case Intrinsic::amdgcn_image_store_mip_3d:
2166	case Intrinsic::amdgcn_image_store_mip_cube: {
2167	if (!isa<FixedVectorType>(Val: II.getArgOperand(i: `0`)->getType()))
2168	break;
2169
2170	APInt DemandedElts;
2171	if (ST->hasDefaultComponentBroadcast())
2172	DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: `0`));
2173	else if (ST->hasDefaultComponentZero())
2174	DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: `0`), I: &II);
2175	else
2176	break;
2177
2178	int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? `1` : -`1`;
2179	if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2180	IsLoad: false)) {
2181	return IC.eraseInstFromFunction(I&: II);
2182	}
2183
2184	break;
2185	}
2186	case Intrinsic::amdgcn_prng_b32: {
2187	auto *Src = II.getArgOperand(i: `0`);
2188	if (isa<UndefValue>(Val: Src)) {
2189	return IC.replaceInstUsesWith(I&: II, V: Src);
2190	}
2191	return std::nullopt;
2192	}
2193	case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2194	case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2195	Value *Src0 = II.getArgOperand(i: `0`);
2196	Value *Src1 = II.getArgOperand(i: `1`);
2197	uint64_t CBSZ = cast<ConstantInt>(Val: II.getArgOperand(i: `3`))->getZExtValue();
2198	uint64_t BLGP = cast<ConstantInt>(Val: II.getArgOperand(i: `4`))->getZExtValue();
2199	auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
2200	auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
2201
2202	auto getFormatNumRegs = [](unsigned FormatVal) {
2203	switch (FormatVal) {
2204	case AMDGPU::MFMAScaleFormats::FP6_E2M3:
2205	case AMDGPU::MFMAScaleFormats::FP6_E3M2:
2206	return `6u`;
2207	case AMDGPU::MFMAScaleFormats::FP4_E2M1:
2208	return `4u`;
2209	case AMDGPU::MFMAScaleFormats::FP8_E4M3:
2210	case AMDGPU::MFMAScaleFormats::FP8_E5M2:
2211	return `8u`;
2212	default:
2213	llvm_unreachable("invalid format value");
2214	}
2215	};
2216
2217	bool MadeChange = false;
2218	unsigned Src0NumElts = getFormatNumRegs (CBSZ);
2219	unsigned Src1NumElts = getFormatNumRegs (BLGP);
2220
2221	// Depending on the used format, fewer registers are required so shrink the
2222	// vector type.
2223	if (Src0Ty->getNumElements() > Src0NumElts) {
2224	Src0 = IC.Builder.CreateExtractVector(
2225	DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
2226	Idx: uint64_t(`0`));
2227	MadeChange = true;
2228	}
2229
2230	if (Src1Ty->getNumElements() > Src1NumElts) {
2231	Src1 = IC.Builder.CreateExtractVector(
2232	DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
2233	Idx: uint64_t(`0`));
2234	MadeChange = true;
2235	}
2236
2237	if (!MadeChange)
2238	return std::nullopt;
2239
2240	SmallVector<Value *, `10`> Args(II.args());
2241	Args [`0`] = Src0;
2242	Args [`1`] = Src1;
2243
2244	Value *NewII = IC.Builder.CreateIntrinsic(
2245	ID: IID, OverloadTypes: {Src0->getType(), Src1->getType()}, Args, FMFSource: &II);
2246	NewII->takeName(V: &II);
2247	return IC.replaceInstUsesWith(I&: II, V: NewII);
2248	}
2249	case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2250	case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2251	case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2252	Value *Src0 = II.getArgOperand(i: `1`);
2253	Value *Src1 = II.getArgOperand(i: `3`);
2254	unsigned FmtA = cast<ConstantInt>(Val: II.getArgOperand(i: `0`))->getZExtValue();
2255	uint64_t FmtB = cast<ConstantInt>(Val: II.getArgOperand(i: `2`))->getZExtValue();
2256	auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
2257	auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
2258
2259	bool MadeChange = false;
2260	unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtA);
2261	unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtB);
2262
2263	// Depending on the used format, fewer registers are required so shrink the
2264	// vector type.
2265	if (Src0Ty->getNumElements() > Src0NumElts) {
2266	Src0 = IC.Builder.CreateExtractVector(
2267	DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
2268	Idx: IC.Builder.getInt64(C: `0`));
2269	MadeChange = true;
2270	}
2271
2272	if (Src1Ty->getNumElements() > Src1NumElts) {
2273	Src1 = IC.Builder.CreateExtractVector(
2274	DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
2275	Idx: IC.Builder.getInt64(C: `0`));
2276	MadeChange = true;
2277	}
2278
2279	if (!MadeChange)
2280	return std::nullopt;
2281
2282	SmallVector<Value *, `13`> Args(II.args());
2283	Args [`1`] = Src0;
2284	Args [`3`] = Src1;
2285
2286	Value *NewII = IC.Builder.CreateIntrinsic(
2287	ID: IID, OverloadTypes: {II.getArgOperand(i: `5`)->getType(), Src0->getType(), Src1->getType()},
2288	Args, FMFSource: &II);
2289	NewII->takeName(V: &II);
2290	return IC.replaceInstUsesWith(I&: II, V: NewII);
2291	}
2292	}
2293	if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2294	AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
2295	return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2296	}
2297	return std::nullopt;
2298	}
2299
2300	/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2301	///
2302	/// The result of simplifying amdgcn image and buffer store intrinsics is updating
2303	/// definitions of the intrinsics vector argument, not Uses of the result like
2304	/// image and buffer loads.
2305	/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
2306	/// struct returns.
2307	static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
2308	IntrinsicInst &II,
2309	APInt DemandedElts,
2310	int DMaskIdx, bool IsLoad) {
2311
2312	auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
2313	: II.getOperand(i_nocapture: `0`)->getType());
2314	unsigned VWidth = IIVTy->getNumElements();
2315	if (VWidth == `1`)
2316	return nullptr;
2317	Type *EltTy = IIVTy->getElementType();
2318
2319	IRBuilderBase::InsertPointGuard Guard(IC.Builder);
2320	IC.Builder.SetInsertPoint(&II);
2321
2322	// Assume the arguments are unchanged and later override them, if needed.
2323	SmallVector<Value *, `16`> Args(II.args());
2324
2325	if (DMaskIdx < `0`) {
2326	// Buffer case.
2327
2328	const unsigned ActiveBits = DemandedElts.getActiveBits();
2329	const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2330
2331	// Start assuming the prefix of elements is demanded, but possibly clear
2332	// some other bits if there are trailing zeros (unused components at front)
2333	// and update offset.
2334	DemandedElts = (`1` << ActiveBits) - `1`;
2335
2336	if (UnusedComponentsAtFront > `0`) {
2337	static const unsigned InvalidOffsetIdx = `0xf`;
2338
2339	unsigned OffsetIdx;
2340	switch (II.getIntrinsicID()) {
2341	case Intrinsic::amdgcn_raw_buffer_load:
2342	case Intrinsic::amdgcn_raw_ptr_buffer_load:
2343	OffsetIdx = `1`;
2344	break;
2345	case Intrinsic::amdgcn_s_buffer_load:
2346	// If resulting type is vec3, there is no point in trimming the
2347	// load with updated offset, as the vec3 would most likely be widened to
2348	// vec4 anyway during lowering.
2349	if (ActiveBits == `4` && UnusedComponentsAtFront == `1`)
2350	OffsetIdx = InvalidOffsetIdx;
2351	else
2352	OffsetIdx = `1`;
2353	break;
2354	case Intrinsic::amdgcn_struct_buffer_load:
2355	case Intrinsic::amdgcn_struct_ptr_buffer_load:
2356	OffsetIdx = `2`;
2357	break;
2358	default:
2359	// TODO: handle tbuffer intrinsics.*
2360	OffsetIdx = InvalidOffsetIdx;
2361	break;
2362	}
2363
2364	if (OffsetIdx != InvalidOffsetIdx) {
2365	// Clear demanded bits and update the offset.
2366	DemandedElts &= ~((`1` << UnusedComponentsAtFront) - `1`);
2367	auto *Offset = Args [OffsetIdx];
2368	unsigned SingleComponentSizeInBits =
2369	IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
2370	unsigned OffsetAdd =
2371	UnusedComponentsAtFront * SingleComponentSizeInBits / `8`;
2372	auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
2373	Args [OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
2374	}
2375	}
2376	} else {
2377	// Image case.
2378
2379	ConstantInt *DMask = cast<ConstantInt>(Val: Args [DMaskIdx]);
2380	unsigned DMaskVal = DMask->getZExtValue() & `0xf`;
2381
2382	// dmask 0 has special semantics, do not simplify.
2383	if (DMaskVal == `0`)
2384	return nullptr;
2385
2386	// Mask off values that are undefined because the dmask doesn't cover them
2387	DemandedElts &= (`1` << llvm::popcount(Value: DMaskVal)) - `1`;
2388
2389	unsigned NewDMaskVal = `0`;
2390	unsigned OrigLdStIdx = `0`;
2391	for (unsigned SrcIdx = `0`; SrcIdx < `4`; ++SrcIdx) {
2392	const unsigned Bit = `1` << SrcIdx;
2393	if (!!(DMaskVal & Bit)) {
2394	if (!!DemandedElts [OrigLdStIdx])
2395	NewDMaskVal \|= Bit;
2396	OrigLdStIdx++;
2397	}
2398	}
2399
2400	if (DMaskVal != NewDMaskVal)
2401	Args [DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
2402	}
2403
2404	unsigned NewNumElts = DemandedElts.popcount();
2405	if (!NewNumElts)
2406	return PoisonValue::get(T: IIVTy);
2407
2408	if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2409	if (DMaskIdx >= `0`)
2410	II.setArgOperand(i: DMaskIdx, v: Args [DMaskIdx]);
2411	return nullptr;
2412	}
2413
2414	// Validate function argument and return types, extracting overloaded types
2415	// along the way.
2416	SmallVector<Type *, `6`> OverloadTys;
2417	if (!Intrinsic::isSignatureValid(F: II.getCalledFunction(), OverloadTys))
2418	return nullptr;
2419
2420	Type *NewTy =
2421	(NewNumElts == `1`) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
2422	OverloadTys [`0`] = NewTy;
2423
2424	if (!IsLoad) {
2425	SmallVector<int, `8`> EltMask;
2426	for (unsigned OrigStoreIdx = `0`; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2427	if (DemandedElts [OrigStoreIdx])
2428	EltMask.push_back(Elt: OrigStoreIdx);
2429
2430	if (NewNumElts == `1`)
2431	Args [`0`] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: `0`), Idx: EltMask [`0`]);
2432	else
2433	Args [`0`] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: `0`), Mask: EltMask);
2434	}
2435
2436	CallInst *NewCall = IC.Builder.CreateIntrinsicWithoutFolding(
2437	ID: II.getIntrinsicID(), OverloadTypes: OverloadTys, Args);
2438	NewCall->takeName(V: &II);
2439	NewCall->copyMetadata(SrcInst: II);
2440	AttributeList OldAttrList = II.getAttributes();
2441	NewCall->setAttributes(OldAttrList);
2442
2443	if (IsLoad) {
2444	if (NewNumElts == `1`) {
2445	return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
2446	Idx: DemandedElts.countr_zero());
2447	}
2448
2449	SmallVector<int, `8`> EltMask;
2450	unsigned NewLoadIdx = `0`;
2451	for (unsigned OrigLoadIdx = `0`; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2452	if (!!DemandedElts [OrigLoadIdx])
2453	EltMask.push_back(Elt: NewLoadIdx++);
2454	else
2455	EltMask.push_back(Elt: NewNumElts);
2456	}
2457
2458	auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
2459
2460	return Shuffle;
2461	}
2462
2463	return NewCall;
2464	}
2465
2466	Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
2467	InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2468	APInt &UndefElts) const {
2469	auto *VT = dyn_cast<FixedVectorType>(Val: II.getType());
2470	if (!VT)
2471	return nullptr;
2472
2473	const unsigned FirstElt = DemandedElts.countr_zero();
2474	const unsigned LastElt = DemandedElts.getActiveBits() - `1`;
2475	const unsigned MaskLen = LastElt - FirstElt + `1`;
2476
2477	unsigned OldNumElts = VT->getNumElements();
2478	if (MaskLen == OldNumElts && MaskLen != `1`)
2479	return nullptr;
2480
2481	Type *EltTy = VT->getElementType();
2482	Type *NewVT = MaskLen == `1` ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: MaskLen);
2483
2484	// Theoretically we should support these intrinsics for any legal type. Avoid
2485	// introducing cases that aren't direct register types like v3i16.
2486	if (!isTypeLegal(Ty: NewVT))
2487	return nullptr;
2488
2489	Value *Src = II.getArgOperand(i: `0`);
2490
2491	// Make sure convergence tokens are preserved.
2492	// TODO: CreateIntrinsic should allow directly copying bundles
2493	SmallVector<OperandBundleDef, `2`> OpBundles;
2494	II.getOperandBundlesAsDefs(Defs&: OpBundles);
2495
2496	Module *M = IC.Builder.GetInsertBlock()->getModule();
2497	Function *Remangled =
2498	Intrinsic::getOrInsertDeclaration(M, id: II.getIntrinsicID(), OverloadTys: {NewVT});
2499
2500	if (MaskLen == `1`) {
2501	Value *Extract = IC.Builder.CreateExtractElement(Vec: Src, Idx: FirstElt);
2502
2503	// TODO: Preserve callsite attributes?
2504	CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2505
2506	return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: II.getType()),
2507	NewElt: NewCall, Idx: FirstElt);
2508	}
2509
2510	SmallVector<int> ExtractMask(MaskLen, -`1`);
2511	for (unsigned I = `0`; I != MaskLen; ++I) {
2512	if (DemandedElts [FirstElt + I])
2513	ExtractMask [I] = FirstElt + I;
2514	}
2515
2516	Value *Extract = IC.Builder.CreateShuffleVector(V: Src, Mask: ExtractMask);
2517
2518	// TODO: Preserve callsite attributes?
2519	CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2520
2521	SmallVector<int> InsertMask(OldNumElts, -`1`);
2522	for (unsigned I = `0`; I != MaskLen; ++I) {
2523	if (DemandedElts [FirstElt + I])
2524	InsertMask [FirstElt + I] = I;
2525	}
2526
2527	// FIXME: If the call has a convergence bundle, we end up leaving the dead
2528	// call behind.
2529	return IC.Builder.CreateShuffleVector(V: NewCall, Mask: InsertMask);
2530	}
2531
2532	std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
2533	InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2534	APInt &UndefElts2, APInt &UndefElts3,
2535	std::function<void(Instruction , unsigned*, APInt, APInt &)>
2536	SimplifyAndSetOp) const {
2537	switch (II.getIntrinsicID()) {
2538	case Intrinsic::amdgcn_readfirstlane:
2539	SimplifyAndSetOp (&II, `0`, DemandedElts, UndefElts);
2540	return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2541	case Intrinsic::amdgcn_raw_buffer_load:
2542	case Intrinsic::amdgcn_raw_ptr_buffer_load:
2543	case Intrinsic::amdgcn_raw_buffer_load_format:
2544	case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2545	case Intrinsic::amdgcn_raw_tbuffer_load:
2546	case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2547	case Intrinsic::amdgcn_s_buffer_load:
2548	case Intrinsic::amdgcn_struct_buffer_load:
2549	case Intrinsic::amdgcn_struct_ptr_buffer_load:
2550	case Intrinsic::amdgcn_struct_buffer_load_format:
2551	case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2552	case Intrinsic::amdgcn_struct_tbuffer_load:
2553	case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2554	return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2555	default: {
2556	if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
2557	return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: `0`);
2558	}
2559	break;
2560	}
2561	}
2562	return std::nullopt;
2563	}
2564

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp