1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "llvm/ADT/FloatingPointMode.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
31namespace {
32
33struct AMDGPUImageDMaskIntrinsic {
34 unsigned Intr;
35};
36
37#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
38#include "InstCombineTables.inc"
39
40} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
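// For example, fmed3(1.0, 3.0, 2.0) folds to 2.0: the result is the maximum
// of the two operands that are not the overall maximum, i.e. the median.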
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(A: Src1, B: Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(A: Src0, B: Src2);
59
60 return maxnum(A: Src0, B: Src1);
61}
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
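// For example, (fpext half %x to float), (zext i16 %x to i32), or a constant
// that is exactly representable in 16 bits can all be narrowed safely; values
// that are already 16 bits wide are rejected, since there is nothing to narrow.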
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(Bitwidth: 16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
79 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
80 losesInfo: &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
94 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(Bitwidth: 16))
98 return true;
99 }
100
101 return false;
102}
103
104// Convert a value to 16-bit.
105static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
106 Type *VTy = V.getType();
107 if (isa<FPExtInst, SExtInst, ZExtInst>(Val: &V))
108 return cast<Instruction>(Val: &V)->getOperand(i: 0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
117/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
118/// modified arguments (based on OldIntr) and replaces InstToReplace with
119/// this newly created intrinsic call.
120static std::optional<Instruction *> modifyIntrinsicCall(
121 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
122 InstCombiner &IC,
123 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
124 Func) {
125 SmallVector<Type *, 4> ArgTys;
126 if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
127 return std::nullopt;
128
129 SmallVector<Value *, 8> Args(OldIntr.args());
130
131 // Modify arguments and types
132 Func(Args, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: NewIntr, Types: ArgTys, Args);
135 NewCall->takeName(V: &OldIntr);
136 NewCall->copyMetadata(SrcInst: OldIntr);
137 if (isa<FPMathOperator>(Val: NewCall))
138 NewCall->copyFastMathFlags(I: &OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
143
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto *RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(I&: OldIntr);
149
150 return RetValue;
151}
152
153static std::optional<Instruction *>
154simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
155 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
156 IntrinsicInst &II, InstCombiner &IC) {
157 // Optimize _L to _LZ when _L is zero
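 // For example, llvm.amdgcn.image.sample.l.2d with a constant lod of 0.0
 // becomes llvm.amdgcn.image.sample.lz.2d with the lod operand removed.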
158 if (const auto *LZMappingInfo =
159 AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
160 if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
164 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
165 Dim: ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
173
174 // Optimize _mip away, when 'lod' is zero
175 if (const auto *MIPMappingInfo =
176 AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
177 if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
182 Dim: ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
193 AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
194 if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
199 Dim: ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
211 AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
212 if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
216 AMDGPU::getImageDimIntrinsicByBaseOpcode(
217 BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
230 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
231
232 if (BaseOpcode->HasD16) {
233
234 // If the only use of image intrinsic is a fptrunc (with conversion to
235 // half) then both fptrunc and image intrinsic will be replaced with image
236 // intrinsic with D16 flag.
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
240 if (User->getOpcode() == Instruction::FPTrunc &&
241 User->getType()->getScalarType()->isHalfTy()) {
242
243 return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
244 Func: [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251
252 // Only perform D16 folding if every user of the image sample is
253 // an ExtractElementInst immediately followed by an FPTrunc to half.
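 // For example, a <4 x float> sample whose elements are each extracted and
 // fptrunc'd to half is rewritten as a <4 x half> sample followed by plain
 // extractelement instructions.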
254 SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
255 ExtractTruncPairs;
256 bool AllHalfExtracts = true;
257
258 for (User *U : II.users()) {
259 auto *Ext = dyn_cast<ExtractElementInst>(Val: U);
260 if (!Ext || !Ext->hasOneUse()) {
261 AllHalfExtracts = false;
262 break;
263 }
264
265 auto *Tr = dyn_cast<FPTruncInst>(Val: *Ext->user_begin());
266 if (!Tr || !Tr->getType()->isHalfTy()) {
267 AllHalfExtracts = false;
268 break;
269 }
270
271 ExtractTruncPairs.emplace_back(Args&: Ext, Args&: Tr);
272 }
273
274 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275 auto *VecTy = cast<VectorType>(Val: II.getType());
276 Type *HalfVecTy =
277 VecTy->getWithNewType(EltTy: Type::getHalfTy(C&: II.getContext()));
278
279 // Obtain the original image sample intrinsic's signature
280 // and replace its return type with the half-vector for D16 folding
281 SmallVector<Type *, 8> SigTys;
282 Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: SigTys);
283 SigTys[0] = HalfVecTy;
284
285 Module *M = II.getModule();
286 Function *HalfDecl =
287 Intrinsic::getOrInsertDeclaration(M, id: ImageDimIntr->Intr, Tys: SigTys);
288
289 II.mutateType(Ty: HalfVecTy);
290 II.setCalledFunction(HalfDecl);
291
292 IRBuilder<> Builder(II.getContext());
293 for (auto &[Ext, Tr] : ExtractTruncPairs) {
294 Value *Idx = Ext->getIndexOperand();
295
296 Builder.SetInsertPoint(Tr);
297
298 Value *HalfExtract = Builder.CreateExtractElement(Vec: &II, Idx);
299 HalfExtract->takeName(V: Tr);
300
301 Tr->replaceAllUsesWith(V: HalfExtract);
302 }
303
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 IC.eraseInstFromFunction(I&: *Tr);
306 IC.eraseInstFromFunction(I&: *Ext);
307 }
308
309 return &II;
310 }
311 }
312 }
313
314 // Try to use A16 or G16
315 if (!ST->hasA16() && !ST->hasG16())
316 return std::nullopt;
317
318 // Address is interpreted as float if the instruction has a sampler or as
319 // unsigned int if there is no sampler.
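 // For example, coordinates produced by fpext from half (or constants exactly
 // representable in half) can be passed as f16 directly via A16; if only the
 // gradients qualify, G16 is used for them alone.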
320 bool HasSampler =
321 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
322 bool FloatCoord = false;
323 // true means derivatives can be converted to 16 bit, coordinates not
324 bool OnlyDerivatives = false;
325
326 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
327 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
328 Value *Coord = II.getOperand(i_nocapture: OperandIndex);
329 // If the values are not derived from 16-bit values, we cannot optimize.
330 if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
331 if (OperandIndex < ImageDimIntr->CoordStart ||
332 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
333 return std::nullopt;
334 }
335 // All gradients can be converted, so convert only them
336 OnlyDerivatives = true;
337 break;
338 }
339
340 assert(OperandIndex == ImageDimIntr->GradientStart ||
341 FloatCoord == Coord->getType()->isFloatingPointTy());
342 FloatCoord = Coord->getType()->isFloatingPointTy();
343 }
344
345 if (!OnlyDerivatives && !ST->hasA16())
346 OnlyDerivatives = true; // Only supports G16
347
348 // Check if there is a bias parameter and if it can be converted to f16
349 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
350 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
351 assert(HasSampler &&
352 "Only image instructions with a sampler can have a bias");
353 if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
354 OnlyDerivatives = true;
355 }
356
357 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
358 ImageDimIntr->CoordStart))
359 return std::nullopt;
360
361 Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
362 : Type::getInt16Ty(C&: II.getContext());
363
364 return modifyIntrinsicCall(
365 OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
366 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
367 if (!OnlyDerivatives) {
368 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
369
370 // Change the bias type
371 if (ImageDimIntr->NumBiasArgs != 0)
372 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
373 }
374
375 unsigned EndIndex =
376 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
377 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
378 OperandIndex < EndIndex; OperandIndex++) {
379 Args[OperandIndex] =
380 convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
381 }
382
383 // Convert the bias
384 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
385 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
386 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
387 }
388 });
389}
390
391bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
392 const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
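 // For example, multiplying by the constant 2.0 is safe to rewrite, since 2.0
 // is finite and non-zero.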
399 if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
400 match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
401 // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
405 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
406 if (isKnownNeverInfOrNaN(V: Op0, SQ) && isKnownNeverInfOrNaN(V: Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
412
413/// Match an fpext from half to float, or a constant we can convert.
414static Value *matchFPExtFromF16(Value *Arg) {
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
423 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
424 if (!LosesInfo)
425 return ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
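// For example, <x, y, 0.0, 0.0> only demands its first two elements, giving a
// mask of 0b0011.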
432static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(V: UseV, EltNo: i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(BitPosition: i);
451 }
452
453 return DemandedElts;
454}
455
456// Trim elements from the end of the vector \p V, if they are
457// equal to the first element of the vector.
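// For example, <x, y, x, x> only demands its first two elements (mask 0b0011),
// since the trailing components merely repeat element 0.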
458static APInt defaultComponentBroadcast(Value *V) {
459 auto *VTy = cast<FixedVectorType>(Val: V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
462 Value *FirstComponent = findScalarElement(V, EltNo: 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
466 SVI->getShuffleMask(Result&: ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, EltNo: I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(BitPosition: I);
480 }
481
482 return DemandedElts;
483}
484
485static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
486 IntrinsicInst &II,
487 APInt DemandedElts,
488 int DMaskIdx = -1,
489 bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(Val: V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(Val: V))
504 return AMDGPU::isArgPassedInSGPR(Arg: A);
505 if (const auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(IntrID: II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(Val: U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
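/// For example, on wave32 a constant lane index of 35 is rewritten to 3, since
/// only the low 5 bits are used.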
518bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
519 IntrinsicInst &II,
520 unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(N: MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(I: &II, OpNo: LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(i: LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(Ty: LaneArg->getType(), V: Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(i: LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
545
546static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
547 Function &NewCallee, ArrayRef<Value *> Ops) {
548 SmallVector<OperandBundleDef, 2> OpBundles;
549 Old.getOperandBundlesAsDefs(Defs&: OpBundles);
550
551 CallInst *NewCall = B.CreateCall(Callee: &NewCallee, Args: Ops, OpBundles);
552 NewCall->takeName(V: &Old);
553 return NewCall;
554}
555
556Instruction *
557GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
558 IntrinsicInst &II) const {
559 const auto IID = II.getIntrinsicID();
560 assert(IID == Intrinsic::amdgcn_readlane ||
561 IID == Intrinsic::amdgcn_readfirstlane ||
562 IID == Intrinsic::amdgcn_permlane64);
563
564 Instruction *OpInst = dyn_cast<Instruction>(Val: II.getOperand(i_nocapture: 0));
565
566 // Only do this if both instructions are in the same block
567 // (so the exec mask won't change) and the readlane is the only user of its
568 // operand.
569 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
570 return nullptr;
571
572 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
573
574 // If this is a readlane, check that the second operand is a constant, or is
575 // defined before OpInst so we know it's safe to move this intrinsic higher.
576 Value *LaneID = nullptr;
577 if (IsReadLane) {
578 LaneID = II.getOperand(i_nocapture: 1);
579
580 // readlane takes an extra operand for the lane ID, so we must check if that
581 // LaneID value can be used at the point where we want to move the
582 // intrinsic.
583 if (auto *LaneIDInst = dyn_cast<Instruction>(Val: LaneID)) {
584 if (!IC.getDominatorTree().dominates(Def: LaneIDInst, User: OpInst))
585 return nullptr;
586 }
587 }
588
589 // Hoist the intrinsic (II) through OpInst.
590 //
591 // (II (OpInst x)) -> (OpInst (II x))
592 const auto DoIt = [&](unsigned OpIdx,
593 Function *NewIntrinsic) -> Instruction * {
594 SmallVector<Value *, 2> Ops{OpInst->getOperand(i: OpIdx)};
595 if (IsReadLane)
596 Ops.push_back(Elt: LaneID);
597
598 // Rewrite the intrinsic call.
599 CallInst *NewII = rewriteCall(B&: IC.Builder, Old&: II, NewCallee&: *NewIntrinsic, Ops);
600
601 // Rewrite OpInst so it takes the result of the intrinsic now.
602 Instruction &NewOp = *OpInst->clone();
603 NewOp.setOperand(i: OpIdx, Val: NewII);
604 return &NewOp;
605 };
606
607 // TODO(?): Should we do more with permlane64?
608 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Val: OpInst))
609 return nullptr;
610
611 if (isa<UnaryOperator>(Val: OpInst))
612 return DoIt(0, II.getCalledFunction());
613
614 if (isa<CastInst>(Val: OpInst)) {
615 Value *Src = OpInst->getOperand(i: 0);
616 Type *SrcTy = Src->getType();
617 if (!isTypeLegal(Ty: SrcTy))
618 return nullptr;
619
620 Function *Remangled =
621 Intrinsic::getOrInsertDeclaration(M: II.getModule(), id: IID, Tys: {SrcTy});
622 return DoIt(0, Remangled);
623 }
624
625 // We can also hoist through binary operators if the other operand is uniform.
626 if (isa<BinaryOperator>(Val: OpInst)) {
627 // FIXME: If we had access to UniformityInfo here we could just check
628 // if the operand is uniform.
629 if (isTriviallyUniform(U: OpInst->getOperandUse(i: 0)))
630 return DoIt(1, II.getCalledFunction());
631 if (isTriviallyUniform(U: OpInst->getOperandUse(i: 1)))
632 return DoIt(0, II.getCalledFunction());
633 }
634
635 return nullptr;
636}
637
638std::optional<Instruction *>
639GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
640 Intrinsic::ID IID = II.getIntrinsicID();
641 switch (IID) {
642 case Intrinsic::amdgcn_rcp: {
643 Value *Src = II.getArgOperand(i: 0);
644 if (isa<PoisonValue>(Val: Src))
645 return IC.replaceInstUsesWith(I&: II, V: Src);
646
647 // TODO: Move to ConstantFolding/InstSimplify?
648 if (isa<UndefValue>(Val: Src)) {
649 Type *Ty = II.getType();
650 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
651 return IC.replaceInstUsesWith(I&: II, V: QNaN);
652 }
653
654 if (II.isStrictFP())
655 break;
656
657 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
658 const APFloat &ArgVal = C->getValueAPF();
659 APFloat Val(ArgVal.getSemantics(), 1);
660 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
661
662 // This is more precise than the instruction may give.
663 //
664 // TODO: The instruction always flushes denormal results (except for f16),
665 // should this also?
666 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
667 }
668
669 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
670 if (!FMF.allowContract())
671 break;
672 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
673 if (!SrcCI)
674 break;
675
676 auto IID = SrcCI->getIntrinsicID();
677 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
678 //
679 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
680 // relaxed.
681 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
682 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
683 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
684 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
685 break;
686
687 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
688 break;
689
690 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
691 M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, Tys: {SrcCI->getType()});
692
693 InnerFMF |= FMF;
694 II.setFastMathFlags(InnerFMF);
695
696 II.setCalledFunction(NewDecl);
697 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
698 }
699
700 break;
701 }
702 case Intrinsic::amdgcn_sqrt:
703 case Intrinsic::amdgcn_rsq: {
704 Value *Src = II.getArgOperand(i: 0);
705 if (isa<PoisonValue>(Val: Src))
706 return IC.replaceInstUsesWith(I&: II, V: Src);
707
708 // TODO: Move to ConstantFolding/InstSimplify?
709 if (isa<UndefValue>(Val: Src)) {
710 Type *Ty = II.getType();
711 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
712 return IC.replaceInstUsesWith(I&: II, V: QNaN);
713 }
714
715 // f16 amdgcn.sqrt is identical to regular sqrt.
716 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
717 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
718 M: II.getModule(), id: Intrinsic::sqrt, Tys: {II.getType()});
719 II.setCalledFunction(NewDecl);
720 return &II;
721 }
722
723 break;
724 }
725 case Intrinsic::amdgcn_log:
726 case Intrinsic::amdgcn_exp2: {
727 const bool IsLog = IID == Intrinsic::amdgcn_log;
728 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
729 Value *Src = II.getArgOperand(i: 0);
730 Type *Ty = II.getType();
731
732 if (isa<PoisonValue>(Val: Src))
733 return IC.replaceInstUsesWith(I&: II, V: Src);
734
735 if (IC.getSimplifyQuery().isUndefValue(V: Src))
736 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
737
738 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
739 if (C->isInfinity()) {
740 // exp2(+inf) -> +inf
741 // log2(+inf) -> +inf
742 if (!C->isNegative())
743 return IC.replaceInstUsesWith(I&: II, V: C);
744
745 // exp2(-inf) -> 0
746 if (IsExp && C->isNegative())
747 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
748 }
749
750 if (II.isStrictFP())
751 break;
752
753 if (C->isNaN()) {
754 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
755 return IC.replaceInstUsesWith(I&: II, V: Quieted);
756 }
757
758 // f32 instruction doesn't handle denormals, f16 does.
759 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
760 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
761 : ConstantFP::get(Ty, V: 1.0);
762 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
763 }
764
765 if (IsLog && C->isNegative())
766 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
767
768 // TODO: Full constant folding matching hardware behavior.
769 }
770
771 break;
772 }
773 case Intrinsic::amdgcn_frexp_mant:
774 case Intrinsic::amdgcn_frexp_exp: {
775 Value *Src = II.getArgOperand(i: 0);
776 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
777 int Exp;
778 APFloat Significand =
779 frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
780
781 if (IID == Intrinsic::amdgcn_frexp_mant) {
782 return IC.replaceInstUsesWith(
783 I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
784 }
785
786 // Match instruction special case behavior.
787 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
788 Exp = 0;
789
790 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: Exp));
791 }
792
793 if (isa<PoisonValue>(Val: Src))
794 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
795
796 if (isa<UndefValue>(Val: Src)) {
797 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
798 }
799
800 break;
801 }
802 case Intrinsic::amdgcn_class: {
803 Value *Src0 = II.getArgOperand(i: 0);
804 Value *Src1 = II.getArgOperand(i: 1);
805 const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
806 if (CMask) {
807 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
808 M: II.getModule(), id: Intrinsic::is_fpclass, Tys: Src0->getType()));
809
810 // Clamp any excess bits, as they're illegal for the generic intrinsic.
811 II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
812 V: CMask->getZExtValue() & fcAllFlags));
813 return &II;
814 }
815
816 // Propagate poison.
817 if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
818 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
819
820 // llvm.amdgcn.class(_, undef) -> false
821 if (IC.getSimplifyQuery().isUndefValue(V: Src1))
822 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
823
824 // llvm.amdgcn.class(undef, mask) -> mask != 0
825 if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
826 Value *CmpMask = IC.Builder.CreateICmpNE(
827 LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
828 return IC.replaceInstUsesWith(I&: II, V: CmpMask);
829 }
830 break;
831 }
832 case Intrinsic::amdgcn_cvt_pkrtz: {
833 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
834 Type *HalfTy = Type::getHalfTy(C&: Arg->getContext());
835
836 if (isa<PoisonValue>(Val: Arg))
837 return PoisonValue::get(T: HalfTy);
838 if (isa<UndefValue>(Val: Arg))
839 return UndefValue::get(T: HalfTy);
840
841 ConstantFP *CFP = nullptr;
842 if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
843 bool LosesInfo;
844 APFloat Val(CFP->getValueAPF());
845 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
846 return ConstantFP::get(Ty: HalfTy, V: Val);
847 }
848
849 Value *Src = nullptr;
850 if (match(V: Arg, P: m_FPExt(Op: m_Value(V&: Src)))) {
851 if (Src->getType()->isHalfTy())
852 return Src;
853 }
854
855 return nullptr;
856 };
857
858 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(i: 0))) {
859 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(i: 1))) {
860 Value *V = PoisonValue::get(T: II.getType());
861 V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src0, Idx: (uint64_t)0);
862 V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src1, Idx: (uint64_t)1);
863 return IC.replaceInstUsesWith(I&: II, V);
864 }
865 }
866
867 break;
868 }
869 case Intrinsic::amdgcn_cvt_pknorm_i16:
870 case Intrinsic::amdgcn_cvt_pknorm_u16:
871 case Intrinsic::amdgcn_cvt_pk_i16:
872 case Intrinsic::amdgcn_cvt_pk_u16: {
873 Value *Src0 = II.getArgOperand(i: 0);
874 Value *Src1 = II.getArgOperand(i: 1);
875
876 // TODO: Replace call with scalar operation if only one element is poison.
877 if (isa<PoisonValue>(Val: Src0) && isa<PoisonValue>(Val: Src1))
878 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
879
880 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
881 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
882 }
883
884 break;
885 }
886 case Intrinsic::amdgcn_cvt_off_f32_i4: {
887 Value *Arg = II.getArgOperand(i: 0);
888 Type *Ty = II.getType();
889
890 if (isa<PoisonValue>(Val: Arg))
891 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: Ty));
892
893 if (IC.getSimplifyQuery().isUndefValue(V: Arg))
894 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty));
895
896 ConstantInt *CArg = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
897 if (!CArg)
898 break;
899
900 // Tabulated 0.0625 * (sext (CArg & 0xf)).
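 // For example, CArg == 9 yields -0.4375, i.e. 0.0625 * sext(i4 0x9) = 0.0625 * -7.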
901 constexpr size_t ResValsSize = 16;
902 static constexpr float ResVals[ResValsSize] = {
903 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
904 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
905 Constant *Res =
906 ConstantFP::get(Ty, V: ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
907 return IC.replaceInstUsesWith(I&: II, V: Res);
908 }
909 case Intrinsic::amdgcn_ubfe:
910 case Intrinsic::amdgcn_sbfe: {
911 // Decompose simple cases into standard shifts.
912 Value *Src = II.getArgOperand(i: 0);
913 if (isa<UndefValue>(Val: Src)) {
914 return IC.replaceInstUsesWith(I&: II, V: Src);
915 }
916
917 unsigned Width;
918 Type *Ty = II.getType();
919 unsigned IntSize = Ty->getIntegerBitWidth();
920
921 ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
922 if (CWidth) {
923 Width = CWidth->getZExtValue();
924 if ((Width & (IntSize - 1)) == 0) {
925 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
926 }
927
928 // Hardware ignores high bits, so remove those.
929 if (Width >= IntSize) {
930 return IC.replaceOperand(
931 I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
932 }
933 }
934
935 unsigned Offset;
936 ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
937 if (COffset) {
938 Offset = COffset->getZExtValue();
939 if (Offset >= IntSize) {
940 return IC.replaceOperand(
941 I&: II, OpNum: 1,
942 V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
943 }
944 }
945
946 bool Signed = IID == Intrinsic::amdgcn_sbfe;
947
948 if (!CWidth || !COffset)
949 break;
950
951 // The case of Width == 0 is handled above, which makes this transformation
952 // safe. If Width == 0, then the ashr and lshr instructions become poison
953 // value since the shift amount would be equal to the bit size.
954 assert(Width != 0);
955
956 // TODO: This allows folding to undef when the hardware has specific
957 // behavior?
958 if (Offset + Width < IntSize) {
959 Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
960 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
961 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
962 RightShift->takeName(V: &II);
963 return IC.replaceInstUsesWith(I&: II, V: RightShift);
964 }
965
966 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
967 : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
968
969 RightShift->takeName(V: &II);
970 return IC.replaceInstUsesWith(I&: II, V: RightShift);
971 }
972 case Intrinsic::amdgcn_exp:
973 case Intrinsic::amdgcn_exp_row:
974 case Intrinsic::amdgcn_exp_compr: {
975 ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
976 unsigned EnBits = En->getZExtValue();
977 if (EnBits == 0xf)
978 break; // All inputs enabled.
979
980 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
981 bool Changed = false;
982 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
983 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
984 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
985 Value *Src = II.getArgOperand(i: I + 2);
986 if (!isa<PoisonValue>(Val: Src)) {
987 IC.replaceOperand(I&: II, OpNum: I + 2, V: PoisonValue::get(T: Src->getType()));
988 Changed = true;
989 }
990 }
991 }
992
993 if (Changed) {
994 return &II;
995 }
996
997 break;
998 }
999 case Intrinsic::amdgcn_fmed3: {
1000 Value *Src0 = II.getArgOperand(i: 0);
1001 Value *Src1 = II.getArgOperand(i: 1);
1002 Value *Src2 = II.getArgOperand(i: 2);
1003
1004 for (Value *Src : {Src0, Src1, Src2}) {
1005 if (isa<PoisonValue>(Val: Src))
1006 return IC.replaceInstUsesWith(I&: II, V: Src);
1007 }
1008
1009 if (II.isStrictFP())
1010 break;
1011
1012 // med3 with a nan input acts like
1013 // v_min_f32(v_min_f32(s0, s1), s2)
1014 //
1015 // Signalingness is ignored with ieee=0, so we fold to
1016 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1017 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1018 // returned signaling nan will not be quieted.
1019
1020 // ieee=1
1021 // s0 snan: s2
1022 // s1 snan: s2
1023 // s2 snan: qnan
1024
1025 // s0 qnan: min(s1, s2)
1026 // s1 qnan: min(s0, s2)
1027 // s2 qnan: min(s0, s1)
1028
1029 // ieee=0
1030 // s0 _nan: min(s1, s2)
1031 // s1 _nan: min(s0, s2)
1032 // s2 _nan: min(s0, s1)
1033
1034 // med3 behavior with infinity
1035 // s0 +inf: max(s1, s2)
1036 // s1 +inf: max(s0, s2)
1037 // s2 +inf: max(s0, s1)
1038 // s0 -inf: min(s1, s2)
1039 // s1 -inf: min(s0, s2)
1040 // s2 -inf: min(s0, s1)
1041
1042 // Checking for NaN before canonicalization provides better fidelity when
1043 // mapping other operations onto fmed3 since the order of operands is
1044 // unchanged.
1045 Value *V = nullptr;
1046 const APFloat *ConstSrc0 = nullptr;
1047 const APFloat *ConstSrc1 = nullptr;
1048 const APFloat *ConstSrc2 = nullptr;
1049
1050 if ((match(V: Src0, P: m_APFloat(Res&: ConstSrc0)) &&
1051 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1052 isa<UndefValue>(Val: Src0)) {
1053 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1054 switch (fpenvIEEEMode(I: II)) {
1055 case KnownIEEEMode::On:
1056 // TODO: If Src2 is snan, does it need quieting?
1057 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1058 return IC.replaceInstUsesWith(I&: II, V: Src2);
1059
1060 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src1, RHS: Src2)
1061 : IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
1062 break;
1063 case KnownIEEEMode::Off:
1064 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src1, RHS: Src2)
1065 : IC.Builder.CreateMinimumNum(LHS: Src1, RHS: Src2);
1066 break;
1067 case KnownIEEEMode::Unknown:
1068 break;
1069 }
1070 } else if ((match(V: Src1, P: m_APFloat(Res&: ConstSrc1)) &&
1071 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1072 isa<UndefValue>(Val: Src1)) {
1073 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1074 switch (fpenvIEEEMode(I: II)) {
1075 case KnownIEEEMode::On:
1076 // TODO: If Src2 is snan, does it need quieting?
1077 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1078 return IC.replaceInstUsesWith(I&: II, V: Src2);
1079
1080 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src2)
1081 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
1082 break;
1083 case KnownIEEEMode::Off:
1084 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src2)
1085 : IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src2);
1086 break;
1087 case KnownIEEEMode::Unknown:
1088 break;
1089 }
1090 } else if ((match(V: Src2, P: m_APFloat(Res&: ConstSrc2)) &&
1091 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1092 isa<UndefValue>(Val: Src2)) {
1093 switch (fpenvIEEEMode(I: II)) {
1094 case KnownIEEEMode::On:
1095 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1096 auto *Quieted = ConstantFP::get(Ty: II.getType(), V: ConstSrc2->makeQuiet());
1097 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1098 }
1099
1100 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1101 ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1)
1102 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src1);
1103 break;
1104 case KnownIEEEMode::Off:
1105 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1106 ? IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src1)
1107 : IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src1);
1108 break;
1109 case KnownIEEEMode::Unknown:
1110 break;
1111 }
1112 }
1113
1114 if (V) {
1115 if (auto *CI = dyn_cast<CallInst>(Val: V)) {
1116 CI->copyFastMathFlags(I: &II);
1117 CI->takeName(V: &II);
1118 }
1119 return IC.replaceInstUsesWith(I&: II, V);
1120 }
1121
1122 bool Swap = false;
1123 // Canonicalize constants to RHS operands.
1124 //
1125 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1126 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1127 std::swap(a&: Src0, b&: Src1);
1128 Swap = true;
1129 }
1130
1131 if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
1132 std::swap(a&: Src1, b&: Src2);
1133 Swap = true;
1134 }
1135
1136 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1137 std::swap(a&: Src0, b&: Src1);
1138 Swap = true;
1139 }
1140
1141 if (Swap) {
1142 II.setArgOperand(i: 0, v: Src0);
1143 II.setArgOperand(i: 1, v: Src1);
1144 II.setArgOperand(i: 2, v: Src2);
1145 return &II;
1146 }
1147
1148 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
1149 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
1150 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
1151 APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
1152 Src2: C2->getValueAPF());
1153 return IC.replaceInstUsesWith(I&: II,
1154 V: ConstantFP::get(Ty: II.getType(), V: Result));
1155 }
1156 }
1157 }
1158
1159 if (!ST->hasMed3_16())
1160 break;
1161
1162 // Repeat floating-point width reduction done for minnum/maxnum.
1163 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1164 if (Value *X = matchFPExtFromF16(Arg: Src0)) {
1165 if (Value *Y = matchFPExtFromF16(Arg: Src1)) {
1166 if (Value *Z = matchFPExtFromF16(Arg: Src2)) {
1167 Value *NewCall = IC.Builder.CreateIntrinsic(
1168 ID: IID, Types: {X->getType()}, Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
1169 return new FPExtInst(NewCall, II.getType());
1170 }
1171 }
1172 }
1173
1174 break;
1175 }
1176 case Intrinsic::amdgcn_icmp:
1177 case Intrinsic::amdgcn_fcmp: {
1178 const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
1179 // Guard against invalid arguments.
1180 int64_t CCVal = CC->getZExtValue();
1181 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1182 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1183 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1184 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1185 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
1186 break;
1187
1188 Value *Src0 = II.getArgOperand(i: 0);
1189 Value *Src1 = II.getArgOperand(i: 1);
1190
1191 if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
1192 if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
1193 Constant *CCmp = ConstantFoldCompareInstOperands(
1194 Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
1195 if (CCmp && CCmp->isNullValue()) {
1196 return IC.replaceInstUsesWith(
1197 I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
1198 }
1199
1200 // The result of V_ICMP/V_FCMP assembly instructions (which this
1201 // intrinsic exposes) is one bit per thread, masked with the EXEC
1202 // register (which contains the bitmask of live threads). So a
1203 // comparison that always returns true is the same as a read of the
1204 // EXEC register.
1205 Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
1206 MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
1207 Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
1208 CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: Intrinsic::read_register,
1209 Types: II.getType(), Args);
1210 NewCall->addFnAttr(Kind: Attribute::Convergent);
1211 NewCall->takeName(V: &II);
1212 return IC.replaceInstUsesWith(I&: II, V: NewCall);
1213 }
1214
1215 // Canonicalize constants to RHS.
1216 CmpInst::Predicate SwapPred =
1217 CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
1218 II.setArgOperand(i: 0, v: Src1);
1219 II.setArgOperand(i: 1, v: Src0);
1220 II.setArgOperand(
1221 i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
1222 return &II;
1223 }
1224
1225 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1226 break;
1227
1228 // Canonicalize compare eq with true value to compare != 0
1229 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1230 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1231 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1232 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1233 Value *ExtSrc;
1234 if (CCVal == CmpInst::ICMP_EQ &&
1235 ((match(V: Src1, P: PatternMatch::m_One()) &&
1236 match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
1237 (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
1238 match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
1239 ExtSrc->getType()->isIntegerTy(Bitwidth: 1)) {
1240 IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
1241 IC.replaceOperand(I&: II, OpNum: 2,
1242 V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
1243 return &II;
1244 }
1245
1246 CmpPredicate SrcPred;
1247 Value *SrcLHS;
1248 Value *SrcRHS;
1249
1250 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1251 // intrinsic. The typical use is a wave vote function in the library, which
1252 // will be fed from a user code condition compared with 0. Fold in the
1253 // redundant compare.
1254
1255 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1256 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1257 //
1258 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1259 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1260 if (match(V: Src1, P: PatternMatch::m_Zero()) &&
1261 match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
1262 Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
1263 R: PatternMatch::m_Value(V&: SrcRHS))))) {
1264 if (CCVal == CmpInst::ICMP_EQ)
1265 SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
1266
1267 Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
1268 ? Intrinsic::amdgcn_fcmp
1269 : Intrinsic::amdgcn_icmp;
1270
1271 Type *Ty = SrcLHS->getType();
1272 if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
1273 // Promote to next legal integer type.
1274 unsigned Width = CmpType->getBitWidth();
1275 unsigned NewWidth = Width;
1276
1277 // Don't do anything for i1 comparisons.
1278 if (Width == 1)
1279 break;
1280
1281 if (Width <= 16)
1282 NewWidth = 16;
1283 else if (Width <= 32)
1284 NewWidth = 32;
1285 else if (Width <= 64)
1286 NewWidth = 64;
1287 else
1288 break; // Can't handle this.
1289
1290 if (Width != NewWidth) {
1291 IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
1292 if (CmpInst::isSigned(predicate: SrcPred)) {
1293 SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
1294 SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
1295 } else {
1296 SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
1297 SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
1298 }
1299 }
1300 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1301 break;
1302
1303 Value *Args[] = {SrcLHS, SrcRHS,
1304 ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
1305 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1306 ID: NewIID, Types: {II.getType(), SrcLHS->getType()}, Args);
1307 NewCall->takeName(V: &II);
1308 return IC.replaceInstUsesWith(I&: II, V: NewCall);
1309 }
1310
1311 break;
1312 }
1313 case Intrinsic::amdgcn_mbcnt_hi: {
1314 // exec_hi is all 0, so this is just a copy.
1315 if (ST->isWave32())
1316 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
1317 break;
1318 }
1319 case Intrinsic::amdgcn_ballot: {
1320 Value *Arg = II.getArgOperand(i: 0);
1321 if (isa<PoisonValue>(Val: Arg))
1322 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1323
1324 if (auto *Src = dyn_cast<ConstantInt>(Val: Arg)) {
1325 if (Src->isZero()) {
1326 // amdgcn.ballot(i1 0) is zero.
1327 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1328 }
1329 }
1330 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1331 // %b64 = call i64 ballot.i64(...)
1332 // =>
1333 // %b32 = call i32 ballot.i32(...)
1334 // %b64 = zext i32 %b32 to i64
1335 Value *Call = IC.Builder.CreateZExt(
1336 V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
1337 Types: {IC.Builder.getInt32Ty()},
1338 Args: {II.getArgOperand(i: 0)}),
1339 DestTy: II.getType());
1340 Call->takeName(V: &II);
1341 return IC.replaceInstUsesWith(I&: II, V: Call);
1342 }
1343 break;
1344 }
1345 case Intrinsic::amdgcn_wavefrontsize: {
1346 if (ST->isWaveSizeKnown())
1347 return IC.replaceInstUsesWith(
1348 I&: II, V: ConstantInt::get(Ty: II.getType(), V: ST->getWavefrontSize()));
1349 break;
1350 }
1351 case Intrinsic::amdgcn_wqm_vote: {
1352 // wqm_vote is identity when the argument is constant.
1353 if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
1354 break;
1355
1356 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
1357 }
1358 case Intrinsic::amdgcn_kill: {
1359 const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1360 if (!C || !C->getZExtValue())
1361 break;
1362
1363 // amdgcn.kill(i1 1) is a no-op
1364 return IC.eraseInstFromFunction(I&: II);
1365 }
1366 case Intrinsic::amdgcn_update_dpp: {
1367 Value *Old = II.getArgOperand(i: 0);
1368
1369 auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
1370 auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
1371 auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
1372 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1373 BM->getZExtValue() != 0xF || isa<PoisonValue>(Val: Old))
1374 break;
1375
1376 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1377 return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: Old->getType()));
1378 }
1379 case Intrinsic::amdgcn_permlane16:
1380 case Intrinsic::amdgcn_permlane16_var:
1381 case Intrinsic::amdgcn_permlanex16:
1382 case Intrinsic::amdgcn_permlanex16_var: {
1383 // Discard vdst_in if it's not going to be read.
1384 Value *VDstIn = II.getArgOperand(i: 0);
1385 if (isa<PoisonValue>(Val: VDstIn))
1386 break;
1387
1388 // FetchInvalid operand idx.
1389 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1390 IID == Intrinsic::amdgcn_permlanex16)
1391 ? 4 /* for permlane16 and permlanex16 */
1392 : 3; /* for permlane16_var and permlanex16_var */
1393
1394 // BoundCtrl operand idx.
1395 // For permlane16 and permlanex16 it should be 5
1396 // For Permlane16_var and permlanex16_var it should be 4
1397 unsigned int BcIdx = FiIdx + 1;
1398
1399 ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1400 ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1401 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1402 break;
1403
1404 return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: VDstIn->getType()));
1405 }
1406 case Intrinsic::amdgcn_permlane64:
1407 case Intrinsic::amdgcn_readfirstlane:
1408 case Intrinsic::amdgcn_readlane:
1409 case Intrinsic::amdgcn_ds_bpermute: {
1410 // If the data argument is uniform these intrinsics return it unchanged.
1411 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1412 const Use &Src = II.getArgOperandUse(i: SrcIdx);
1413 if (isTriviallyUniform(U: Src))
1414 return IC.replaceInstUsesWith(I&: II, V: Src.get());
1415
1416 if (IID == Intrinsic::amdgcn_readlane &&
1417 simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1418 return &II;
1419
1420 // If the lane argument of bpermute is uniform, change it to readlane. This
1421 // generates better code and can enable further optimizations because
1422 // readlane is AlwaysUniform.
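 // Note that the index operand of ds_bpermute is a byte address (lane * 4),
 // so it is shifted right by 2 below to form the lane index for readlane.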
1423 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1424 const Use &Lane = II.getArgOperandUse(i: 0);
1425 if (isTriviallyUniform(U: Lane)) {
1426 Value *NewLane = IC.Builder.CreateLShr(LHS: Lane, RHS: 2);
1427 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1428 M: II.getModule(), id: Intrinsic::amdgcn_readlane, Tys: II.getType());
1429 II.setCalledFunction(NewDecl);
1430 II.setOperand(i_nocapture: 0, Val_nocapture: Src);
1431 II.setOperand(i_nocapture: 1, Val_nocapture: NewLane);
1432 return &II;
1433 }
1434 }
1435
1436 if (IID != Intrinsic::amdgcn_ds_bpermute) {
1437 if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1438 return Res;
1439 }
1440
1441 return std::nullopt;
1442 }
1443 case Intrinsic::amdgcn_writelane: {
1444 // TODO: Fold bitcast like readlane.
1445 if (simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1446 return &II;
1447 return std::nullopt;
1448 }
1449 case Intrinsic::amdgcn_trig_preop: {
1450 // The intrinsic is declared with name mangling, but currently the
1451 // instruction only exists for f64
1452 if (!II.getType()->isDoubleTy())
1453 break;
1454
1455 Value *Src = II.getArgOperand(i: 0);
1456 Value *Segment = II.getArgOperand(i: 1);
1457 if (isa<PoisonValue>(Val: Src) || isa<PoisonValue>(Val: Segment))
1458 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1459
1460 if (isa<UndefValue>(Val: Src)) {
1461 auto *QNaN = ConstantFP::get(
1462 Ty: II.getType(), V: APFloat::getQNaN(Sem: II.getType()->getFltSemantics()));
1463 return IC.replaceInstUsesWith(I&: II, V: QNaN);
1464 }
1465
1466 const ConstantFP *Csrc = dyn_cast<ConstantFP>(Val: Src);
1467 if (!Csrc)
1468 break;
1469
1470 if (II.isStrictFP())
1471 break;
1472
1473 const APFloat &Fsrc = Csrc->getValueAPF();
1474 if (Fsrc.isNaN()) {
1475 auto *Quieted = ConstantFP::get(Ty: II.getType(), V: Fsrc.makeQuiet());
1476 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1477 }
1478
1479 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
1480 if (!Cseg)
1481 break;
1482
1483 unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1484 unsigned SegmentVal = Cseg->getValue().trunc(width: 5).getZExtValue();
1485 unsigned Shift = SegmentVal * 53;
1486 if (Exponent > 1077)
1487 Shift += Exponent - 1077;
1488
1489 // 2.0/PI table.
1490 static const uint32_t TwoByPi[] = {
1491 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1492 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1493 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1494 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1495 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1496 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1497 0x56033046};
1498
1499 // Return 0 for outbound segment (hardware behavior).
1500 unsigned Idx = Shift >> 5;
1501 if (Idx + 2 >= std::size(TwoByPi)) {
1502 APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
1503 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
1504 }
1505
1506 unsigned BShift = Shift & 0x1f;
1507 uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + 1]);
1508 uint64_t Tlo = Make_64(High: TwoByPi[Idx + 2], Low: 0);
1509 if (BShift)
1510 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1511 Thi = Thi >> 11;
1512 APFloat Result = APFloat((double)Thi);
1513
1514 int Scale = -53 - Shift;
1515 if (Exponent >= 1968)
1516 Scale += 128;
1517
1518 Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
1519 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
1520 }
1521 case Intrinsic::amdgcn_fmul_legacy: {
1522 Value *Op0 = II.getArgOperand(i: 0);
1523 Value *Op1 = II.getArgOperand(i: 1);
1524
1525 for (Value *Src : {Op0, Op1}) {
1526 if (isa<PoisonValue>(Val: Src))
1527 return IC.replaceInstUsesWith(I&: II, V: Src);
1528 }
1529
1530 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1531 // infinity, gives +0.0.
1532 // TODO: Move to InstSimplify?
1533 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1534 match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
1535 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
1536
1537 // If we can prove we don't have one of the special cases then we can use a
1538 // normal fmul instruction instead.
1539 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1540 auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
1541 FMul->takeName(V: &II);
1542 return IC.replaceInstUsesWith(I&: II, V: FMul);
1543 }
1544 break;
1545 }
1546 case Intrinsic::amdgcn_fma_legacy: {
1547 Value *Op0 = II.getArgOperand(i: 0);
1548 Value *Op1 = II.getArgOperand(i: 1);
1549 Value *Op2 = II.getArgOperand(i: 2);
1550
1551 for (Value *Src : {Op0, Op1, Op2}) {
1552 if (isa<PoisonValue>(Val: Src))
1553 return IC.replaceInstUsesWith(I&: II, V: Src);
1554 }
1555
1556 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1557 // infinity, gives +0.0.
1558 // TODO: Move to InstSimplify?
1559 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1560 match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
1561 // It's tempting to just return Op2 here, but that would give the wrong
1562 // result if Op2 were -0.0, since the intrinsic returns +0.0 + Op2.
1563 auto *Zero = ConstantFP::getZero(Ty: II.getType());
1564 auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
1565 FAdd->takeName(V: &II);
1566 return IC.replaceInstUsesWith(I&: II, V: FAdd);
1567 }
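// Illustrative IR example of the fold above (names are illustrative):
//   %r = call float @llvm.amdgcn.fma.legacy(float 0.0, float %x, float %y)
// becomes
//   %r = fadd float 0.0, %y
// which, unlike returning %y directly, canonicalizes a -0.0 in %y to +0.0.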
1568
1569 // If we can prove we don't have one of the special cases then we can use a
1570 // normal fma instead.
1571 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1572 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1573 M: II.getModule(), id: Intrinsic::fma, Tys: II.getType()));
1574 return &II;
1575 }
1576 break;
1577 }
1578 case Intrinsic::amdgcn_is_shared:
1579 case Intrinsic::amdgcn_is_private: {
1580 Value *Src = II.getArgOperand(i: 0);
1581 if (isa<PoisonValue>(Val: Src))
1582 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1583 if (isa<UndefValue>(Val: Src))
1584 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1585
1586 if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
1587 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
1588 break;
1589 }
1590 case Intrinsic::amdgcn_make_buffer_rsrc: {
1591 Value *Src = II.getArgOperand(i: 0);
1592 if (isa<PoisonValue>(Val: Src))
1593 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1594 return std::nullopt;
1595 }
1596 case Intrinsic::amdgcn_raw_buffer_store_format:
1597 case Intrinsic::amdgcn_struct_buffer_store_format:
1598 case Intrinsic::amdgcn_raw_tbuffer_store:
1599 case Intrinsic::amdgcn_struct_tbuffer_store:
1600 case Intrinsic::amdgcn_image_store_1d:
1601 case Intrinsic::amdgcn_image_store_1darray:
1602 case Intrinsic::amdgcn_image_store_2d:
1603 case Intrinsic::amdgcn_image_store_2darray:
1604 case Intrinsic::amdgcn_image_store_2darraymsaa:
1605 case Intrinsic::amdgcn_image_store_2dmsaa:
1606 case Intrinsic::amdgcn_image_store_3d:
1607 case Intrinsic::amdgcn_image_store_cube:
1608 case Intrinsic::amdgcn_image_store_mip_1d:
1609 case Intrinsic::amdgcn_image_store_mip_1darray:
1610 case Intrinsic::amdgcn_image_store_mip_2d:
1611 case Intrinsic::amdgcn_image_store_mip_2darray:
1612 case Intrinsic::amdgcn_image_store_mip_3d:
1613 case Intrinsic::amdgcn_image_store_mip_cube: {
1614 if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
1615 break;
1616
1617 APInt DemandedElts;
1618 if (ST->hasDefaultComponentBroadcast())
1619 DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
1620 else if (ST->hasDefaultComponentZero())
1621 DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
1622 else
1623 break;
1624
1625 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? 1 : -1;
1626 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1627 IsLoad: false)) {
1628 return IC.eraseInstFromFunction(I&: II);
1629 }
1630
1631 break;
1632 }
1633 case Intrinsic::amdgcn_prng_b32: {
1634 auto *Src = II.getArgOperand(i: 0);
1635 if (isa<UndefValue>(Val: Src)) {
1636 return IC.replaceInstUsesWith(I&: II, V: Src);
1637 }
1638 return std::nullopt;
1639 }
1640 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1641 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1642 Value *Src0 = II.getArgOperand(i: 0);
1643 Value *Src1 = II.getArgOperand(i: 1);
1644 uint64_t CBSZ = cast<ConstantInt>(Val: II.getArgOperand(i: 3))->getZExtValue();
1645 uint64_t BLGP = cast<ConstantInt>(Val: II.getArgOperand(i: 4))->getZExtValue();
1646 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
1647 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
1648
1649 auto getFormatNumRegs = [](unsigned FormatVal) {
1650 switch (FormatVal) {
1651 case AMDGPU::MFMAScaleFormats::FP6_E2M3:
1652 case AMDGPU::MFMAScaleFormats::FP6_E3M2:
1653 return 6u;
1654 case AMDGPU::MFMAScaleFormats::FP4_E2M1:
1655 return 4u;
1656 case AMDGPU::MFMAScaleFormats::FP8_E4M3:
1657 case AMDGPU::MFMAScaleFormats::FP8_E5M2:
1658 return 8u;
1659 default:
1660 llvm_unreachable("invalid format value");
1661 }
1662 };
1663
1664 bool MadeChange = false;
1665 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1666 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1667
1668 // Depending on the format in use, fewer 32-bit registers may be required,
1669 // so shrink the vector type accordingly.
1670 if (Src0Ty->getNumElements() > Src0NumElts) {
1671 Src0 = IC.Builder.CreateExtractVector(
1672 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
1673 Idx: uint64_t(0));
1674 MadeChange = true;
1675 }
1676
1677 if (Src1Ty->getNumElements() > Src1NumElts) {
1678 Src1 = IC.Builder.CreateExtractVector(
1679 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
1680 Idx: uint64_t(0));
1681 MadeChange = true;
1682 }
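// Illustrative example (assuming CBSZ selects an FP6 format): an incoming
// <8 x i32> Src0 only needs 6 dwords, so the extractvector above yields a
// <6 x i32>, and the intrinsic is recreated below with the narrower operand
// types.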
1683
1684 if (!MadeChange)
1685 return std::nullopt;
1686
1687 SmallVector<Value *, 10> Args(II.args());
1688 Args[0] = Src0;
1689 Args[1] = Src1;
1690
1691 CallInst *NewII = IC.Builder.CreateIntrinsic(
1692 ID: IID, Types: {Src0->getType(), Src1->getType()}, Args, FMFSource: &II);
1693 NewII->takeName(V: &II);
1694 return IC.replaceInstUsesWith(I&: II, V: NewII);
1695 }
1696 }
1697 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1698 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1699 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1700 }
1701 return std::nullopt;
1702}
1703
1704/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1705///
1706/// For image and buffer store intrinsics, simplification updates the
1707/// definition of the intrinsic's vector data argument rather than the uses of
1708/// its result, unlike image and buffer loads.
1709/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1710/// struct returns.
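///
/// Two narrowing strategies are used: for buffer intrinsics (DMaskIdx < 0) the
/// demanded range is expressed by shrinking the vector type and, where
/// possible, bumping the byte offset past unused leading components; for image
/// intrinsics (DMaskIdx >= 0) the dmask operand is rewritten to drop unused
/// components instead.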
1711static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1712 IntrinsicInst &II,
1713 APInt DemandedElts,
1714 int DMaskIdx, bool IsLoad) {
1715
1716 auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
1717 : II.getOperand(i_nocapture: 0)->getType());
1718 unsigned VWidth = IIVTy->getNumElements();
1719 if (VWidth == 1)
1720 return nullptr;
1721 Type *EltTy = IIVTy->getElementType();
1722
1723 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1724 IC.Builder.SetInsertPoint(&II);
1725
1726 // Assume the arguments are unchanged and later override them, if needed.
1727 SmallVector<Value *, 16> Args(II.args());
1728
1729 if (DMaskIdx < 0) {
1730 // Buffer case.
1731
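// Illustrative example: if only elements 2 and 3 of a <4 x float>
// llvm.amdgcn.raw.buffer.load are demanded, the code below shrinks the load
// to <2 x float> and adds 2 * 4 = 8 bytes to the offset operand (operand 1
// for raw buffer loads).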
1732 const unsigned ActiveBits = DemandedElts.getActiveBits();
1733 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1734
1735 // Start by assuming a full prefix of elements is demanded. If the low bits
1736 // of the mask are zero (unused components at the front of the vector), clear
1737 // them as well and bump the offset operand past the skipped components.
1738 DemandedElts = (1 << ActiveBits) - 1;
1739
1740 if (UnusedComponentsAtFront > 0) {
1741 static const unsigned InvalidOffsetIdx = 0xf;
1742
1743 unsigned OffsetIdx;
1744 switch (II.getIntrinsicID()) {
1745 case Intrinsic::amdgcn_raw_buffer_load:
1746 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1747 OffsetIdx = 1;
1748 break;
1749 case Intrinsic::amdgcn_s_buffer_load:
1750 // If the resulting type is vec3, there is no point in trimming the
1751 // load with updated offset, as the vec3 would most likely be widened to
1752 // vec4 anyway during lowering.
1753 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1754 OffsetIdx = InvalidOffsetIdx;
1755 else
1756 OffsetIdx = 1;
1757 break;
1758 case Intrinsic::amdgcn_struct_buffer_load:
1759 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1760 OffsetIdx = 2;
1761 break;
1762 default:
1763 // TODO: handle tbuffer* intrinsics.
1764 OffsetIdx = InvalidOffsetIdx;
1765 break;
1766 }
1767
1768 if (OffsetIdx != InvalidOffsetIdx) {
1769 // Clear demanded bits and update the offset.
1770 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1771 auto *Offset = Args[OffsetIdx];
1772 unsigned SingleComponentSizeInBits =
1773 IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
1774 unsigned OffsetAdd =
1775 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1776 auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
1777 Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
1778 }
1779 }
1780 } else {
1781 // Image case.
1782
1783 ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
1784 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1785
1786 // dmask 0 has special semantics, do not simplify.
1787 if (DMaskVal == 0)
1788 return nullptr;
1789
1790 // Mask off elements that are undefined because the dmask does not cover them.
1791 DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;
1792
1793 unsigned NewDMaskVal = 0;
1794 unsigned OrigLdStIdx = 0;
1795 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1796 const unsigned Bit = 1 << SrcIdx;
1797 if (!!(DMaskVal & Bit)) {
1798 if (!!DemandedElts[OrigLdStIdx])
1799 NewDMaskVal |= Bit;
1800 OrigLdStIdx++;
1801 }
1802 }
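// Illustrative example: with an original dmask of 0b0111 and only vector
// elements 0 and 2 demanded, the loop above produces a new dmask of 0b0101.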
1803
1804 if (DMaskVal != NewDMaskVal)
1805 Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
1806 }
1807
1808 unsigned NewNumElts = DemandedElts.popcount();
1809 if (!NewNumElts)
1810 return PoisonValue::get(T: IIVTy);
1811
1812 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1813 if (DMaskIdx >= 0)
1814 II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
1815 return nullptr;
1816 }
1817
1818 // Validate function argument and return types, extracting overloaded types
1819 // along the way.
1820 SmallVector<Type *, 6> OverloadTys;
1821 if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
1822 return nullptr;
1823
1824 Type *NewTy =
1825 (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
1826 OverloadTys[0] = NewTy;
1827
1828 if (!IsLoad) {
1829 SmallVector<int, 8> EltMask;
1830 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1831 if (DemandedElts[OrigStoreIdx])
1832 EltMask.push_back(Elt: OrigStoreIdx);
1833
1834 if (NewNumElts == 1)
1835 Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
1836 else
1837 Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
1838 }
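// For stores, the demanded elements of the data operand were gathered above
// into a single scalar or a compact vector matching NewTy before the call is
// recreated with the narrowed signature.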
1839
1840 CallInst *NewCall =
1841 IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: OverloadTys, Args);
1842 NewCall->takeName(V: &II);
1843 NewCall->copyMetadata(SrcInst: II);
1844
1845 if (IsLoad) {
1846 if (NewNumElts == 1) {
1847 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
1848 Idx: DemandedElts.countr_zero());
1849 }
1850
1851 SmallVector<int, 8> EltMask;
1852 unsigned NewLoadIdx = 0;
1853 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1854 if (!!DemandedElts[OrigLoadIdx])
1855 EltMask.push_back(Elt: NewLoadIdx++);
1856 else
1857 EltMask.push_back(Elt: NewNumElts);
1858 }
1859
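// Mask indices equal to NewNumElts select from the shuffle's implicit poison
// second operand, so lanes that were not demanded become poison while the
// demanded results are scattered back to their original positions.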
1860 auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
1861
1862 return Shuffle;
1863 }
1864
1865 return NewCall;
1866}
1867
1868Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
1869 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
1870 APInt &UndefElts) const {
1871 auto *VT = dyn_cast<FixedVectorType>(Val: II.getType());
1872 if (!VT)
1873 return nullptr;
1874
1875 const unsigned FirstElt = DemandedElts.countr_zero();
1876 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
1877 const unsigned MaskLen = LastElt - FirstElt + 1;
1878
1879 unsigned OldNumElts = VT->getNumElements();
1880 if (MaskLen == OldNumElts && MaskLen != 1)
1881 return nullptr;
1882
1883 Type *EltTy = VT->getElementType();
1884 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: MaskLen);
1885
1886 // Theoretically these intrinsics could be supported for any legal type, but
1887 // avoid introducing cases that aren't direct register types, such as v3i16.
1888 if (!isTypeLegal(Ty: NewVT))
1889 return nullptr;
1890
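// Illustrative example: for a <4 x i32> @llvm.amdgcn.readfirstlane.v4i32 call
// where only element 1 is demanded, the MaskLen == 1 path below emits an
// extractelement of the source, a scalar @llvm.amdgcn.readfirstlane.i32 call,
// and an insertelement of its result back into lane 1.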
1891 Value *Src = II.getArgOperand(i: 0);
1892
1893 // Make sure convergence tokens are preserved.
1894 // TODO: CreateIntrinsic should allow directly copying bundles
1895 SmallVector<OperandBundleDef, 2> OpBundles;
1896 II.getOperandBundlesAsDefs(Defs&: OpBundles);
1897
1898 Module *M = IC.Builder.GetInsertBlock()->getModule();
1899 Function *Remangled =
1900 Intrinsic::getOrInsertDeclaration(M, id: II.getIntrinsicID(), Tys: {NewVT});
1901
1902 if (MaskLen == 1) {
1903 Value *Extract = IC.Builder.CreateExtractElement(Vec: Src, Idx: FirstElt);
1904
1905 // TODO: Preserve callsite attributes?
1906 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
1907
1908 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: II.getType()),
1909 NewElt: NewCall, Idx: FirstElt);
1910 }
1911
1912 SmallVector<int> ExtractMask(MaskLen, -1);
1913 for (unsigned I = 0; I != MaskLen; ++I) {
1914 if (DemandedElts[FirstElt + I])
1915 ExtractMask[I] = FirstElt + I;
1916 }
1917
1918 Value *Extract = IC.Builder.CreateShuffleVector(V: Src, Mask: ExtractMask);
1919
1920 // TODO: Preserve callsite attributes?
1921 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
1922
1923 SmallVector<int> InsertMask(OldNumElts, -1);
1924 for (unsigned I = 0; I != MaskLen; ++I) {
1925 if (DemandedElts[FirstElt + I])
1926 InsertMask[FirstElt + I] = I;
1927 }
1928
1929 // FIXME: If the call has a convergence bundle, we end up leaving the dead
1930 // call behind.
1931 return IC.Builder.CreateShuffleVector(V: NewCall, Mask: InsertMask);
1932}
1933
1934std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1935 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1936 APInt &UndefElts2, APInt &UndefElts3,
1937 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1938 SimplifyAndSetOp) const {
1939 switch (II.getIntrinsicID()) {
1940 case Intrinsic::amdgcn_readfirstlane:
1941 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1942 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
1943 case Intrinsic::amdgcn_raw_buffer_load:
1944 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1945 case Intrinsic::amdgcn_raw_buffer_load_format:
1946 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1947 case Intrinsic::amdgcn_raw_tbuffer_load:
1948 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1949 case Intrinsic::amdgcn_s_buffer_load:
1950 case Intrinsic::amdgcn_struct_buffer_load:
1951 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1952 case Intrinsic::amdgcn_struct_buffer_load_format:
1953 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1954 case Intrinsic::amdgcn_struct_tbuffer_load:
1955 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1956 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1957 default: {
1958 if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
1959 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
1960 }
1961 break;
1962 }
1963 }
1964 return std::nullopt;
1965}
1966