AMDGPUInstCombineIntrinsic.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp]

//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
// \file
// This file implements the AMDGPU-specific InstCombine hooks of the target's
// TargetTransformInfo implementation (GCNTTIImpl). It uses the target's
// detailed information to fold and simplify AMDGPU intrinsic calls, while
// letting the target independent InstCombine handle the rest.
14	//
15	//===----------------------------------------------------------------------===//
16
17	#include "AMDGPUInstrInfo.h"
18	#include "AMDGPUTargetTransformInfo.h"
19	#include "GCNSubtarget.h"
20	#include "llvm/ADT/FloatingPointMode.h"
21	#include "llvm/IR/IntrinsicsAMDGPU.h"
22	#include "llvm/Transforms/InstCombine/InstCombiner.h"
23	#include <optional>
24
25	using namespace llvm;
26	using namespace llvm::PatternMatch;
27
28	#define DEBUG_TYPE "AMDGPUtti"
29
30	namespace {
31
32	struct AMDGPUImageDMaskIntrinsic {
33	unsigned Intr;
34	};
35
36	#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37	#include "InstCombineTables.inc"
38
39	} // end anonymous namespace
40
41	// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42	//
43	// A single NaN input is folded to minnum, so we rely on that folding for
44	// handling NaNs.
45	static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46	const APFloat &Src2) {
47	APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
48
49	APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
50	assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51	if (Cmp0 == APFloat::cmpEqual)
52	return maxnum(A: Src1, B: Src2);
53
54	APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
55	assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56	if (Cmp1 == APFloat::cmpEqual)
57	return maxnum(A: Src0, B: Src2);
58
59	return maxnum(A: Src0, B: Src1);
60	}
61
62	// Check if a value can be converted to a 16-bit value without losing
63	// precision.
64	// The value is expected to be either a float (IsFloat = true) or an unsigned
65	// integer (IsFloat = false).
66	static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67	Type *VTy = V.getType();
68	if (VTy->isHalfTy() \|\| VTy->isIntegerTy(Bitwidth: `16`)) {
69	// The value is already 16-bit, so we don't want to convert to 16-bit again!
70	return false;
71	}
72	if (IsFloat) {
73	if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
74	// We need to check that if we cast the index down to a half, we do not
75	// lose precision.
76	APFloat FloatValue(ConstFloat->getValueAPF());
77	bool LosesInfo = true;
78	FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
79	losesInfo: &LosesInfo);
80	return !LosesInfo;
81	}
82	} else {
83	if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
84	// We need to check that if we cast the index down to an i16, we do not
85	// lose precision.
86	APInt IntValue(ConstInt->getValue());
87	return IntValue.getActiveBits() <= `16`;
88	}
89	}
90
91	Value *CastSrc;
92	bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
93	: match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
94	if (IsExt) {
95	Type *CastSrcTy = CastSrc->getType();
96	if (CastSrcTy->isHalfTy() \|\| CastSrcTy->isIntegerTy(Bitwidth: `16`))
97	return true;
98	}
99
100	return false;
101	}
102
103	// Convert a value to 16-bit.
104	static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105	Type *VTy = V.getType();
106	if (isa<FPExtInst>(Val: &V) \|\| isa<SExtInst>(Val: &V) \|\| isa<ZExtInst>(Val: &V))
107	return cast<Instruction>(Val: &V)->getOperand(i: `0`);
108	if (VTy->isIntegerTy())
109	return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
110	if (VTy->isFloatingPointTy())
111	return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
112
113	llvm_unreachable("Should never be called!");
114	}
115
116	/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117	/// modified arguments (based on OldIntr) and replaces InstToReplace with
118	/// this newly created intrinsic call.
119	static std::optional<Instruction *> modifyIntrinsicCall(
120	IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121	InstCombiner &IC,
122	std::function<void(SmallVectorImpl<Value > &, SmallVectorImpl<Type > &)>
123	Func) {
124	SmallVector<Type *, `4`> ArgTys;
125	if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
126	return std::nullopt;
127
128	SmallVector<Value *, `8`> Args(OldIntr.args());
129
130	// Modify arguments and types
131	Func (Args, ArgTys);
132
133	Function *I = Intrinsic::getDeclaration(M: OldIntr.getModule(), id: NewIntr, Tys: ArgTys);
134
135	CallInst *NewCall = IC.Builder.CreateCall(Callee: I, Args);
136	NewCall->takeName(V: &OldIntr);
137	NewCall->copyMetadata(SrcInst: OldIntr);
138	if (isa<FPMathOperator>(Val: NewCall))
139	NewCall->copyFastMathFlags(I: &OldIntr);
140
141	// Erase and replace uses
142	if (!InstToReplace.getType()->isVoidTy())
143	IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
144
145	bool RemoveOldIntr = &OldIntr != &InstToReplace;
146
147	auto RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
148	if (RemoveOldIntr)
149	IC.eraseInstFromFunction(I&: OldIntr);
150
151	return RetValue;
152	}
153
154	static std::optional<Instruction *>
155	simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156	const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157	IntrinsicInst &II, InstCombiner &IC) {
158	// Optimize _L to _LZ when _L is zero
159	if (const auto *LZMappingInfo =
160	AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
161	if (auto *ConstantLod =
162	dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
163	if (ConstantLod->isZero() \|\| ConstantLod->isNegative()) {
164	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165	AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
166	Dim: ImageDimIntr->Dim);
167	return modifyIntrinsicCall(
168	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
169	Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170	});
171	}
172	}
173	}
174
175	// Optimize _mip away, when 'lod' is zero
176	if (const auto *MIPMappingInfo =
177	AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
178	if (auto *ConstantMip =
179	dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
180	if (ConstantMip->isZero()) {
181	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182	AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
183	Dim: ImageDimIntr->Dim);
184	return modifyIntrinsicCall(
185	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
186	Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187	});
188	}
189	}
190	}
191
192	// Optimize _bias away when 'bias' is zero
193	if (const auto *BiasMappingInfo =
194	AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
195	if (auto *ConstantBias =
196	dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
197	if (ConstantBias->isZero()) {
198	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199	AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
200	Dim: ImageDimIntr->Dim);
201	return modifyIntrinsicCall(
202	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
203	Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204	ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205	});
206	}
207	}
208	}
209
210	// Optimize _offset away when 'offset' is zero
211	if (const auto *OffsetMappingInfo =
212	AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
213	if (auto *ConstantOffset =
214	dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
215	if (ConstantOffset->isZero()) {
216	const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217	AMDGPU::getImageDimIntrinsicByBaseOpcode(
218	BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
219	return modifyIntrinsicCall(
220	OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
221	Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222	});
223	}
224	}
225	}
226
227	// Try to use D16
228	if (ST->hasD16Images()) {
229
230	const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
232
233	if (BaseOpcode->HasD16) {
234
235	// If the only use of image intrinsic is a fptrunc (with conversion to
236	// half) then both fptrunc and image intrinsic will be replaced with image
237	// intrinsic with D16 flag.
238	if (II.hasOneUse()) {
239	Instruction *User = II.user_back();
240
241	if (User->getOpcode() == Instruction::FPTrunc &&
242	User->getType()->getScalarType()->isHalfTy()) {
243
244	return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
245	Func: [&](auto &Args, auto &ArgTys) {
246	// Change return type of image intrinsic.
247	// Set it to return type of fptrunc.
248	ArgTys[`0`] = User->getType();
249	});
250	}
251	}
252	}
253	}
254
255	// Try to use A16 or G16
256	if (!ST->hasA16() && !ST->hasG16())
257	return std::nullopt;
258
259	// Address is interpreted as float if the instruction has a sampler or as
260	// unsigned int if there is no sampler.
261	bool HasSampler =
262	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
263	bool FloatCoord = false;
264	// true means derivatives can be converted to 16 bit, coordinates not
265	bool OnlyDerivatives = false;
266
267	for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268	OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269	Value *Coord = II.getOperand(i_nocapture: OperandIndex);
270	// If the values are not derived from 16-bit values, we cannot optimize.
271	if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
272	if (OperandIndex < ImageDimIntr->CoordStart \|\|
273	ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274	return std::nullopt;
275	}
276	// All gradients can be converted, so convert only them
277	OnlyDerivatives = true;
278	break;
279	}
280
281	assert(OperandIndex == ImageDimIntr->GradientStart \|\|
282	FloatCoord == Coord->getType()->isFloatingPointTy());
283	FloatCoord = Coord->getType()->isFloatingPointTy();
284	}
285
286	if (!OnlyDerivatives && !ST->hasA16())
287	OnlyDerivatives = true; // Only supports G16
288
289	// Check if there is a bias parameter and if it can be converted to f16
290	if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != `0`) {
291	Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
292	assert(HasSampler &&
293	"Only image instructions with a sampler can have a bias");
294	if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
295	OnlyDerivatives = true;
296	}
297
298	if (OnlyDerivatives && (!ST->hasG16() \|\| ImageDimIntr->GradientStart ==
299	ImageDimIntr->CoordStart))
300	return std::nullopt;
301
302	Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
303	: Type::getInt16Ty(C&: II.getContext());
304
305	return modifyIntrinsicCall(
306	OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
307	ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308	if (!OnlyDerivatives) {
309	ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310
311	// Change the bias type
312	if (ImageDimIntr->NumBiasArgs != `0`)
313	ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
314	}
315
316	unsigned EndIndex =
317	OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318	for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319	OperandIndex < EndIndex; OperandIndex++) {
320	Args[OperandIndex] =
321	convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
322	}
323
324	// Convert the bias
325	if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != `0`) {
326	Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
327	Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
328	}
329	});
330	}
331
332	bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333	const Value Op0, const* Value *Op1,
334	InstCombiner &IC) const {
335	// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336	// infinity, gives +0.0. If we can prove we don't have one of the special
337	// cases then we can use a normal multiply instead.
338	// TODO: Create and use isKnownFiniteNonZero instead of just matching
339	// constants here.
340	if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) \|\|
341	match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
342	// One operand is not zero or infinity or NaN.
343	return true;
344	}
345
346	SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
347	if (isKnownNeverInfOrNaN(V: Op0, /Depth=/`0`, SQ) &&
348	isKnownNeverInfOrNaN(V: Op1, /Depth=/`0`, SQ)) {
349	// Neither operand is infinity or NaN.
350	return true;
351	}
352	return false;
353	}
354
355	/// Match an fpext from half to float, or a constant we can convert.
356	static bool matchFPExtFromF16(Value Arg, Value &FPExtSrc) {
357	if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: FPExtSrc)))))
358	return FPExtSrc->getType()->isHalfTy();
359
360	ConstantFP *CFP;
361	if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
362	bool LosesInfo;
363	APFloat Val(CFP->getValueAPF());
364	Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
365	if (LosesInfo)
366	return false;
367
368	FPExtSrc = ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
369	return true;
370	}
371
372	return false;
373	}
374
375	// Trim all zero components from the end of the vector \p UseV and return
376	// an appropriate bitset with known elements.
377	static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
378	Instruction *I) {
379	auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
380	unsigned VWidth = VTy->getNumElements();
381	APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
382
383	for (int i = VWidth - `1`; i > `0`; --i) {
384	auto *Elt = findScalarElement(V: UseV, EltNo: i);
385	if (!Elt)
386	break;
387
388	if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
389	if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
390	break;
391	} else {
392	break;
393	}
394
395	DemandedElts.clearBit(BitPosition: i);
396	}
397
398	return DemandedElts;
399	}
400
401	// Trim elements of the end of the vector \p V, if they are
402	// equal to the first element of the vector.
403	static APInt defaultComponentBroadcast(Value *V) {
404	auto *VTy = cast<FixedVectorType>(Val: V->getType());
405	unsigned VWidth = VTy->getNumElements();
406	APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
407	Value *FirstComponent = findScalarElement(V, EltNo: `0`);
408
409	SmallVector<int> ShuffleMask;
410	if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
411	SVI->getShuffleMask(Result&: ShuffleMask);
412
413	for (int I = VWidth - `1`; I > `0`; --I) {
414	if (ShuffleMask.empty()) {
415	auto *Elt = findScalarElement(V, EltNo: I);
416	if (!Elt \|\| (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
417	break;
418	} else {
419	// Detect identical elements in the shufflevector result, even though
420	// findScalarElement cannot tell us what that element is.
421	if (ShuffleMask [I] != ShuffleMask [`0`] && ShuffleMask [I] != PoisonMaskElem)
422	break;
423	}
424	DemandedElts.clearBit(BitPosition: I);
425	}
426
427	return DemandedElts;
428	}
429
430	static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
431	IntrinsicInst &II,
432	APInt DemandedElts,
433	int DMaskIdx = -`1`,
434	bool IsLoad = true);
435
436	/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
437	static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
438	return (SqrtOp->getType()->isFloatTy() &&
439	(SqrtOp->hasApproxFunc() \|\| SqrtOp->getFPAccuracy() >= `1.0f`)) \|\|
440	SqrtOp->getType()->isHalfTy();
441	}
442
443	std::optional<Instruction *>
444	GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
445	Intrinsic::ID IID = II.getIntrinsicID();
446	switch (IID) {
447	case Intrinsic::amdgcn_rcp: {
448	Value *Src = II.getArgOperand(i: `0`);
449
450	// TODO: Move to ConstantFolding/InstSimplify?
451	if (isa<UndefValue>(Val: Src)) {
452	Type *Ty = II.getType();
453	auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
454	return IC.replaceInstUsesWith(I&: II, V: QNaN);
455	}
456
457	if (II.isStrictFP())
458	break;
459
460	if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
461	const APFloat &ArgVal = C->getValueAPF();
462	APFloat Val(ArgVal.getSemantics(), `1`);
463	Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
464
465	// This is more precise than the instruction may give.
466	//
467	// TODO: The instruction always flushes denormal results (except for f16),
468	// should this also?
469	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
470	}
471
472	FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
473	if (!FMF.allowContract())
474	break;
475	auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
476	if (!SrcCI)
477	break;
478
479	auto IID = SrcCI->getIntrinsicID();
480	// llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
481	//
482	// llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
483	// relaxed.
484	if (IID == Intrinsic::amdgcn_sqrt \|\| IID == Intrinsic::sqrt) {
485	const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
486	FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
487	if (!InnerFMF.allowContract() \|\| !SrcCI->hasOneUse())
488	break;
489
490	if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
491	break;
492
493	Function *NewDecl = Intrinsic::getDeclaration(
494	M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, Tys: {SrcCI->getType()});
495
496	InnerFMF \|= FMF;
497	II.setFastMathFlags(InnerFMF);
498
499	II.setCalledFunction(NewDecl);
500	return IC.replaceOperand(I&: II, OpNum: `0`, V: SrcCI->getArgOperand(i: `0`));
501	}
502
503	break;
504	}
505	case Intrinsic::amdgcn_sqrt:
506	case Intrinsic::amdgcn_rsq: {
507	Value *Src = II.getArgOperand(i: `0`);
508
509	// TODO: Move to ConstantFolding/InstSimplify?
510	if (isa<UndefValue>(Val: Src)) {
511	Type *Ty = II.getType();
512	auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
513	return IC.replaceInstUsesWith(I&: II, V: QNaN);
514	}
515
516	// f16 amdgcn.sqrt is identical to regular sqrt.
517	if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
518	Function *NewDecl = Intrinsic::getDeclaration(
519	M: II.getModule(), id: Intrinsic::sqrt, Tys: {II.getType()});
520	II.setCalledFunction(NewDecl);
521	return &II;
522	}
523
524	break;
525	}
526	case Intrinsic::amdgcn_log:
527	case Intrinsic::amdgcn_exp2: {
528	const bool IsLog = IID == Intrinsic::amdgcn_log;
529	const bool IsExp = IID == Intrinsic::amdgcn_exp2;
530	Value *Src = II.getArgOperand(i: `0`);
531	Type *Ty = II.getType();
532
533	if (isa<PoisonValue>(Val: Src))
534	return IC.replaceInstUsesWith(I&: II, V: Src);
535
536	if (IC.getSimplifyQuery().isUndefValue(V: Src))
537	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
538
539	if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
540	if (C->isInfinity()) {
541	// exp2(+inf) -> +inf
542	// log2(+inf) -> +inf
543	if (!C->isNegative())
544	return IC.replaceInstUsesWith(I&: II, V: C);
545
546	// exp2(-inf) -> 0
547	if (IsExp && C->isNegative())
548	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
549	}
550
551	if (II.isStrictFP())
552	break;
553
554	if (C->isNaN()) {
555	Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
556	return IC.replaceInstUsesWith(I&: II, V: Quieted);
557	}
558
559	// f32 instruction doesn't handle denormals, f16 does.
560	if (C->isZero() \|\| (C->getValue().isDenormal() && Ty->isFloatTy())) {
561	Constant FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true*)
562	: ConstantFP::get(Ty, V: `1.0`);
563	return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
564	}
565
566	if (IsLog && C->isNegative())
567	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
568
569	// TODO: Full constant folding matching hardware behavior.
570	}
571
572	break;
573	}
574	case Intrinsic::amdgcn_frexp_mant:
575	case Intrinsic::amdgcn_frexp_exp: {
576	Value *Src = II.getArgOperand(i: `0`);
577	if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
578	int Exp;
579	APFloat Significand =
580	frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
581
582	if (IID == Intrinsic::amdgcn_frexp_mant) {
583	return IC.replaceInstUsesWith(
584	I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
585	}
586
587	// Match instruction special case behavior.
588	if (Exp == APFloat::IEK_NaN \|\| Exp == APFloat::IEK_Inf)
589	Exp = `0`;
590
591	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: Exp));
592	}
593
594	if (isa<UndefValue>(Val: Src)) {
595	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
596	}
597
598	break;
599	}
600	case Intrinsic::amdgcn_class: {
601	Value *Src0 = II.getArgOperand(i: `0`);
602	Value *Src1 = II.getArgOperand(i: `1`);
603	const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
604	if (CMask) {
605	II.setCalledOperand(Intrinsic::getDeclaration(
606	M: II.getModule(), id: Intrinsic::is_fpclass, Tys: Src0->getType()));
607
608	// Clamp any excess bits, as they're illegal for the generic intrinsic.
609	II.setArgOperand(i: `1`, v: ConstantInt::get(Ty: Src1->getType(),
610	V: CMask->getZExtValue() & fcAllFlags));
611	return &II;
612	}
613
614	// Propagate poison.
615	if (isa<PoisonValue>(Val: Src0) \|\| isa<PoisonValue>(Val: Src1))
616	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
617
618	// llvm.amdgcn.class(_, undef) -> false
619	if (IC.getSimplifyQuery().isUndefValue(V: Src1))
620	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
621
622	// llvm.amdgcn.class(undef, mask) -> mask != 0
623	if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
624	Value *CmpMask = IC.Builder.CreateICmpNE(
625	LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
626	return IC.replaceInstUsesWith(I&: II, V: CmpMask);
627	}
628	break;
629	}
630	case Intrinsic::amdgcn_cvt_pkrtz: {
631	Value *Src0 = II.getArgOperand(i: `0`);
632	Value *Src1 = II.getArgOperand(i: `1`);
633	if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
634	if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
635	const fltSemantics &HalfSem =
636	II.getType()->getScalarType()->getFltSemantics();
637	bool LosesInfo;
638	APFloat Val0 = C0->getValueAPF();
639	APFloat Val1 = C1->getValueAPF();
640	Val0.convert(ToSemantics: HalfSem, RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
641	Val1.convert(ToSemantics: HalfSem, RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
642
643	Constant *Folded =
644	ConstantVector::get(V: {ConstantFP::get(Context&: II.getContext(), V: Val0),
645	ConstantFP::get(Context&: II.getContext(), V: Val1)});
646	return IC.replaceInstUsesWith(I&: II, V: Folded);
647	}
648	}
649
650	if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
651	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
652	}
653
654	break;
655	}
656	case Intrinsic::amdgcn_cvt_pknorm_i16:
657	case Intrinsic::amdgcn_cvt_pknorm_u16:
658	case Intrinsic::amdgcn_cvt_pk_i16:
659	case Intrinsic::amdgcn_cvt_pk_u16: {
660	Value *Src0 = II.getArgOperand(i: `0`);
661	Value *Src1 = II.getArgOperand(i: `1`);
662
663	if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
664	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
665	}
666
667	break;
668	}
669	case Intrinsic::amdgcn_ubfe:
670	case Intrinsic::amdgcn_sbfe: {
671	// Decompose simple cases into standard shifts.
672	Value *Src = II.getArgOperand(i: `0`);
673	if (isa<UndefValue>(Val: Src)) {
674	return IC.replaceInstUsesWith(I&: II, V: Src);
675	}
676
677	unsigned Width;
678	Type *Ty = II.getType();
679	unsigned IntSize = Ty->getIntegerBitWidth();
680
681	ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `2`));
682	if (CWidth) {
683	Width = CWidth->getZExtValue();
684	if ((Width & (IntSize - `1`)) == `0`) {
685	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
686	}
687
688	// Hardware ignores high bits, so remove those.
689	if (Width >= IntSize) {
690	return IC.replaceOperand(
691	I&: II, OpNum: `2`, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - `1`)));
692	}
693	}
694
695	unsigned Offset;
696	ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `1`));
697	if (COffset) {
698	Offset = COffset->getZExtValue();
699	if (Offset >= IntSize) {
700	return IC.replaceOperand(
701	I&: II, OpNum: `1`,
702	V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - `1`)));
703	}
704	}
705
706	bool Signed = IID == Intrinsic::amdgcn_sbfe;
707
708	if (!CWidth \|\| !COffset)
709	break;
710
711	// The case of Width == 0 is handled above, which makes this transformation
712	// safe. If Width == 0, then the ashr and lshr instructions become poison
713	// value since the shift amount would be equal to the bit size.
714	assert(Width != `0`);
715
716	// TODO: This allows folding to undef when the hardware has specific
717	// behavior?
718	if (Offset + Width < IntSize) {
719	Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
720	Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
721	: IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
722	RightShift->takeName(V: &II);
723	return IC.replaceInstUsesWith(I&: II, V: RightShift);
724	}
725
726	Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
727	: IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
728
729	RightShift->takeName(V: &II);
730	return IC.replaceInstUsesWith(I&: II, V: RightShift);
731	}
732	case Intrinsic::amdgcn_exp:
733	case Intrinsic::amdgcn_exp_row:
734	case Intrinsic::amdgcn_exp_compr: {
735	ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: `1`));
736	unsigned EnBits = En->getZExtValue();
737	if (EnBits == `0xf`)
738	break; // All inputs enabled.
739
740	bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
741	bool Changed = false;
742	for (int I = `0`; I < (IsCompr ? `2` : `4`); ++I) {
743	if ((!IsCompr && (EnBits & (`1` << I)) == `0`) \|\|
744	(IsCompr && ((EnBits & (`0x3` << (`2` * I))) == `0`))) {
745	Value *Src = II.getArgOperand(i: I + `2`);
746	if (!isa<UndefValue>(Val: Src)) {
747	IC.replaceOperand(I&: II, OpNum: I + `2`, V: UndefValue::get(T: Src->getType()));
748	Changed = true;
749	}
750	}
751	}
752
753	if (Changed) {
754	return &II;
755	}
756
757	break;
758	}
759	case Intrinsic::amdgcn_fmed3: {
760	// Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
761	// for the shader.
762
763	Value *Src0 = II.getArgOperand(i: `0`);
764	Value *Src1 = II.getArgOperand(i: `1`);
765	Value *Src2 = II.getArgOperand(i: `2`);
766
767	// Checking for NaN before canonicalization provides better fidelity when
768	// mapping other operations onto fmed3 since the order of operands is
769	// unchanged.
770	Value V = nullptr*;
771	if (match(V: Src0, P: PatternMatch::m_NaN()) \|\| isa<UndefValue>(Val: Src0)) {
772	V = IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
773	} else if (match(V: Src1, P: PatternMatch::m_NaN()) \|\| isa<UndefValue>(Val: Src1)) {
774	V = IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
775	} else if (match(V: Src2, P: PatternMatch::m_NaN()) \|\| isa<UndefValue>(Val: Src2)) {
776	V = IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1);
777	}
778
779	if (V) {
780	if (auto *CI = dyn_cast<CallInst>(Val: V)) {
781	CI->copyFastMathFlags(I: &II);
782	CI->takeName(V: &II);
783	}
784	return IC.replaceInstUsesWith(I&: II, V);
785	}
786
787	bool Swap = false;
788	// Canonicalize constants to RHS operands.
789	//
790	// fmed3(c0, x, c1) -> fmed3(x, c0, c1)
791	if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
792	std::swap(a&: Src0, b&: Src1);
793	Swap = true;
794	}
795
796	if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
797	std::swap(a&: Src1, b&: Src2);
798	Swap = true;
799	}
800
801	if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
802	std::swap(a&: Src0, b&: Src1);
803	Swap = true;
804	}
805
806	if (Swap) {
807	II.setArgOperand(i: `0`, v: Src0);
808	II.setArgOperand(i: `1`, v: Src1);
809	II.setArgOperand(i: `2`, v: Src2);
810	return &II;
811	}
812
813	if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
814	if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
815	if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
816	APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
817	Src2: C2->getValueAPF());
818	return IC.replaceInstUsesWith(
819	I&: II, V: ConstantFP::get(Context&: IC.Builder.getContext(), V: Result));
820	}
821	}
822	}
823
824	if (!ST->hasMed3_16())
825	break;
826
827	Value X, Y, *Z;
828
829	// Repeat floating-point width reduction done for minnum/maxnum.
830	// fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
831	if (matchFPExtFromF16(Arg: Src0, FPExtSrc&: X) && matchFPExtFromF16(Arg: Src1, FPExtSrc&: Y) &&
832	matchFPExtFromF16(Arg: Src2, FPExtSrc&: Z)) {
833	Value *NewCall = IC.Builder.CreateIntrinsic(ID: IID, Types: {X->getType()},
834	Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
835	return new FPExtInst (NewCall, II.getType());
836	}
837
838	break;
839	}
840	case Intrinsic::amdgcn_icmp:
841	case Intrinsic::amdgcn_fcmp: {
842	const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: `2`));
843	// Guard against invalid arguments.
844	int64_t CCVal = CC->getZExtValue();
845	bool IsInteger = IID == Intrinsic::amdgcn_icmp;
846	if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE \|\|
847	CCVal > CmpInst::LAST_ICMP_PREDICATE)) \|\|
848	(!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE \|\|
849	CCVal > CmpInst::LAST_FCMP_PREDICATE)))
850	break;
851
852	Value *Src0 = II.getArgOperand(i: `0`);
853	Value *Src1 = II.getArgOperand(i: `1`);
854
855	if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
856	if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
857	Constant *CCmp = ConstantFoldCompareInstOperands(
858	Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
859	if (CCmp && CCmp->isNullValue()) {
860	return IC.replaceInstUsesWith(
861	I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
862	}
863
864	// The result of V_ICMP/V_FCMP assembly instructions (which this
865	// intrinsic exposes) is one bit per thread, masked with the EXEC
866	// register (which contains the bitmask of live threads). So a
867	// comparison that always returns true is the same as a read of the
868	// EXEC register.
869	Function *NewF = Intrinsic::getDeclaration(
870	M: II.getModule(), id: Intrinsic::read_register, Tys: II.getType());
871	Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
872	MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
873	Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
874	CallInst *NewCall = IC.Builder.CreateCall(Callee: NewF, Args);
875	NewCall->addFnAttr(Kind: Attribute::Convergent);
876	NewCall->takeName(V: &II);
877	return IC.replaceInstUsesWith(I&: II, V: NewCall);
878	}
879
880	// Canonicalize constants to RHS.
881	CmpInst::Predicate SwapPred =
882	CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
883	II.setArgOperand(i: `0`, v: Src1);
884	II.setArgOperand(i: `1`, v: Src0);
885	II.setArgOperand(
886	i: `2`, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
887	return &II;
888	}
889
890	if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
891	break;
892
893	// Canonicalize compare eq with true value to compare != 0
894	// llvm.amdgcn.icmp(zext (i1 x), 1, eq)
895	// -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
896	// llvm.amdgcn.icmp(sext (i1 x), -1, eq)
897	// -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
898	Value *ExtSrc;
899	if (CCVal == CmpInst::ICMP_EQ &&
900	((match(V: Src1, P: PatternMatch::m_One()) &&
901	match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) \|\|
902	(match(V: Src1, P: PatternMatch::m_AllOnes()) &&
903	match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
904	ExtSrc->getType()->isIntegerTy(Bitwidth: `1`)) {
905	IC.replaceOperand(I&: II, OpNum: `1`, V: ConstantInt::getNullValue(Ty: Src1->getType()));
906	IC.replaceOperand(I&: II, OpNum: `2`,
907	V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
908	return &II;
909	}
910
911	CmpInst::Predicate SrcPred;
912	Value *SrcLHS;
913	Value *SrcRHS;
914
915	// Fold compare eq/ne with 0 from a compare result as the predicate to the
916	// intrinsic. The typical use is a wave vote function in the library, which
917	// will be fed from a user code condition compared with 0. Fold in the
918	// redundant compare.
919
920	// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
921	// -> llvm.amdgcn.[if]cmp(a, b, pred)
922	//
923	// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
924	// -> llvm.amdgcn.[if]cmp(a, b, inv pred)
925	if (match(V: Src1, P: PatternMatch::m_Zero()) &&
926	match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
927	Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
928	R: PatternMatch::m_Value(V&: SrcRHS))))) {
929	if (CCVal == CmpInst::ICMP_EQ)
930	SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
931
932	Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
933	? Intrinsic::amdgcn_fcmp
934	: Intrinsic::amdgcn_icmp;
935
936	Type *Ty = SrcLHS->getType();
937	if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
938	// Promote to next legal integer type.
939	unsigned Width = CmpType->getBitWidth();
940	unsigned NewWidth = Width;
941
942	// Don't do anything for i1 comparisons.
943	if (Width == `1`)
944	break;
945
946	if (Width <= `16`)
947	NewWidth = `16`;
948	else if (Width <= `32`)
949	NewWidth = `32`;
950	else if (Width <= `64`)
951	NewWidth = `64`;
952	else
953	break; // Can't handle this.
954
955	if (Width != NewWidth) {
956	IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
957	if (CmpInst::isSigned(predicate: SrcPred)) {
958	SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
959	SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
960	} else {
961	SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
962	SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
963	}
964	}
965	} else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
966	break;
967
968	Function *NewF = Intrinsic::getDeclaration(
969	M: II.getModule(), id: NewIID, Tys: {II.getType(), SrcLHS->getType()});
970	Value *Args[] = {SrcLHS, SrcRHS,
971	ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
972	CallInst *NewCall = IC.Builder.CreateCall(Callee: NewF, Args);
973	NewCall->takeName(V: &II);
974	return IC.replaceInstUsesWith(I&: II, V: NewCall);
975	}
976
977	break;
978	}
979	case Intrinsic::amdgcn_mbcnt_hi: {
980	// exec_hi is all 0, so this is just a copy.
981	if (ST->isWave32())
982	return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: `1`));
983	break;
984	}
985	case Intrinsic::amdgcn_ballot: {
986	if (auto *Src = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `0`))) {
987	if (Src->isZero()) {
988	// amdgcn.ballot(i1 0) is zero.
989	return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
990	}
991	}
992	if (ST->isWave32() && II.getType()->getIntegerBitWidth() == `64`) {
993	// %b64 = call i64 ballot.i64(...)
994	// =>
995	// %b32 = call i32 ballot.i32(...)
996	// %b64 = zext i32 %b32 to i64
997	Value *Call = IC.Builder.CreateZExt(
998	V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
999	Types: {IC.Builder.getInt32Ty()},
1000	Args: {II.getArgOperand(i: `0`)}),
1001	DestTy: II.getType());
1002	Call->takeName(V: &II);
1003	return IC.replaceInstUsesWith(I&: II, V: Call);
1004	}
1005	break;
1006	}
1007	case Intrinsic::amdgcn_wqm_vote: {
1008	// wqm_vote is identity when the argument is constant.
1009	if (!isa<Constant>(Val: II.getArgOperand(i: `0`)))
1010	break;
1011
1012	return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: `0`));
1013	}
1014	case Intrinsic::amdgcn_kill: {
1015	const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: `0`));
1016	if (!C \|\| !C->getZExtValue())
1017	break;
1018
1019	// amdgcn.kill(i1 1) is a no-op
1020	return IC.eraseInstFromFunction(I&: II);
1021	}
1022	case Intrinsic::amdgcn_update_dpp: {
1023	Value *Old = II.getArgOperand(i: `0`);
1024
1025	auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: `5`));
1026	auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: `3`));
1027	auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: `4`));
1028	if (BC->isZeroValue() \|\| RM->getZExtValue() != `0xF` \|\|
1029	BM->getZExtValue() != `0xF` \|\| isa<UndefValue>(Val: Old))
1030	break;
1031
1032	// If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1033	return IC.replaceOperand(I&: II, OpNum: `0`, V: UndefValue::get(T: Old->getType()));
1034	}
1035	case Intrinsic::amdgcn_permlane16:
1036	case Intrinsic::amdgcn_permlane16_var:
1037	case Intrinsic::amdgcn_permlanex16:
1038	case Intrinsic::amdgcn_permlanex16_var: {
1039	// Discard vdst_in if it's not going to be read.
1040	Value *VDstIn = II.getArgOperand(i: `0`);
1041	if (isa<UndefValue>(Val: VDstIn))
1042	break;
1043
1044	// FetchInvalid operand idx.
1045	unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 \|\|
1046	IID == Intrinsic::amdgcn_permlanex16)
1047	? `4` / for permlane16 and permlanex16 /
1048	: `3`; / for permlane16_var and permlanex16_var /
1049
1050	// BoundCtrl operand idx.
1051	// For permlane16 and permlanex16 it should be 5
1052	// For Permlane16_var and permlanex16_var it should be 4
1053	unsigned int BcIdx = FiIdx + `1`;
1054
1055	ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1056	ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1057	if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1058	break;
1059
1060	return IC.replaceOperand(I&: II, OpNum: `0`, V: UndefValue::get(T: VDstIn->getType()));
1061	}
1062	case Intrinsic::amdgcn_permlane64:
1063	// A constant value is trivially uniform.
1064	if (Constant *C = dyn_cast<Constant>(Val: II.getArgOperand(i: `0`))) {
1065	return IC.replaceInstUsesWith(I&: II, V: C);
1066	}
1067	break;
1068	case Intrinsic::amdgcn_readfirstlane:
1069	case Intrinsic::amdgcn_readlane: {
1070	// A constant value is trivially uniform.
1071	if (Constant *C = dyn_cast<Constant>(Val: II.getArgOperand(i: `0`))) {
1072	return IC.replaceInstUsesWith(I&: II, V: C);
1073	}
1074
1075	// The rest of these may not be safe if the exec may not be the same between
1076	// the def and use.
1077	Value *Src = II.getArgOperand(i: `0`);
1078	Instruction *SrcInst = dyn_cast<Instruction>(Val: Src);
1079	if (SrcInst && SrcInst->getParent() != II.getParent())
1080	break;
1081
1082	// readfirstlane (readfirstlane x) -> readfirstlane x
1083	// readlane (readfirstlane x), y -> readfirstlane x
1084	if (match(V: Src,
1085	P: PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1086	return IC.replaceInstUsesWith(I&: II, V: Src);
1087	}
1088
1089	if (IID == Intrinsic::amdgcn_readfirstlane) {
1090	// readfirstlane (readlane x, y) -> readlane x, y
1091	if (match(V: Src, P: PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1092	return IC.replaceInstUsesWith(I&: II, V: Src);
1093	}
1094	} else {
1095	// readlane (readlane x, y), y -> readlane x, y
1096	if (match(V: Src, P: PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1097	Op0: PatternMatch::m_Value(),
1098	Op1: PatternMatch::m_Specific(V: II.getArgOperand(i: `1`))))) {
1099	return IC.replaceInstUsesWith(I&: II, V: Src);
1100	}
1101	}
1102
1103	break;
1104	}
1105	case Intrinsic::amdgcn_trig_preop: {
1106	// The intrinsic is declared with name mangling, but currently the
1107	// instruction only exists for f64
1108	if (!II.getType()->isDoubleTy())
1109	break;
1110
1111	Value *Src = II.getArgOperand(i: `0`);
1112	Value *Segment = II.getArgOperand(i: `1`);
1113	if (isa<PoisonValue>(Val: Src) \|\| isa<PoisonValue>(Val: Segment))
1114	return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1115
1116	if (isa<UndefValue>(Val: Src)) {
1117	auto *QNaN = ConstantFP::get(
1118	Ty: II.getType(), V: APFloat::getQNaN(Sem: II.getType()->getFltSemantics()));
1119	return IC.replaceInstUsesWith(I&: II, V: QNaN);
1120	}
1121
1122	const ConstantFP *Csrc = dyn_cast<ConstantFP>(Val: Src);
1123	if (!Csrc)
1124	break;
1125
1126	if (II.isStrictFP())
1127	break;
1128
1129	const APFloat &Fsrc = Csrc->getValueAPF();
1130	if (Fsrc.isNaN()) {
1131	auto *Quieted = ConstantFP::get(Ty: II.getType(), V: Fsrc.makeQuiet());
1132	return IC.replaceInstUsesWith(I&: II, V: Quieted);
1133	}
1134
1135	const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
1136	if (!Cseg)
1137	break;
1138
1139	unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> `52`) & `0x7ff`;
1140	unsigned SegmentVal = Cseg->getValue().trunc(width: `5`).getZExtValue();
1141	unsigned Shift = SegmentVal * `53`;
1142	if (Exponent > `1077`)
1143	Shift += Exponent - `1077`;
1144
1145	// 2.0/PI table.
1146	static const uint32_t TwoByPi[] = {
1147	`0xa2f9836e`, `0x4e441529`, `0xfc2757d1`, `0xf534ddc0`, `0xdb629599`, `0x3c439041`,
1148	`0xfe5163ab`, `0xdebbc561`, `0xb7246e3a`, `0x424dd2e0`, `0x06492eea`, `0x09d1921c`,
1149	`0xfe1deb1c`, `0xb129a73e`, `0xe88235f5`, `0x2ebb4484`, `0xe99c7026`, `0xb45f7e41`,
1150	`0x3991d639`, `0x835339f4`, `0x9c845f8b`, `0xbdf9283b`, `0x1ff897ff`, `0xde05980f`,
1151	`0xef2f118b`, `0x5a0a6d1f`, `0x6d367ecf`, `0x27cb09b7`, `0x4f463f66`, `0x9e5fea2d`,
1152	`0x7527bac7`, `0xebe5f17b`, `0x3d0739f7`, `0x8a5292ea`, `0x6bfb5fb1`, `0x1f8d5d08`,
1153	`0x56033046`};
1154
1155	// Return 0 for outbound segment (hardware behavior).
1156	unsigned Idx = Shift >> `5`;
1157	if (Idx + `2` >= std::size(TwoByPi)) {
1158	APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
1159	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
1160	}
1161
1162	unsigned BShift = Shift & `0x1f`;
1163	uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + `1`]);
1164	uint64_t Tlo = Make_64(High: TwoByPi[Idx + `2`], Low: `0`);
1165	if (BShift)
1166	Thi = (Thi << BShift) \| (Tlo >> (`64` - BShift));
1167	Thi = Thi >> `11`;
1168	APFloat Result = APFloat ((double)Thi);
1169
1170	int Scale = -`53` - Shift;
1171	if (Exponent >= `1968`)
1172	Scale += `128`;
1173
1174	Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
1175	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
1176	}
1177	case Intrinsic::amdgcn_fmul_legacy: {
1178	Value *Op0 = II.getArgOperand(i: `0`);
1179	Value *Op1 = II.getArgOperand(i: `1`);
1180
1181	// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1182	// infinity, gives +0.0.
1183	// TODO: Move to InstSimplify?
1184	if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) \|\|
1185	match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
1186	return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
1187
1188	// If we can prove we don't have one of the special cases then we can use a
1189	// normal fmul instruction instead.
1190	if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1191	auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
1192	FMul->takeName(V: &II);
1193	return IC.replaceInstUsesWith(I&: II, V: FMul);
1194	}
1195	break;
1196	}
1197	case Intrinsic::amdgcn_fma_legacy: {
1198	Value *Op0 = II.getArgOperand(i: `0`);
1199	Value *Op1 = II.getArgOperand(i: `1`);
1200	Value *Op2 = II.getArgOperand(i: `2`);
1201
1202	// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1203	// infinity, gives +0.0.
1204	// TODO: Move to InstSimplify?
1205	if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) \|\|
1206	match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
1207	// It's tempting to just return Op2 here, but that would give the wrong
1208	// result if Op2 was -0.0.
1209	auto *Zero = ConstantFP::getZero(Ty: II.getType());
1210	auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
1211	FAdd->takeName(V: &II);
1212	return IC.replaceInstUsesWith(I&: II, V: FAdd);
1213	}
1214
1215	// If we can prove we don't have one of the special cases then we can use a
1216	// normal fma instead.
1217	if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1218	II.setCalledOperand(Intrinsic::getDeclaration(
1219	M: II.getModule(), id: Intrinsic::fma, Tys: II.getType()));
1220	return &II;
1221	}
1222	break;
1223	}
1224	case Intrinsic::amdgcn_is_shared:
1225	case Intrinsic::amdgcn_is_private: {
1226	if (isa<UndefValue>(Val: II.getArgOperand(i: `0`)))
1227	return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1228
1229	if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: `0`)))
1230	return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
1231	break;
1232	}
1233	case Intrinsic::amdgcn_raw_buffer_store_format:
1234	case Intrinsic::amdgcn_struct_buffer_store_format:
1235	case Intrinsic::amdgcn_raw_tbuffer_store:
1236	case Intrinsic::amdgcn_struct_tbuffer_store:
1237	case Intrinsic::amdgcn_image_store_1d:
1238	case Intrinsic::amdgcn_image_store_1darray:
1239	case Intrinsic::amdgcn_image_store_2d:
1240	case Intrinsic::amdgcn_image_store_2darray:
1241	case Intrinsic::amdgcn_image_store_2darraymsaa:
1242	case Intrinsic::amdgcn_image_store_2dmsaa:
1243	case Intrinsic::amdgcn_image_store_3d:
1244	case Intrinsic::amdgcn_image_store_cube:
1245	case Intrinsic::amdgcn_image_store_mip_1d:
1246	case Intrinsic::amdgcn_image_store_mip_1darray:
1247	case Intrinsic::amdgcn_image_store_mip_2d:
1248	case Intrinsic::amdgcn_image_store_mip_2darray:
1249	case Intrinsic::amdgcn_image_store_mip_3d:
1250	case Intrinsic::amdgcn_image_store_mip_cube: {
1251	if (!isa<FixedVectorType>(Val: II.getArgOperand(i: `0`)->getType()))
1252	break;
1253
1254	APInt DemandedElts;
1255	if (ST->hasDefaultComponentBroadcast())
1256	DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: `0`));
1257	else if (ST->hasDefaultComponentZero())
1258	DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: `0`), I: &II);
1259	else
1260	break;
1261
1262	int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? `1` : -`1`;
1263	if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1264	IsLoad: false)) {
1265	return IC.eraseInstFromFunction(I&: II);
1266	}
1267
1268	break;
1269	}
1270	}
1271	if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1272	AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1273	return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1274	}
1275	return std::nullopt;
1276	}
1277
1278	/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1279	///
1280	/// The result of simplifying amdgcn image and buffer store intrinsics is updating
1281	/// definitions of the intrinsics vector argument, not Uses of the result like
1282	/// image and buffer loads.
1283	/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1284	/// struct returns.
1285	static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1286	IntrinsicInst &II,
1287	APInt DemandedElts,
1288	int DMaskIdx, bool IsLoad) {
1289
1290	auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
1291	: II.getOperand(i_nocapture: `0`)->getType());
1292	unsigned VWidth = IIVTy->getNumElements();
1293	if (VWidth == `1`)
1294	return nullptr;
1295	Type *EltTy = IIVTy->getElementType();
1296
1297	IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1298	IC.Builder.SetInsertPoint(&II);
1299
1300	// Assume the arguments are unchanged and later override them, if needed.
1301	SmallVector<Value *, `16`> Args(II.args());
1302
1303	if (DMaskIdx < `0`) {
1304	// Buffer case.
1305
1306	const unsigned ActiveBits = DemandedElts.getActiveBits();
1307	const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1308
1309	// Start assuming the prefix of elements is demanded, but possibly clear
1310	// some other bits if there are trailing zeros (unused components at front)
1311	// and update offset.
1312	DemandedElts = (`1` << ActiveBits) - `1`;
1313
1314	if (UnusedComponentsAtFront > `0`) {
1315	static const unsigned InvalidOffsetIdx = `0xf`;
1316
1317	unsigned OffsetIdx;
1318	switch (II.getIntrinsicID()) {
1319	case Intrinsic::amdgcn_raw_buffer_load:
1320	case Intrinsic::amdgcn_raw_ptr_buffer_load:
1321	OffsetIdx = `1`;
1322	break;
1323	case Intrinsic::amdgcn_s_buffer_load:
1324	// If resulting type is vec3, there is no point in trimming the
1325	// load with updated offset, as the vec3 would most likely be widened to
1326	// vec4 anyway during lowering.
1327	if (ActiveBits == `4` && UnusedComponentsAtFront == `1`)
1328	OffsetIdx = InvalidOffsetIdx;
1329	else
1330	OffsetIdx = `1`;
1331	break;
1332	case Intrinsic::amdgcn_struct_buffer_load:
1333	case Intrinsic::amdgcn_struct_ptr_buffer_load:
1334	OffsetIdx = `2`;
1335	break;
1336	default:
1337	// TODO: handle tbuffer intrinsics.*
1338	OffsetIdx = InvalidOffsetIdx;
1339	break;
1340	}
1341
1342	if (OffsetIdx != InvalidOffsetIdx) {
1343	// Clear demanded bits and update the offset.
1344	DemandedElts &= ~((`1` << UnusedComponentsAtFront) - `1`);
1345	auto *Offset = Args [OffsetIdx];
1346	unsigned SingleComponentSizeInBits =
1347	IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
1348	unsigned OffsetAdd =
1349	UnusedComponentsAtFront * SingleComponentSizeInBits / `8`;
1350	auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
1351	Args [OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
1352	}
1353	}
1354	} else {
1355	// Image case.
1356
1357	ConstantInt *DMask = cast<ConstantInt>(Val: Args [DMaskIdx]);
1358	unsigned DMaskVal = DMask->getZExtValue() & `0xf`;
1359
1360	// dmask 0 has special semantics, do not simplify.
1361	if (DMaskVal == `0`)
1362	return nullptr;
1363
1364	// Mask off values that are undefined because the dmask doesn't cover them
1365	DemandedElts &= (`1` << llvm::popcount(Value: DMaskVal)) - `1`;
1366
1367	unsigned NewDMaskVal = `0`;
1368	unsigned OrigLdStIdx = `0`;
1369	for (unsigned SrcIdx = `0`; SrcIdx < `4`; ++SrcIdx) {
1370	const unsigned Bit = `1` << SrcIdx;
1371	if (!!(DMaskVal & Bit)) {
1372	if (!!DemandedElts [OrigLdStIdx])
1373	NewDMaskVal \|= Bit;
1374	OrigLdStIdx++;
1375	}
1376	}
1377
1378	if (DMaskVal != NewDMaskVal)
1379	Args [DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
1380	}
1381
1382	unsigned NewNumElts = DemandedElts.popcount();
1383	if (!NewNumElts)
1384	return PoisonValue::get(T: IIVTy);
1385
1386	if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1387	if (DMaskIdx >= `0`)
1388	II.setArgOperand(i: DMaskIdx, v: Args [DMaskIdx]);
1389	return nullptr;
1390	}
1391
1392	// Validate function argument and return types, extracting overloaded types
1393	// along the way.
1394	SmallVector<Type *, `6`> OverloadTys;
1395	if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
1396	return nullptr;
1397
1398	Type *NewTy =
1399	(NewNumElts == `1`) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
1400	OverloadTys [`0`] = NewTy;
1401
1402	if (!IsLoad) {
1403	SmallVector<int, `8`> EltMask;
1404	for (unsigned OrigStoreIdx = `0`; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1405	if (DemandedElts [OrigStoreIdx])
1406	EltMask.push_back(Elt: OrigStoreIdx);
1407
1408	if (NewNumElts == `1`)
1409	Args [`0`] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: `0`), Idx: EltMask [`0`]);
1410	else
1411	Args [`0`] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: `0`), Mask: EltMask);
1412	}
1413
1414	Function *NewIntrin = Intrinsic::getDeclaration(
1415	M: II.getModule(), id: II.getIntrinsicID(), Tys: OverloadTys);
1416	CallInst *NewCall = IC.Builder.CreateCall(Callee: NewIntrin, Args);
1417	NewCall->takeName(V: &II);
1418	NewCall->copyMetadata(SrcInst: II);
1419
1420	if (IsLoad) {
1421	if (NewNumElts == `1`) {
1422	return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
1423	Idx: DemandedElts.countr_zero());
1424	}
1425
1426	SmallVector<int, `8`> EltMask;
1427	unsigned NewLoadIdx = `0`;
1428	for (unsigned OrigLoadIdx = `0`; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1429	if (!!DemandedElts [OrigLoadIdx])
1430	EltMask.push_back(Elt: NewLoadIdx++);
1431	else
1432	EltMask.push_back(Elt: NewNumElts);
1433	}
1434
1435	auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
1436
1437	return Shuffle;
1438	}
1439
1440	return NewCall;
1441	}
1442
1443	std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1444	InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1445	APInt &UndefElts2, APInt &UndefElts3,
1446	std::function<void(Instruction , unsigned*, APInt, APInt &)>
1447	SimplifyAndSetOp) const {
1448	switch (II.getIntrinsicID()) {
1449	case Intrinsic::amdgcn_raw_buffer_load:
1450	case Intrinsic::amdgcn_raw_ptr_buffer_load:
1451	case Intrinsic::amdgcn_raw_buffer_load_format:
1452	case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1453	case Intrinsic::amdgcn_raw_tbuffer_load:
1454	case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1455	case Intrinsic::amdgcn_s_buffer_load:
1456	case Intrinsic::amdgcn_struct_buffer_load:
1457	case Intrinsic::amdgcn_struct_ptr_buffer_load:
1458	case Intrinsic::amdgcn_struct_buffer_load_format:
1459	case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1460	case Intrinsic::amdgcn_struct_tbuffer_load:
1461	case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1462	return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1463	default: {
1464	if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
1465	return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: `0`);
1466	}
1467	break;
1468	}
1469	}
1470	return std::nullopt;
1471	}
1472

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp