1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "llvm/ADT/FloatingPointMode.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
31namespace {
32
33struct AMDGPUImageDMaskIntrinsic {
34 unsigned Intr;
35};
36
37#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
38#include "InstCombineTables.inc"
39
40} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
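// For example, fmed3(1.0, 3.0, 2.0) folds to 2.0: the result is the maximum
// of the two operands that are not the overall maximum, i.e. the median.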
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(A: Src1, B: Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(A: Src0, B: Src2);
59
60 return maxnum(A: Src0, B: Src1);
61}
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
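// For example, (fpext half %x to float), (zext i16 %x to i32), or a constant
// that is exactly representable in 16 bits can all be narrowed safely; values
// that are already 16 bits wide are rejected, since there is nothing to narrow.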
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(Bitwidth: 16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
79 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
80 losesInfo: &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
94 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(Bitwidth: 16))
98 return true;
99 }
100
101 return false;
102}
103
104// Convert a value to 16-bit.
105static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
106 Type *VTy = V.getType();
107 if (isa<FPExtInst, SExtInst, ZExtInst>(Val: &V))
108 return cast<Instruction>(Val: &V)->getOperand(i: 0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
117/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
118/// modified arguments (based on OldIntr) and replaces InstToReplace with
119/// this newly created intrinsic call.
120static std::optional<Instruction *> modifyIntrinsicCall(
121 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
122 InstCombiner &IC,
123 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
124 Func) {
125 SmallVector<Type *, 4> ArgTys;
126 if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
127 return std::nullopt;
128
129 SmallVector<Value *, 8> Args(OldIntr.args());
130
131 // Modify arguments and types
132 Func(Args, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: NewIntr, Types: ArgTys, Args);
135 NewCall->takeName(V: &OldIntr);
136 NewCall->copyMetadata(SrcInst: OldIntr);
137 if (isa<FPMathOperator>(Val: NewCall))
138 NewCall->copyFastMathFlags(I: &OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
143
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto *RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(I&: OldIntr);
149
150 return RetValue;
151}
152
153static std::optional<Instruction *>
154simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
155 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
156 IntrinsicInst &II, InstCombiner &IC) {
157 // Optimize _L to _LZ when _L is zero
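 // For example, llvm.amdgcn.image.sample.l.2d with a constant lod of 0.0
 // becomes llvm.amdgcn.image.sample.lz.2d with the lod operand removed.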
158 if (const auto *LZMappingInfo =
159 AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
160 if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
164 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
165 Dim: ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
173
174 // Optimize _mip away, when 'lod' is zero
175 if (const auto *MIPMappingInfo =
176 AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
177 if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
182 Dim: ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
193 AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
194 if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
199 Dim: ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
211 AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
212 if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
216 AMDGPU::getImageDimIntrinsicByBaseOpcode(
217 BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
230 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
231
232 if (BaseOpcode->HasD16) {
233
234 // If the only use of image intrinsic is a fptrunc (with conversion to
235 // half) then both fptrunc and image intrinsic will be replaced with image
236 // intrinsic with D16 flag.
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
240 if (User->getOpcode() == Instruction::FPTrunc &&
241 User->getType()->getScalarType()->isHalfTy()) {
242
243 return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
244 Func: [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251
252 // Only perform D16 folding if every user of the image sample is
253 // an ExtractElementInst immediately followed by an FPTrunc to half.
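 // For example, a <4 x float> sample whose elements are each extracted and
 // fptrunc'd to half is rewritten as a <4 x half> sample followed by plain
 // extractelement instructions.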
254 SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
255 ExtractTruncPairs;
256 bool AllHalfExtracts = true;
257
258 for (User *U : II.users()) {
259 auto *Ext = dyn_cast<ExtractElementInst>(Val: U);
260 if (!Ext || !Ext->hasOneUse()) {
261 AllHalfExtracts = false;
262 break;
263 }
264
265 auto *Tr = dyn_cast<FPTruncInst>(Val: *Ext->user_begin());
266 if (!Tr || !Tr->getType()->isHalfTy()) {
267 AllHalfExtracts = false;
268 break;
269 }
270
271 ExtractTruncPairs.emplace_back(Args&: Ext, Args&: Tr);
272 }
273
274 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275 auto *VecTy = cast<VectorType>(Val: II.getType());
276 Type *HalfVecTy =
277 VecTy->getWithNewType(EltTy: Type::getHalfTy(C&: II.getContext()));
278
279 // Obtain the original image sample intrinsic's signature
280 // and replace its return type with the half-vector for D16 folding
281 SmallVector<Type *, 8> SigTys;
282 Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: SigTys);
283 SigTys[0] = HalfVecTy;
284
285 Module *M = II.getModule();
286 Function *HalfDecl =
287 Intrinsic::getOrInsertDeclaration(M, id: ImageDimIntr->Intr, Tys: SigTys);
288
289 II.mutateType(Ty: HalfVecTy);
290 II.setCalledFunction(HalfDecl);
291
292 IRBuilder<> Builder(II.getContext());
293 for (auto &[Ext, Tr] : ExtractTruncPairs) {
294 Value *Idx = Ext->getIndexOperand();
295
296 Builder.SetInsertPoint(Tr);
297
298 Value *HalfExtract = Builder.CreateExtractElement(Vec: &II, Idx);
299 HalfExtract->takeName(V: Tr);
300
301 Tr->replaceAllUsesWith(V: HalfExtract);
302 }
303
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 IC.eraseInstFromFunction(I&: *Tr);
306 IC.eraseInstFromFunction(I&: *Ext);
307 }
308
309 return &II;
310 }
311 }
312 }
313
314 // Try to use A16 or G16
315 if (!ST->hasA16() && !ST->hasG16())
316 return std::nullopt;
317
318 // Address is interpreted as float if the instruction has a sampler or as
319 // unsigned int if there is no sampler.
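 // For example, coordinates produced by fpext from half (or constants exactly
 // representable in half) can be passed as f16 directly via A16; if only the
 // gradients qualify, G16 is used for them alone.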
320 bool HasSampler =
321 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
322 bool FloatCoord = false;
323 // true means derivatives can be converted to 16 bit, coordinates not
324 bool OnlyDerivatives = false;
325
326 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
327 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
328 Value *Coord = II.getOperand(i_nocapture: OperandIndex);
329 // If the values are not derived from 16-bit values, we cannot optimize.
330 if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
331 if (OperandIndex < ImageDimIntr->CoordStart ||
332 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
333 return std::nullopt;
334 }
335 // All gradients can be converted, so convert only them
336 OnlyDerivatives = true;
337 break;
338 }
339
340 assert(OperandIndex == ImageDimIntr->GradientStart ||
341 FloatCoord == Coord->getType()->isFloatingPointTy());
342 FloatCoord = Coord->getType()->isFloatingPointTy();
343 }
344
345 if (!OnlyDerivatives && !ST->hasA16())
346 OnlyDerivatives = true; // Only supports G16
347
348 // Check if there is a bias parameter and if it can be converted to f16
349 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
350 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
351 assert(HasSampler &&
352 "Only image instructions with a sampler can have a bias");
353 if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
354 OnlyDerivatives = true;
355 }
356
357 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
358 ImageDimIntr->CoordStart))
359 return std::nullopt;
360
361 Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
362 : Type::getInt16Ty(C&: II.getContext());
363
364 return modifyIntrinsicCall(
365 OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
366 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
367 if (!OnlyDerivatives) {
368 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
369
370 // Change the bias type
371 if (ImageDimIntr->NumBiasArgs != 0)
372 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
373 }
374
375 unsigned EndIndex =
376 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
377 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
378 OperandIndex < EndIndex; OperandIndex++) {
379 Args[OperandIndex] =
380 convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
381 }
382
383 // Convert the bias
384 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
385 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
386 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
387 }
388 });
389}
390
391bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
392 const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
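 // For example, multiplying by the constant 2.0 is safe to rewrite, since 2.0
 // is finite and non-zero.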
399 if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
400 match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
401 // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
405 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
406 if (isKnownNeverInfOrNaN(V: Op0, SQ) && isKnownNeverInfOrNaN(V: Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
412
413/// Match an fpext from half to float, or a constant we can convert.
414static Value *matchFPExtFromF16(Value *Arg) {
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
423 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
424 if (!LosesInfo)
425 return ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
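// For example, <x, y, 0.0, 0.0> only demands its first two elements, giving a
// mask of 0b0011.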
432static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(V: UseV, EltNo: i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(BitPosition: i);
451 }
452
453 return DemandedElts;
454}
455
456// Trim elements from the end of the vector \p V, if they are
457// equal to the first element of the vector.
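// For example, <x, y, x, x> only demands its first two elements (mask 0b0011),
// since the trailing components merely repeat element 0.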
458static APInt defaultComponentBroadcast(Value *V) {
459 auto *VTy = cast<FixedVectorType>(Val: V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
462 Value *FirstComponent = findScalarElement(V, EltNo: 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
466 SVI->getShuffleMask(Result&: ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, EltNo: I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(BitPosition: I);
480 }
481
482 return DemandedElts;
483}
484
485static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
486 IntrinsicInst &II,
487 APInt DemandedElts,
488 int DMaskIdx = -1,
489 bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(Val: V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(Val: V))
504 return AMDGPU::isArgPassedInSGPR(Arg: A);
505 if (const auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(IntrID: II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(Val: U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
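/// For example, on wave32 a constant lane index of 35 is rewritten to 3, since
/// only the low 5 bits are used.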
518bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
519 IntrinsicInst &II,
520 unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(N: MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(I: &II, OpNo: LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(i: LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(Ty: LaneArg->getType(), V: Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(i: LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
545
546static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
547 Function &NewCallee, ArrayRef<Value *> Ops) {
548 SmallVector<OperandBundleDef, 2> OpBundles;
549 Old.getOperandBundlesAsDefs(Defs&: OpBundles);
550
551 CallInst *NewCall = B.CreateCall(Callee: &NewCallee, Args: Ops, OpBundles);
552 NewCall->takeName(V: &Old);
553 return NewCall;
554}
555
556Instruction *
557GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
558 IntrinsicInst &II) const {
559 const auto IID = II.getIntrinsicID();
560 assert(IID == Intrinsic::amdgcn_readlane ||
561 IID == Intrinsic::amdgcn_readfirstlane ||
562 IID == Intrinsic::amdgcn_permlane64);
563
564 Instruction *OpInst = dyn_cast<Instruction>(Val: II.getOperand(i_nocapture: 0));
565
566 // Only do this if both instructions are in the same block
567 // (so the exec mask won't change) and the readlane is the only user of its
568 // operand.
569 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
570 return nullptr;
571
572 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
573
574 // If this is a readlane, check that the second operand is a constant, or is
575 // defined before OpInst so we know it's safe to move this intrinsic higher.
576 Value *LaneID = nullptr;
577 if (IsReadLane) {
578 LaneID = II.getOperand(i_nocapture: 1);
579
580 // readlane takes an extra operand for the lane ID, so we must check if that
581 // LaneID value can be used at the point where we want to move the
582 // intrinsic.
583 if (auto *LaneIDInst = dyn_cast<Instruction>(Val: LaneID)) {
584 if (!IC.getDominatorTree().dominates(Def: LaneIDInst, User: OpInst))
585 return nullptr;
586 }
587 }
588
589 // Hoist the intrinsic (II) through OpInst.
590 //
591 // (II (OpInst x)) -> (OpInst (II x))
592 const auto DoIt = [&](unsigned OpIdx,
593 Function *NewIntrinsic) -> Instruction * {
594 SmallVector<Value *, 2> Ops{OpInst->getOperand(i: OpIdx)};
595 if (IsReadLane)
596 Ops.push_back(Elt: LaneID);
597
598 // Rewrite the intrinsic call.
599 CallInst *NewII = rewriteCall(B&: IC.Builder, Old&: II, NewCallee&: *NewIntrinsic, Ops);
600
601 // Rewrite OpInst so it takes the result of the intrinsic now.
602 Instruction &NewOp = *OpInst->clone();
603 NewOp.setOperand(i: OpIdx, Val: NewII);
604 return &NewOp;
605 };
606
607 // TODO(?): Should we do more with permlane64?
608 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Val: OpInst))
609 return nullptr;
610
611 if (isa<UnaryOperator>(Val: OpInst))
612 return DoIt(0, II.getCalledFunction());
613
614 if (isa<CastInst>(Val: OpInst)) {
615 Value *Src = OpInst->getOperand(i: 0);
616 Type *SrcTy = Src->getType();
617 if (!isTypeLegal(Ty: SrcTy))
618 return nullptr;
619
620 Function *Remangled =
621 Intrinsic::getOrInsertDeclaration(M: II.getModule(), id: IID, Tys: {SrcTy});
622 return DoIt(0, Remangled);
623 }
624
625 // We can also hoist through binary operators if the other operand is uniform.
626 if (isa<BinaryOperator>(Val: OpInst)) {
627 // FIXME: If we had access to UniformityInfo here we could just check
628 // if the operand is uniform.
629 if (isTriviallyUniform(U: OpInst->getOperandUse(i: 0)))
630 return DoIt(1, II.getCalledFunction());
631 if (isTriviallyUniform(U: OpInst->getOperandUse(i: 1)))
632 return DoIt(0, II.getCalledFunction());
633 }
634
635 return nullptr;
636}
637
638std::optional<Instruction *>
639GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
640 Intrinsic::ID IID = II.getIntrinsicID();
641 switch (IID) {
642 case Intrinsic::amdgcn_rcp: {
643 Value *Src = II.getArgOperand(i: 0);
644 if (isa<PoisonValue>(Val: Src))
645 return IC.replaceInstUsesWith(I&: II, V: Src);
646
647 // TODO: Move to ConstantFolding/InstSimplify?
648 if (isa<UndefValue>(Val: Src)) {
649 Type *Ty = II.getType();
650 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
651 return IC.replaceInstUsesWith(I&: II, V: QNaN);
652 }
653
654 if (II.isStrictFP())
655 break;
656
657 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
658 const APFloat &ArgVal = C->getValueAPF();
659 APFloat Val(ArgVal.getSemantics(), 1);
660 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
661
662 // This is more precise than the instruction may give.
663 //
664 // TODO: The instruction always flushes denormal results (except for f16),
665 // should this also?
666 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
667 }
668
669 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
670 if (!FMF.allowContract())
671 break;
672 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
673 if (!SrcCI)
674 break;
675
676 auto IID = SrcCI->getIntrinsicID();
677 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
678 //
679 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
680 // relaxed.
681 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
682 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
683 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
684 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
685 break;
686
687 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
688 break;
689
690 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
691 M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, Tys: {SrcCI->getType()});
692
693 InnerFMF |= FMF;
694 II.setFastMathFlags(InnerFMF);
695
696 II.setCalledFunction(NewDecl);
697 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
698 }
699
700 break;
701 }
702 case Intrinsic::amdgcn_sqrt:
703 case Intrinsic::amdgcn_rsq: {
704 Value *Src = II.getArgOperand(i: 0);
705 if (isa<PoisonValue>(Val: Src))
706 return IC.replaceInstUsesWith(I&: II, V: Src);
707
708 // TODO: Move to ConstantFolding/InstSimplify?
709 if (isa<UndefValue>(Val: Src)) {
710 Type *Ty = II.getType();
711 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
712 return IC.replaceInstUsesWith(I&: II, V: QNaN);
713 }
714
715 // f16 amdgcn.sqrt is identical to regular sqrt.
716 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
717 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
718 M: II.getModule(), id: Intrinsic::sqrt, Tys: {II.getType()});
719 II.setCalledFunction(NewDecl);
720 return &II;
721 }
722
723 break;
724 }
725 case Intrinsic::amdgcn_log:
726 case Intrinsic::amdgcn_exp2: {
727 const bool IsLog = IID == Intrinsic::amdgcn_log;
728 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
729 Value *Src = II.getArgOperand(i: 0);
730 Type *Ty = II.getType();
731
732 if (isa<PoisonValue>(Val: Src))
733 return IC.replaceInstUsesWith(I&: II, V: Src);
734
735 if (IC.getSimplifyQuery().isUndefValue(V: Src))
736 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
737
738 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
739 if (C->isInfinity()) {
740 // exp2(+inf) -> +inf
741 // log2(+inf) -> +inf
742 if (!C->isNegative())
743 return IC.replaceInstUsesWith(I&: II, V: C);
744
745 // exp2(-inf) -> 0
746 if (IsExp && C->isNegative())
747 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
748 }
749
750 if (II.isStrictFP())
751 break;
752
753 if (C->isNaN()) {
754 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
755 return IC.replaceInstUsesWith(I&: II, V: Quieted);
756 }
757
758 // f32 instruction doesn't handle denormals, f16 does.
759 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
760 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
761 : ConstantFP::get(Ty, V: 1.0);
762 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
763 }
764
765 if (IsLog && C->isNegative())
766 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
767
768 // TODO: Full constant folding matching hardware behavior.
769 }
770
771 break;
772 }
773 case Intrinsic::amdgcn_frexp_mant:
774 case Intrinsic::amdgcn_frexp_exp: {
775 Value *Src = II.getArgOperand(i: 0);
776 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
777 int Exp;
778 APFloat Significand =
779 frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
780
781 if (IID == Intrinsic::amdgcn_frexp_mant) {
782 return IC.replaceInstUsesWith(
783 I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
784 }
785
786 // Match instruction special case behavior.
787 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
788 Exp = 0;
789
790 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: Exp));
791 }
792
793 if (isa<PoisonValue>(Val: Src))
794 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
795
796 if (isa<UndefValue>(Val: Src)) {
797 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
798 }
799
800 break;
801 }
802 case Intrinsic::amdgcn_class: {
803 Value *Src0 = II.getArgOperand(i: 0);
804 Value *Src1 = II.getArgOperand(i: 1);
805 const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
806 if (CMask) {
807 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
808 M: II.getModule(), id: Intrinsic::is_fpclass, Tys: Src0->getType()));
809
810 // Clamp any excess bits, as they're illegal for the generic intrinsic.
811 II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
812 V: CMask->getZExtValue() & fcAllFlags));
813 return &II;
814 }
815
816 // Propagate poison.
817 if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
818 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
819
820 // llvm.amdgcn.class(_, undef) -> false
821 if (IC.getSimplifyQuery().isUndefValue(V: Src1))
822 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
823
824 // llvm.amdgcn.class(undef, mask) -> mask != 0
825 if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
826 Value *CmpMask = IC.Builder.CreateICmpNE(
827 LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
828 return IC.replaceInstUsesWith(I&: II, V: CmpMask);
829 }
830 break;
831 }
832 case Intrinsic::amdgcn_cvt_pkrtz: {
833 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
834 Type *HalfTy = Type::getHalfTy(C&: Arg->getContext());
835
836 if (isa<PoisonValue>(Val: Arg))
837 return PoisonValue::get(T: HalfTy);
838 if (isa<UndefValue>(Val: Arg))
839 return UndefValue::get(T: HalfTy);
840
841 ConstantFP *CFP = nullptr;
842 if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
843 bool LosesInfo;
844 APFloat Val(CFP->getValueAPF());
845 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
846 return ConstantFP::get(Ty: HalfTy, V: Val);
847 }
848
849 Value *Src = nullptr;
850 if (match(V: Arg, P: m_FPExt(Op: m_Value(V&: Src)))) {
851 if (Src->getType()->isHalfTy())
852 return Src;
853 }
854
855 return nullptr;
856 };
857
858 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(i: 0))) {
859 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(i: 1))) {
860 Value *V = PoisonValue::get(T: II.getType());
861 V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src0, Idx: (uint64_t)0);
862 V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src1, Idx: (uint64_t)1);
863 return IC.replaceInstUsesWith(I&: II, V);
864 }
865 }
866
867 break;
868 }
869 case Intrinsic::amdgcn_cvt_pknorm_i16:
870 case Intrinsic::amdgcn_cvt_pknorm_u16:
871 case Intrinsic::amdgcn_cvt_pk_i16:
872 case Intrinsic::amdgcn_cvt_pk_u16: {
873 Value *Src0 = II.getArgOperand(i: 0);
874 Value *Src1 = II.getArgOperand(i: 1);
875
876 // TODO: Replace call with scalar operation if only one element is poison.
877 if (isa<PoisonValue>(Val: Src0) && isa<PoisonValue>(Val: Src1))
878 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
879
880 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
881 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
882 }
883
884 break;
885 }
886 case Intrinsic::amdgcn_cvt_off_f32_i4: {
887 Value *Arg = II.getArgOperand(i: 0);
888 Type *Ty = II.getType();
889
890 if (isa<PoisonValue>(Val: Arg))
891 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: Ty));
892
893 if (IC.getSimplifyQuery().isUndefValue(V: Arg))
894 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty));
895
896 ConstantInt *CArg = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
897 if (!CArg)
898 break;
899
900 // Tabulated 0.0625 * (sext (CArg & 0xf)).
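 // For example, CArg == 9 yields -0.4375, i.e. 0.0625 * sext(i4 0x9) = 0.0625 * -7.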
901 constexpr size_t ResValsSize = 16;
902 static constexpr float ResVals[ResValsSize] = {
903 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
904 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
905 Constant *Res =
906 ConstantFP::get(Ty, V: ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
907 return IC.replaceInstUsesWith(I&: II, V: Res);
908 }
909 case Intrinsic::amdgcn_ubfe:
910 case Intrinsic::amdgcn_sbfe: {
911 // Decompose simple cases into standard shifts.
912 Value *Src = II.getArgOperand(i: 0);
913 if (isa<UndefValue>(Val: Src)) {
914 return IC.replaceInstUsesWith(I&: II, V: Src);
915 }
916
917 unsigned Width;
918 Type *Ty = II.getType();
919 unsigned IntSize = Ty->getIntegerBitWidth();
920
921 ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
922 if (CWidth) {
923 Width = CWidth->getZExtValue();
924 if ((Width & (IntSize - 1)) == 0) {
925 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
926 }
927
928 // Hardware ignores high bits, so remove those.
929 if (Width >= IntSize) {
930 return IC.replaceOperand(
931 I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
932 }
933 }
934
935 unsigned Offset;
936 ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
937 if (COffset) {
938 Offset = COffset->getZExtValue();
939 if (Offset >= IntSize) {
940 return IC.replaceOperand(
941 I&: II, OpNum: 1,
942 V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
943 }
944 }
945
946 bool Signed = IID == Intrinsic::amdgcn_sbfe;
947
948 if (!CWidth || !COffset)
949 break;
950
951 // The case of Width == 0 is handled above, which makes this transformation
952 // safe. If Width == 0, then the ashr and lshr instructions become poison
953 // value since the shift amount would be equal to the bit size.
954 assert(Width != 0);
955
956 // TODO: This allows folding to undef when the hardware has specific
957 // behavior?
958 if (Offset + Width < IntSize) {
959 Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
960 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
961 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
962 RightShift->takeName(V: &II);
963 return IC.replaceInstUsesWith(I&: II, V: RightShift);
964 }
965
966 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
967 : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
968
969 RightShift->takeName(V: &II);
970 return IC.replaceInstUsesWith(I&: II, V: RightShift);
971 }
972 case Intrinsic::amdgcn_exp:
973 case Intrinsic::amdgcn_exp_row:
974 case Intrinsic::amdgcn_exp_compr: {
975 ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
976 unsigned EnBits = En->getZExtValue();
977 if (EnBits == 0xf)
978 break; // All inputs enabled.
979
980 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
981 bool Changed = false;
982 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
983 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
984 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
985 Value *Src = II.getArgOperand(i: I + 2);
986 if (!isa<PoisonValue>(Val: Src)) {
987 IC.replaceOperand(I&: II, OpNum: I + 2, V: PoisonValue::get(T: Src->getType()));
988 Changed = true;
989 }
990 }
991 }
992
993 if (Changed) {
994 return &II;
995 }
996
997 break;
998 }
999 case Intrinsic::amdgcn_fmed3: {
1000 Value *Src0 = II.getArgOperand(i: 0);
1001 Value *Src1 = II.getArgOperand(i: 1);
1002 Value *Src2 = II.getArgOperand(i: 2);
1003
1004 for (Value *Src : {Src0, Src1, Src2}) {
1005 if (isa<PoisonValue>(Val: Src))
1006 return IC.replaceInstUsesWith(I&: II, V: Src);
1007 }
1008
1009 if (II.isStrictFP())
1010 break;
1011
1012 // med3 with a nan input acts like
1013 // v_min_f32(v_min_f32(s0, s1), s2)
1014 //
1015 // Signalingness is ignored with ieee=0, so we fold to
1016 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1017 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1018 // returned signaling nan will not be quieted.
1019
1020 // ieee=1
1021 // s0 snan: s2
1022 // s1 snan: s2
1023 // s2 snan: qnan
1024
1025 // s0 qnan: min(s1, s2)
1026 // s1 qnan: min(s0, s2)
1027 // s2 qnan: min(s0, s1)
1028
1029 // ieee=0
1030 // s0 _nan: min(s1, s2)
1031 // s1 _nan: min(s0, s2)
1032 // s2 _nan: min(s0, s1)
1033
1034 // med3 behavior with infinity
1035 // s0 +inf: max(s1, s2)
1036 // s1 +inf: max(s0, s2)
1037 // s2 +inf: max(s0, s1)
1038 // s0 -inf: min(s1, s2)
1039 // s1 -inf: min(s0, s2)
1040 // s2 -inf: min(s0, s1)
1041
1042 // Checking for NaN before canonicalization provides better fidelity when
1043 // mapping other operations onto fmed3 since the order of operands is
1044 // unchanged.
1045 Value *V = nullptr;
1046 const APFloat *ConstSrc0 = nullptr;
1047 const APFloat *ConstSrc1 = nullptr;
1048 const APFloat *ConstSrc2 = nullptr;
1049
1050 if ((match(V: Src0, P: m_APFloat(Res&: ConstSrc0)) &&
1051 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1052 isa<UndefValue>(Val: Src0)) {
1053 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1054 switch (fpenvIEEEMode(I: II)) {
1055 case KnownIEEEMode::On:
1056 // TODO: If Src2 is snan, does it need quieting?
1057 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1058 return IC.replaceInstUsesWith(I&: II, V: Src2);
1059
1060 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src1, RHS: Src2)
1061 : IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
1062 break;
1063 case KnownIEEEMode::Off:
1064 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src1, RHS: Src2)
1065 : IC.Builder.CreateMinimumNum(LHS: Src1, RHS: Src2);
1066 break;
1067 case KnownIEEEMode::Unknown:
1068 break;
1069 }
1070 } else if ((match(V: Src1, P: m_APFloat(Res&: ConstSrc1)) &&
1071 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1072 isa<UndefValue>(Val: Src1)) {
1073 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1074 switch (fpenvIEEEMode(I: II)) {
1075 case KnownIEEEMode::On:
1076 // TODO: If Src2 is snan, does it need quieting?
1077 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1078 return IC.replaceInstUsesWith(I&: II, V: Src2);
1079
1080 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src2)
1081 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
1082 break;
1083 case KnownIEEEMode::Off:
1084 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src2)
1085 : IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src2);
1086 break;
1087 case KnownIEEEMode::Unknown:
1088 break;
1089 }
1090 } else if ((match(V: Src2, P: m_APFloat(Res&: ConstSrc2)) &&
1091 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1092 isa<UndefValue>(Val: Src2)) {
1093 switch (fpenvIEEEMode(I: II)) {
1094 case KnownIEEEMode::On:
1095 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1096 auto *Quieted = ConstantFP::get(Ty: II.getType(), V: ConstSrc2->makeQuiet());
1097 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1098 }
1099
1100 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1101 ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1)
1102 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src1);
1103 break;
1104 case KnownIEEEMode::Off:
1105 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1106 ? IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src1)
1107 : IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src1);
1108 break;
1109 case KnownIEEEMode::Unknown:
1110 break;
1111 }
1112 }
1113
1114 if (V) {
1115 if (auto *CI = dyn_cast<CallInst>(Val: V)) {
1116 CI->copyFastMathFlags(I: &II);
1117 CI->takeName(V: &II);
1118 }
1119 return IC.replaceInstUsesWith(I&: II, V);
1120 }
1121
1122 bool Swap = false;
1123 // Canonicalize constants to RHS operands.
1124 //
1125 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1126 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1127 std::swap(a&: Src0, b&: Src1);
1128 Swap = true;
1129 }
1130
1131 if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
1132 std::swap(a&: Src1, b&: Src2);
1133 Swap = true;
1134 }
1135
1136 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1137 std::swap(a&: Src0, b&: Src1);
1138 Swap = true;
1139 }
1140
1141 if (Swap) {
1142 II.setArgOperand(i: 0, v: Src0);
1143 II.setArgOperand(i: 1, v: Src1);
1144 II.setArgOperand(i: 2, v: Src2);
1145 return &II;
1146 }
1147
1148 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
1149 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
1150 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
1151 APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
1152 Src2: C2->getValueAPF());
1153 return IC.replaceInstUsesWith(I&: II,
1154 V: ConstantFP::get(Ty: II.getType(), V: Result));
1155 }
1156 }
1157 }
1158
1159 if (!ST->hasMed3_16())
1160 break;
1161
1162 // Repeat floating-point width reduction done for minnum/maxnum.
1163 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1164 if (Value *X = matchFPExtFromF16(Arg: Src0)) {
1165 if (Value *Y = matchFPExtFromF16(Arg: Src1)) {
1166 if (Value *Z = matchFPExtFromF16(Arg: Src2)) {
1167 Value *NewCall = IC.Builder.CreateIntrinsic(
1168 ID: IID, Types: {X->getType()}, Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
1169 return new FPExtInst(NewCall, II.getType());
1170 }
1171 }
1172 }
1173
1174 break;
1175 }
1176 case Intrinsic::amdgcn_icmp:
1177 case Intrinsic::amdgcn_fcmp: {
1178 const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
1179 // Guard against invalid arguments.
1180 int64_t CCVal = CC->getZExtValue();
1181 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1182 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1183 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1184 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1185 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
1186 break;
1187
1188 Value *Src0 = II.getArgOperand(i: 0);
1189 Value *Src1 = II.getArgOperand(i: 1);
1190
1191 if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
1192 if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
1193 Constant *CCmp = ConstantFoldCompareInstOperands(
1194 Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
1195 if (CCmp && CCmp->isNullValue()) {
1196 return IC.replaceInstUsesWith(
1197 I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
1198 }
1199
1200 // The result of V_ICMP/V_FCMP assembly instructions (which this
1201 // intrinsic exposes) is one bit per thread, masked with the EXEC
1202 // register (which contains the bitmask of live threads). So a
1203 // comparison that always returns true is the same as a read of the
1204 // EXEC register.
1205 Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
1206 MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
1207 Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
1208 CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: Intrinsic::read_register,
1209 Types: II.getType(), Args);
1210 NewCall->addFnAttr(Kind: Attribute::Convergent);
1211 NewCall->takeName(V: &II);
1212 return IC.replaceInstUsesWith(I&: II, V: NewCall);
1213 }
1214
1215 // Canonicalize constants to RHS.
1216 CmpInst::Predicate SwapPred =
1217 CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
1218 II.setArgOperand(i: 0, v: Src1);
1219 II.setArgOperand(i: 1, v: Src0);
1220 II.setArgOperand(
1221 i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
1222 return &II;
1223 }
1224
1225 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1226 break;
1227
1228 // Canonicalize compare eq with true value to compare != 0
1229 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1230 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1231 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1232 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1233 Value *ExtSrc;
1234 if (CCVal == CmpInst::ICMP_EQ &&
1235 ((match(V: Src1, P: PatternMatch::m_One()) &&
1236 match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
1237 (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
1238 match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
1239 ExtSrc->getType()->isIntegerTy(Bitwidth: 1)) {
1240 IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
1241 IC.replaceOperand(I&: II, OpNum: 2,
1242 V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
1243 return &II;
1244 }
1245
1246 CmpPredicate SrcPred;
1247 Value *SrcLHS;
1248 Value *SrcRHS;
1249
1250 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1251 // intrinsic. The typical use is a wave vote function in the library, which
1252 // will be fed from a user code condition compared with 0. Fold in the
1253 // redundant compare.
1254
1255 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1256 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1257 //
1258 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1259 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1260 if (match(V: Src1, P: PatternMatch::m_Zero()) &&
1261 match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
1262 Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
1263 R: PatternMatch::m_Value(V&: SrcRHS))))) {
1264 if (CCVal == CmpInst::ICMP_EQ)
1265 SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
1266
1267 Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
1268 ? Intrinsic::amdgcn_fcmp
1269 : Intrinsic::amdgcn_icmp;
1270
1271 Type *Ty = SrcLHS->getType();
1272 if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
1273 // Promote to next legal integer type.
1274 unsigned Width = CmpType->getBitWidth();
1275 unsigned NewWidth = Width;
1276
1277 // Don't do anything for i1 comparisons.
1278 if (Width == 1)
1279 break;
1280
1281 if (Width <= 16)
1282 NewWidth = 16;
1283 else if (Width <= 32)
1284 NewWidth = 32;
1285 else if (Width <= 64)
1286 NewWidth = 64;
1287 else
1288 break; // Can't handle this.
1289
1290 if (Width != NewWidth) {
1291 IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
1292 if (CmpInst::isSigned(predicate: SrcPred)) {
1293 SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
1294 SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
1295 } else {
1296 SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
1297 SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
1298 }
1299 }
1300 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1301 break;
1302
1303 Value *Args[] = {SrcLHS, SrcRHS,
1304 ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
1305 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1306 ID: NewIID, Types: {II.getType(), SrcLHS->getType()}, Args);
1307 NewCall->takeName(V: &II);
1308 return IC.replaceInstUsesWith(I&: II, V: NewCall);
1309 }
1310
1311 break;
1312 }
1313 case Intrinsic::amdgcn_mbcnt_hi: {
1314 // exec_hi is all 0, so this is just a copy.
1315 if (ST->isWave32())
1316 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
1317 break;
1318 }
1319 case Intrinsic::amdgcn_ballot: {
1320 Value *Arg = II.getArgOperand(i: 0);
1321 if (isa<PoisonValue>(Val: Arg))
1322 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1323
1324 if (auto *Src = dyn_cast<ConstantInt>(Val: Arg)) {
1325 if (Src->isZero()) {
1326 // amdgcn.ballot(i1 0) is zero.
1327 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1328 }
1329 }
1330 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1331 // %b64 = call i64 ballot.i64(...)
1332 // =>
1333 // %b32 = call i32 ballot.i32(...)
1334 // %b64 = zext i32 %b32 to i64
1335 Value *Call = IC.Builder.CreateZExt(
1336 V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
1337 Types: {IC.Builder.getInt32Ty()},
1338 Args: {II.getArgOperand(i: 0)}),
1339 DestTy: II.getType());
1340 Call->takeName(V: &II);
1341 return IC.replaceInstUsesWith(I&: II, V: Call);
1342 }
1343 break;
1344 }
1345 case Intrinsic::amdgcn_wavefrontsize: {
1346 if (ST->isWaveSizeKnown())
1347 return IC.replaceInstUsesWith(
1348 I&: II, V: ConstantInt::get(Ty: II.getType(), V: ST->getWavefrontSize()));
1349 break;
1350 }
1351 case Intrinsic::amdgcn_wqm_vote: {
1352 // wqm_vote is identity when the argument is constant.
1353 if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
1354 break;
1355
1356 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
1357 }
1358 case Intrinsic::amdgcn_kill: {
1359 const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1360 if (!C || !C->getZExtValue())
1361 break;
1362
1363 // amdgcn.kill(i1 1) is a no-op
1364 return IC.eraseInstFromFunction(I&: II);
1365 }
1366 case Intrinsic::amdgcn_update_dpp: {
1367 Value *Old = II.getArgOperand(i: 0);
1368
1369 auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
1370 auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
1371 auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
1372 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1373 BM->getZExtValue() != 0xF || isa<PoisonValue>(Val: Old))
1374 break;
1375
1376 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1377 return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: Old->getType()));
1378 }
1379 case Intrinsic::amdgcn_permlane16:
1380 case Intrinsic::amdgcn_permlane16_var:
1381 case Intrinsic::amdgcn_permlanex16:
1382 case Intrinsic::amdgcn_permlanex16_var: {
1383 // Discard vdst_in if it's not going to be read.
1384 Value *VDstIn = II.getArgOperand(i: 0);
1385 if (isa<PoisonValue>(Val: VDstIn))
1386 break;
1387
1388 // FetchInvalid operand idx.
1389 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1390 IID == Intrinsic::amdgcn_permlanex16)
1391 ? 4 /* for permlane16 and permlanex16 */
1392 : 3; /* for permlane16_var and permlanex16_var */
1393
1394 // BoundCtrl operand idx.
1395 // For permlane16 and permlanex16 it should be 5
1396 // For Permlane16_var and permlanex16_var it should be 4
1397 unsigned int BcIdx = FiIdx + 1;
1398
1399 ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1400 ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1401 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1402 break;
1403
1404 return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: VDstIn->getType()));
1405 }
1406 case Intrinsic::amdgcn_permlane64:
1407 case Intrinsic::amdgcn_readfirstlane:
1408 case Intrinsic::amdgcn_readlane:
1409 case Intrinsic::amdgcn_ds_bpermute: {
1410 // If the data argument is uniform these intrinsics return it unchanged.
1411 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1412 const Use &Src = II.getArgOperandUse(i: SrcIdx);
1413 if (isTriviallyUniform(U: Src))
1414 return IC.replaceInstUsesWith(I&: II, V: Src.get());
1415
1416 if (IID == Intrinsic::amdgcn_readlane &&
1417 simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1418 return &II;
1419
1420 // If the lane argument of bpermute is uniform, change it to readlane. This
1421 // generates better code and can enable further optimizations because
1422 // readlane is AlwaysUniform.
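 // Note that the index operand of ds_bpermute is a byte address (lane * 4),
 // so it is shifted right by 2 below to form the lane index for readlane.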
1423 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1424 const Use &Lane = II.getArgOperandUse(i: 0);
1425 if (isTriviallyUniform(U: Lane)) {
1426 Value *NewLane = IC.Builder.CreateLShr(LHS: Lane, RHS: 2);
1427 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1428 M: II.getModule(), id: Intrinsic::amdgcn_readlane, Tys: II.getType());
1429 II.setCalledFunction(NewDecl);
1430 II.setOperand(i_nocapture: 0, Val_nocapture: Src);
1431 II.setOperand(i_nocapture: 1, Val_nocapture: NewLane);
1432 return &II;
1433 }
1434 }
1435
1436 if (IID != Intrinsic::amdgcn_ds_bpermute) {
1437 if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1438 return Res;
1439 }
1440
1441 return std::nullopt;
1442 }
1443 case Intrinsic::amdgcn_writelane: {
1444 // TODO: Fold bitcast like readlane.
1445 if (simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1446 return &II;
1447 return std::nullopt;
1448 }
1449 case Intrinsic::amdgcn_trig_preop: {
1450 // The intrinsic is declared with name mangling, but currently the
1451 // instruction only exists for f64
1452 if (!II.getType()->isDoubleTy())
1453 break;
1454
1455 Value *Src = II.getArgOperand(i: 0);
1456 Value *Segment = II.getArgOperand(i: 1);
1457 if (isa<PoisonValue>(Val: Src) || isa<PoisonValue>(Val: Segment))
1458 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1459
1460 if (isa<UndefValue>(Val: Src)) {
1461 auto *QNaN = ConstantFP::get(
1462 Ty: II.getType(), V: APFloat::getQNaN(Sem: II.getType()->getFltSemantics()));
1463 return IC.replaceInstUsesWith(I&: II, V: QNaN);
1464 }
1465
1466 const ConstantFP *Csrc = dyn_cast<ConstantFP>(Val: Src);
1467 if (!Csrc)
1468 break;
1469
1470 if (II.isStrictFP())
1471 break;
1472
1473 const APFloat &Fsrc = Csrc->getValueAPF();
1474 if (Fsrc.isNaN()) {
1475 auto *Quieted = ConstantFP::get(Ty: II.getType(), V: Fsrc.makeQuiet());
1476 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1477 }
1478
1479 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
1480 if (!Cseg)
1481 break;
1482
1483 unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1484 unsigned SegmentVal = Cseg->getValue().trunc(width: 5).getZExtValue();
1485 unsigned Shift = SegmentVal * 53;
1486 if (Exponent > 1077)
1487 Shift += Exponent - 1077;
1488
1489 // 2.0/PI table.
1490 static const uint32_t TwoByPi[] = {
1491 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1492 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1493 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1494 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1495 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1496 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1497 0x56033046};
1498
1499 // Return 0 for outbound segment (hardware behavior).
1500 unsigned Idx = Shift >> 5;
1501 if (Idx + 2 >= std::size(TwoByPi)) {
1502 APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
1503 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
1504 }
1505
1506 unsigned BShift = Shift & 0x1f;
1507 uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + 1]);
1508 uint64_t Tlo = Make_64(High: TwoByPi[Idx + 2], Low: 0);
1509 if (BShift)
1510 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1511 Thi = Thi >> 11;
1512 APFloat Result = APFloat((double)Thi);
1513
1514 int Scale = -53 - Shift;
1515 if (Exponent >= 1968)
1516 Scale += 128;
1517
1518 Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
1519 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
1520 }
1521 case Intrinsic::amdgcn_fmul_legacy: {
1522 Value *Op0 = II.getArgOperand(i: 0);
1523 Value *Op1 = II.getArgOperand(i: 1);
1524
1525 for (Value *Src : {Op0, Op1}) {
1526 if (isa<PoisonValue>(Val: Src))
1527 return IC.replaceInstUsesWith(I&: II, V: Src);
1528 }
1529
1530 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1531 // infinity, gives +0.0.
1532 // TODO: Move to InstSimplify?
1533 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1534 match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
1535 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
1536
1537 // If we can prove we don't have one of the special cases then we can use a
1538 // normal fmul instruction instead.
1539 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1540 auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
1541 FMul->takeName(V: &II);
1542 return IC.replaceInstUsesWith(I&: II, V: FMul);
1543 }
1544 break;
1545 }
1546 case Intrinsic::amdgcn_fma_legacy: {
1547 Value *Op0 = II.getArgOperand(i: 0);
1548 Value *Op1 = II.getArgOperand(i: 1);
1549 Value *Op2 = II.getArgOperand(i: 2);
1550
1551 for (Value *Src : {Op0, Op1, Op2}) {
1552 if (isa<PoisonValue>(Val: Src))
1553 return IC.replaceInstUsesWith(I&: II, V: Src);
1554 }
1555
1556 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1557 // infinity, gives +0.0.
1558 // TODO: Move to InstSimplify?
1559 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1560 match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
1561 // It's tempting to just return Op2 here, but that would give the wrong
1562 // result if Op2 were -0.0, since the intrinsic returns +0.0 + Op2.
1563 auto *Zero = ConstantFP::getZero(Ty: II.getType());
1564 auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
1565 FAdd->takeName(V: &II);
1566 return IC.replaceInstUsesWith(I&: II, V: FAdd);
1567 }
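// Illustrative IR example of the fold above (names are illustrative):
//   %r = call float @llvm.amdgcn.fma.legacy(float 0.0, float %x, float %y)
// becomes
//   %r = fadd float 0.0, %y
// which, unlike returning %y directly, canonicalizes a -0.0 in %y to +0.0.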
1568
1569 // If we can prove we don't have one of the special cases then we can use a
1570 // normal fma instead.
1571 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1572 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1573 M: II.getModule(), id: Intrinsic::fma, Tys: II.getType()));
1574 return &II;
1575 }
1576 break;
1577 }
1578 case Intrinsic::amdgcn_is_shared:
1579 case Intrinsic::amdgcn_is_private: {
1580 Value *Src = II.getArgOperand(i: 0);
1581 if (isa<PoisonValue>(Val: Src))
1582 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1583 if (isa<UndefValue>(Val: Src))
1584 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1585
1586 if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
1587 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
1588 break;
1589 }
1590 case Intrinsic::amdgcn_make_buffer_rsrc: {
1591 Value *Src = II.getArgOperand(i: 0);
1592 if (isa<PoisonValue>(Val: Src))
1593 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1594 return std::nullopt;
1595 }
1596 case Intrinsic::amdgcn_raw_buffer_store_format:
1597 case Intrinsic::amdgcn_struct_buffer_store_format:
1598 case Intrinsic::amdgcn_raw_tbuffer_store:
1599 case Intrinsic::amdgcn_struct_tbuffer_store:
1600 case Intrinsic::amdgcn_image_store_1d:
1601 case Intrinsic::amdgcn_image_store_1darray:
1602 case Intrinsic::amdgcn_image_store_2d:
1603 case Intrinsic::amdgcn_image_store_2darray:
1604 case Intrinsic::amdgcn_image_store_2darraymsaa:
1605 case Intrinsic::amdgcn_image_store_2dmsaa:
1606 case Intrinsic::amdgcn_image_store_3d:
1607 case Intrinsic::amdgcn_image_store_cube:
1608 case Intrinsic::amdgcn_image_store_mip_1d:
1609 case Intrinsic::amdgcn_image_store_mip_1darray:
1610 case Intrinsic::amdgcn_image_store_mip_2d:
1611 case Intrinsic::amdgcn_image_store_mip_2darray:
1612 case Intrinsic::amdgcn_image_store_mip_3d:
1613 case Intrinsic::amdgcn_image_store_mip_cube: {
1614 if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
1615 break;
1616
1617 APInt DemandedElts;
1618 if (ST->hasDefaultComponentBroadcast())
1619 DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
1620 else if (ST->hasDefaultComponentZero())
1621 DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
1622 else
1623 break;
1624
1625 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? 1 : -1;
1626 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1627 IsLoad: false)) {
1628 return IC.eraseInstFromFunction(I&: II);
1629 }
1630
1631 break;
1632 }
1633 case Intrinsic::amdgcn_prng_b32: {
1634 auto *Src = II.getArgOperand(i: 0);
1635 if (isa<UndefValue>(Val: Src)) {
1636 return IC.replaceInstUsesWith(I&: II, V: Src);
1637 }
1638 return std::nullopt;
1639 }
1640 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1641 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1642 Value *Src0 = II.getArgOperand(i: 0);
1643 Value *Src1 = II.getArgOperand(i: 1);
1644 uint64_t CBSZ = cast<ConstantInt>(Val: II.getArgOperand(i: 3))->getZExtValue();
1645 uint64_t BLGP = cast<ConstantInt>(Val: II.getArgOperand(i: 4))->getZExtValue();
1646 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
1647 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
1648
1649 auto getFormatNumRegs = [](unsigned FormatVal) {
1650 switch (FormatVal) {
1651 case AMDGPU::MFMAScaleFormats::FP6_E2M3:
1652 case AMDGPU::MFMAScaleFormats::FP6_E3M2:
1653 return 6u;
1654 case AMDGPU::MFMAScaleFormats::FP4_E2M1:
1655 return 4u;
1656 case AMDGPU::MFMAScaleFormats::FP8_E4M3:
1657 case AMDGPU::MFMAScaleFormats::FP8_E5M2:
1658 return 8u;
1659 default:
1660 llvm_unreachable("invalid format value");
1661 }
1662 };
1663
1664 bool MadeChange = false;
1665 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1666 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1667
1668 // Depending on the format in use, fewer 32-bit registers may be required,
1669 // so shrink the vector type accordingly.
1670 if (Src0Ty->getNumElements() > Src0NumElts) {
1671 Src0 = IC.Builder.CreateExtractVector(
1672 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
1673 Idx: uint64_t(0));
1674 MadeChange = true;
1675 }
1676
1677 if (Src1Ty->getNumElements() > Src1NumElts) {
1678 Src1 = IC.Builder.CreateExtractVector(
1679 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
1680 Idx: uint64_t(0));
1681 MadeChange = true;
1682 }
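// Illustrative example (assuming CBSZ selects an FP6 format): an incoming
// <8 x i32> Src0 only needs 6 dwords, so the extractvector above yields a
// <6 x i32>, and the intrinsic is recreated below with the narrower operand
// types.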
1683
1684 if (!MadeChange)
1685 return std::nullopt;
1686
1687 SmallVector<Value *, 10> Args(II.args());
1688 Args[0] = Src0;
1689 Args[1] = Src1;
1690
1691 CallInst *NewII = IC.Builder.CreateIntrinsic(
1692 ID: IID, Types: {Src0->getType(), Src1->getType()}, Args, FMFSource: &II);
1693 NewII->takeName(V: &II);
1694 return IC.replaceInstUsesWith(I&: II, V: NewII);
1695 }
1696 }
1697 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1698 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1699 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1700 }
1701 return std::nullopt;
1702}
1703
1704/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1705///
1706/// For image and buffer store intrinsics, simplification updates the
1707/// definition of the intrinsic's vector data argument rather than the uses of
1708/// its result, unlike image and buffer loads.
1709/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1710/// struct returns.
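///
/// Two narrowing strategies are used: for buffer intrinsics (DMaskIdx < 0) the
/// demanded range is expressed by shrinking the vector type and, where
/// possible, bumping the byte offset past unused leading components; for image
/// intrinsics (DMaskIdx >= 0) the dmask operand is rewritten to drop unused
/// components instead.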
1711static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1712 IntrinsicInst &II,
1713 APInt DemandedElts,
1714 int DMaskIdx, bool IsLoad) {
1715
1716 auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
1717 : II.getOperand(i_nocapture: 0)->getType());
1718 unsigned VWidth = IIVTy->getNumElements();
1719 if (VWidth == 1)
1720 return nullptr;
1721 Type *EltTy = IIVTy->getElementType();
1722
1723 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1724 IC.Builder.SetInsertPoint(&II);
1725
1726 // Assume the arguments are unchanged and later override them, if needed.
1727 SmallVector<Value *, 16> Args(II.args());
1728
1729 if (DMaskIdx < 0) {
1730 // Buffer case.
1731
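// Illustrative example: if only elements 2 and 3 of a <4 x float>
// llvm.amdgcn.raw.buffer.load are demanded, the code below shrinks the load
// to <2 x float> and adds 2 * 4 = 8 bytes to the offset operand (operand 1
// for raw buffer loads).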
1732 const unsigned ActiveBits = DemandedElts.getActiveBits();
1733 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1734
1735 // Start by assuming a full prefix of elements is demanded. If the low bits
1736 // of the mask are zero (unused components at the front of the vector), clear
1737 // them as well and bump the offset operand past the skipped components.
1738 DemandedElts = (1 << ActiveBits) - 1;
1739
1740 if (UnusedComponentsAtFront > 0) {
1741 static const unsigned InvalidOffsetIdx = 0xf;
1742
1743 unsigned OffsetIdx;
1744 switch (II.getIntrinsicID()) {
1745 case Intrinsic::amdgcn_raw_buffer_load:
1746 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1747 OffsetIdx = 1;
1748 break;
1749 case Intrinsic::amdgcn_s_buffer_load:
1750 // If the resulting type is vec3, there is no point in trimming the
1751 // load with updated offset, as the vec3 would most likely be widened to
1752 // vec4 anyway during lowering.
1753 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1754 OffsetIdx = InvalidOffsetIdx;
1755 else
1756 OffsetIdx = 1;
1757 break;
1758 case Intrinsic::amdgcn_struct_buffer_load:
1759 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1760 OffsetIdx = 2;
1761 break;
1762 default:
1763 // TODO: handle tbuffer* intrinsics.
1764 OffsetIdx = InvalidOffsetIdx;
1765 break;
1766 }
1767
1768 if (OffsetIdx != InvalidOffsetIdx) {
1769 // Clear demanded bits and update the offset.
1770 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1771 auto *Offset = Args[OffsetIdx];
1772 unsigned SingleComponentSizeInBits =
1773 IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
1774 unsigned OffsetAdd =
1775 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1776 auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
1777 Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
1778 }
1779 }
1780 } else {
1781 // Image case.
1782
1783 ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
1784 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1785
1786 // dmask 0 has special semantics, do not simplify.
1787 if (DMaskVal == 0)
1788 return nullptr;
1789
1790 // Mask off elements that are undefined because the dmask does not cover them.
1791 DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;
1792
1793 unsigned NewDMaskVal = 0;
1794 unsigned OrigLdStIdx = 0;
1795 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1796 const unsigned Bit = 1 << SrcIdx;
1797 if (!!(DMaskVal & Bit)) {
1798 if (!!DemandedElts[OrigLdStIdx])
1799 NewDMaskVal |= Bit;
1800 OrigLdStIdx++;
1801 }
1802 }
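// Illustrative example: with an original dmask of 0b0111 and only vector
// elements 0 and 2 demanded, the loop above produces a new dmask of 0b0101.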
1803
1804 if (DMaskVal != NewDMaskVal)
1805 Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
1806 }
1807
1808 unsigned NewNumElts = DemandedElts.popcount();
1809 if (!NewNumElts)
1810 return PoisonValue::get(T: IIVTy);
1811
1812 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1813 if (DMaskIdx >= 0)
1814 II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
1815 return nullptr;
1816 }
1817
1818 // Validate function argument and return types, extracting overloaded types
1819 // along the way.
1820 SmallVector<Type *, 6> OverloadTys;
1821 if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
1822 return nullptr;
1823
1824 Type *NewTy =
1825 (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
1826 OverloadTys[0] = NewTy;
1827
1828 if (!IsLoad) {
1829 SmallVector<int, 8> EltMask;
1830 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1831 if (DemandedElts[OrigStoreIdx])
1832 EltMask.push_back(Elt: OrigStoreIdx);
1833
1834 if (NewNumElts == 1)
1835 Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
1836 else
1837 Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
1838 }
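// For stores, the demanded elements of the data operand were gathered above
// into a single scalar or a compact vector matching NewTy before the call is
// recreated with the narrowed signature.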
1839
1840 CallInst *NewCall =
1841 IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: OverloadTys, Args);
1842 NewCall->takeName(V: &II);
1843 NewCall->copyMetadata(SrcInst: II);
1844
1845 if (IsLoad) {
1846 if (NewNumElts == 1) {
1847 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
1848 Idx: DemandedElts.countr_zero());
1849 }
1850
1851 SmallVector<int, 8> EltMask;
1852 unsigned NewLoadIdx = 0;
1853 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1854 if (!!DemandedElts[OrigLoadIdx])
1855 EltMask.push_back(Elt: NewLoadIdx++);
1856 else
1857 EltMask.push_back(Elt: NewNumElts);
1858 }
1859
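// Mask indices equal to NewNumElts select from the shuffle's implicit poison
// second operand, so lanes that were not demanded become poison while the
// demanded results are scattered back to their original positions.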
1860 auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
1861
1862 return Shuffle;
1863 }
1864
1865 return NewCall;
1866}
1867
1868Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
1869 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
1870 APInt &UndefElts) const {
1871 auto *VT = dyn_cast<FixedVectorType>(Val: II.getType());
1872 if (!VT)
1873 return nullptr;
1874
1875 const unsigned FirstElt = DemandedElts.countr_zero();
1876 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
1877 const unsigned MaskLen = LastElt - FirstElt + 1;
1878
1879 unsigned OldNumElts = VT->getNumElements();
1880 if (MaskLen == OldNumElts && MaskLen != 1)
1881 return nullptr;
1882
1883 Type *EltTy = VT->getElementType();
1884 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: MaskLen);
1885
1886 // Theoretically these intrinsics could be supported for any legal type, but
1887 // avoid introducing cases that aren't direct register types, such as v3i16.
1888 if (!isTypeLegal(Ty: NewVT))
1889 return nullptr;
1890
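// Illustrative example: for a <4 x i32> @llvm.amdgcn.readfirstlane.v4i32 call
// where only element 1 is demanded, the MaskLen == 1 path below emits an
// extractelement of the source, a scalar @llvm.amdgcn.readfirstlane.i32 call,
// and an insertelement of its result back into lane 1.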
1891 Value *Src = II.getArgOperand(i: 0);
1892
1893 // Make sure convergence tokens are preserved.
1894 // TODO: CreateIntrinsic should allow directly copying bundles
1895 SmallVector<OperandBundleDef, 2> OpBundles;
1896 II.getOperandBundlesAsDefs(Defs&: OpBundles);
1897
1898 Module *M = IC.Builder.GetInsertBlock()->getModule();
1899 Function *Remangled =
1900 Intrinsic::getOrInsertDeclaration(M, id: II.getIntrinsicID(), Tys: {NewVT});
1901
1902 if (MaskLen == 1) {
1903 Value *Extract = IC.Builder.CreateExtractElement(Vec: Src, Idx: FirstElt);
1904
1905 // TODO: Preserve callsite attributes?
1906 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
1907
1908 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: II.getType()),
1909 NewElt: NewCall, Idx: FirstElt);
1910 }
1911
1912 SmallVector<int> ExtractMask(MaskLen, -1);
1913 for (unsigned I = 0; I != MaskLen; ++I) {
1914 if (DemandedElts[FirstElt + I])
1915 ExtractMask[I] = FirstElt + I;
1916 }
1917
1918 Value *Extract = IC.Builder.CreateShuffleVector(V: Src, Mask: ExtractMask);
1919
1920 // TODO: Preserve callsite attributes?
1921 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
1922
1923 SmallVector<int> InsertMask(OldNumElts, -1);
1924 for (unsigned I = 0; I != MaskLen; ++I) {
1925 if (DemandedElts[FirstElt + I])
1926 InsertMask[FirstElt + I] = I;
1927 }
1928
1929 // FIXME: If the call has a convergence bundle, we end up leaving the dead
1930 // call behind.
1931 return IC.Builder.CreateShuffleVector(V: NewCall, Mask: InsertMask);
1932}
1933
1934std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1935 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1936 APInt &UndefElts2, APInt &UndefElts3,
1937 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1938 SimplifyAndSetOp) const {
1939 switch (II.getIntrinsicID()) {
1940 case Intrinsic::amdgcn_readfirstlane:
1941 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1942 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
1943 case Intrinsic::amdgcn_raw_buffer_load:
1944 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1945 case Intrinsic::amdgcn_raw_buffer_load_format:
1946 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1947 case Intrinsic::amdgcn_raw_tbuffer_load:
1948 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1949 case Intrinsic::amdgcn_s_buffer_load:
1950 case Intrinsic::amdgcn_struct_buffer_load:
1951 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1952 case Intrinsic::amdgcn_struct_buffer_load_format:
1953 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1954 case Intrinsic::amdgcn_struct_tbuffer_load:
1955 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1956 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1957 default: {
1958 if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
1959 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
1960 }
1961 break;
1962 }
1963 }
1964 return std::nullopt;
1965}
1966