1//===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "llvm/ADT/FloatingPointMode.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
namespace {

// Table entry type for the TableGen-generated searchable table included
// below; presumably maps image intrinsics that carry a dmask operand —
// confirm against AMDGPUGenSearchableTables.inc.
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr; // Intrinsic ID of the table entry.
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "AMDGPUGenSearchableTables.inc"

} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(A: Src1, B: Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(A: Src0, B: Src2);
59
60 return maxnum(A: Src0, B: Src1);
61}
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(Bitwidth: 16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
79 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
80 losesInfo: &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
94 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(Bitwidth: 16))
98 return true;
99 }
100
101 return false;
102}
103
104// Convert a value to 16-bit.
105static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
106 Type *VTy = V.getType();
107 if (isa<FPExtInst, SExtInst, ZExtInst>(Val: &V))
108 return cast<Instruction>(Val: &V)->getOperand(i: 0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
///
/// Returns std::nullopt if the signature of OldIntr cannot be obtained,
/// otherwise the result of erasing InstToReplace (to hand back to
/// InstCombine). InstToReplace may be OldIntr itself, or a user of it
/// (e.g. an fptrunc being folded into a D16 image intrinsic).
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  // Build the replacement call, carrying over name, metadata, and (for FP
  // operations) fast-math flags from the old intrinsic.
  CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: NewIntr, Types: ArgTys, Args);
  NewCall->takeName(V: &OldIntr);
  NewCall->copyMetadata(SrcInst: OldIntr);
  if (isa<FPMathOperator>(Val: NewCall))
    NewCall->copyFastMathFlags(I: &OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);

  // If InstToReplace is a user of OldIntr, the old intrinsic must be erased
  // separately, after its user is gone.
  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto *RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(I&: OldIntr);

  return RetValue;
}
152
/// Simplify an AMDGPU image intrinsic:
///  - fold _L to _LZ when the LOD operand is a non-positive constant,
///  - fold _mip to the non-mip form when the mip level is constant zero,
///  - drop a constant-zero bias or offset operand,
///  - fold fptrunc-to-half users into the D16 form of the intrinsic,
///  - shrink coordinates and/or derivatives to 16 bit (A16/G16).
/// Returns std::nullopt if no simplification applies.
static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
      // Negative LOD is treated like zero here and maps to the _lz form as
      // well (presumably matching hardware clamping — see the mapping table).
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
                                                     Dim: ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
                                                     Dim: ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
                                                     Dim: ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              // The bias is an overloaded operand, so its type must be
              // removed from the signature as well.
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
                                     Func: [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }

      // Only perform D16 folding if every user of the image sample is
      // an ExtractElementInst immediately followed by an FPTrunc to half.
      SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
          ExtractTruncPairs;
      bool AllHalfExtracts = true;

      for (User *U : II.users()) {
        auto *Ext = dyn_cast<ExtractElementInst>(Val: U);
        if (!Ext || !Ext->hasOneUse()) {
          AllHalfExtracts = false;
          break;
        }

        auto *Tr = dyn_cast<FPTruncInst>(Val: *Ext->user_begin());
        if (!Tr || !Tr->getType()->isHalfTy()) {
          AllHalfExtracts = false;
          break;
        }

        ExtractTruncPairs.emplace_back(Args&: Ext, Args&: Tr);
      }

      if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
        auto *VecTy = cast<VectorType>(Val: II.getType());
        Type *HalfVecTy =
            VecTy->getWithNewType(EltTy: Type::getHalfTy(C&: II.getContext()));

        // Obtain the original image sample intrinsic's signature
        // and replace its return type with the half-vector for D16 folding
        SmallVector<Type *, 8> SigTys;
        Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: SigTys);
        SigTys[0] = HalfVecTy;

        Module *M = II.getModule();
        Function *HalfDecl =
            Intrinsic::getOrInsertDeclaration(M, id: ImageDimIntr->Intr, Tys: SigTys);

        // Mutate the existing call in place to the half-vector variant; the
        // extract/trunc users are rewritten below to consume it directly.
        II.mutateType(Ty: HalfVecTy);
        II.setCalledFunction(HalfDecl);

        IRBuilder<> Builder(II.getContext());
        for (auto &[Ext, Tr] : ExtractTruncPairs) {
          Value *Idx = Ext->getIndexOperand();

          Builder.SetInsertPoint(Tr);

          // Extract the half element directly; the fptrunc becomes dead.
          Value *HalfExtract = Builder.CreateExtractElement(Vec: &II, Idx);
          HalfExtract->takeName(V: Tr);

          Tr->replaceAllUsesWith(V: HalfExtract);
        }

        for (auto &[Ext, Tr] : ExtractTruncPairs) {
          IC.eraseInstFromFunction(I&: *Tr);
          IC.eraseInstFromFunction(I&: *Ext);
        }

        return &II;
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(i_nocapture: OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
      // A non-convertible gradient (or an intrinsic with no gradients at
      // all) means we cannot do anything; a non-convertible coordinate
      // still permits converting just the gradients (G16).
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
                               : Type::getInt16Ty(C&: II.getContext());

  return modifyIntrinsicCall(
      OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
        }
      });
}
390
/// Return true if a legacy multiply of \p Op0 and \p Op1 (where +/-0.0 times
/// anything, including NaN/Inf, yields +0.0) can be replaced with an ordinary
/// fmul, i.e. when the special-case inputs can be ruled out.
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
      match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  // Fall back to value tracking: if neither operand can be Inf/NaN, then a
  // zero operand would give +0.0 under both semantics anyway.
  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
  if (isKnownNeverInfOrNaN(V: Op0, SQ) && isKnownNeverInfOrNaN(V: Op1, SQ)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}
412
413/// Match an fpext from half to float, or a constant we can convert.
414static Value *matchFPExtFromF16(Value *Arg) {
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
423 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
424 if (!LosesInfo)
425 return ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
432static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(V: UseV, EltNo: i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(BitPosition: i);
451 }
452
453 return DemandedElts;
454}
455
// Trim elements of the end of the vector \p V, if they are
// equal to the first element of the vector.
// Returns the demanded-elements mask with trailing broadcast copies cleared;
// element 0 is always demanded.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(Val: V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
  Value *FirstComponent = findScalarElement(V, EltNo: 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
    SVI->getShuffleMask(Result&: ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      // Trailing elements equal to the first (or undef) are redundant.
      auto *Elt = findScalarElement(V, EltNo: I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(BitPosition: I);
  }

  return DemandedElts;
}
484
// Forward declaration; the definition is not in this part of the file.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(Val: V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(Val: V))
504 return AMDGPU::isArgPassedInSGPR(Arg: A);
505 if (const auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(IntrID: II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(Val: U.getUser())->getParent();
511 }
512 return false;
513}
514
/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
///
/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
///
/// \returns true if the intrinsic was changed (either by SimplifyDemandedBits
/// or by masking an out-of-range constant lane index in place).
bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             unsigned LaneArgIdx) const {
  // Only log2(wavefront size) bits of the lane index are demanded.
  unsigned MaskBits = ST->getWavefrontSizeLog2();
  APInt DemandedMask(32, maskTrailingOnes<unsigned>(N: MaskBits));

  KnownBits Known(32);
  if (IC.SimplifyDemandedBits(I: &II, OpNo: LaneArgIdx, DemandedMask, Known))
    return true;

  if (!Known.isConstant())
    return false;

  // Out of bounds indexes may appear in wave64 code compiled for wave32.
  // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
  // manually fix it up.

  Value *LaneArg = II.getArgOperand(i: LaneArgIdx);
  Constant *MaskedConst =
      ConstantInt::get(Ty: LaneArg->getType(), V: Known.getConstant() & DemandedMask);
  // Constants are uniqued, so pointer inequality means the masked value
  // differs from the current operand.
  if (MaskedConst != LaneArg) {
    II.getOperandUse(i: LaneArgIdx).set(MaskedConst);
    return true;
  }

  return false;
}
545
546static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
547 Function &NewCallee, ArrayRef<Value *> Ops) {
548 SmallVector<OperandBundleDef, 2> OpBundles;
549 Old.getOperandBundlesAsDefs(Defs&: OpBundles);
550
551 CallInst *NewCall = B.CreateCall(Callee: &NewCallee, Args: Ops, OpBundles);
552 NewCall->takeName(V: &Old);
553 return NewCall;
554}
555
556// Return true for sequences of instructions that effectively assign
557// each lane to its thread ID
558static bool isThreadID(const GCNSubtarget &ST, Value *V) {
559 // Case 1:
560 // wave32: mbcnt_lo(-1, 0)
561 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
562 auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_ConstantInt<-1>(),
563 Op1: m_ConstantInt<0>());
564 auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
565 Op0: m_ConstantInt<-1>(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
566 Op0: m_ConstantInt<-1>(), Op1: m_ConstantInt<0>()));
567 if (ST.isWave32() && match(V, P: W32Pred))
568 return true;
569 if (ST.isWave64() && match(V, P: W64Pred))
570 return true;
571
572 return false;
573}
574
// Attempt to capture situations where the index argument matches
// a DPP pattern, and convert to a DPP-based mov
//
// Recognizes index expressions of the form (tid & Mask) | RowIdx, which read
// lane (row_base + RowIdx) within each 16-lane row, and lowers them to
// amdgcn.update.dpp with a ROW_SHARE selector. Returns std::nullopt if no
// pattern matches.
static std::optional<Instruction *>
tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
  Value *Val = II.getArgOperand(i: 0);
  Value *Idx = II.getArgOperand(i: 1);
  auto &B = IC.Builder;

  // DPP16 Row Share requires known wave size, architecture support
  if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare())
    return std::nullopt;

  Value *Tid;
  uint64_t Mask;
  uint64_t RowIdx;
  bool CanDPP16RowShare = false;

  // The mask must keep exactly the row-base bits of the thread ID:
  // wave32 requires Mask & 0x1F == 0x10
  // wave64 requires Mask & 0x3F == 0x30
  uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1;
  uint64_t MaskTarget = MaskCheck & 0xF0;

  // DPP16 Row Share 0: Idx = Tid & Mask
  auto RowShare0Pred = m_And(L: m_Value(V&: Tid), R: m_ConstantInt(V&: Mask));

  // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
  auto RowSharePred =
      m_Or(L: m_And(L: m_Value(V&: Tid), R: m_ConstantInt(V&: Mask)), R: m_ConstantInt(V&: RowIdx));

  // DPP16 Row Share 15: Idx = Tid | 0xF
  auto RowShare15Pred = m_Or(L: m_Value(V&: Tid), R: m_ConstantInt<0xF>());

  if (match(V: Idx, P: RowShare0Pred) && isThreadID(ST, V: Tid)) {
    if ((Mask & MaskCheck) != MaskTarget)
      return std::nullopt;

    RowIdx = 0;
    CanDPP16RowShare = true;
  } else if (match(V: Idx, P: RowSharePred) && isThreadID(ST, V: Tid) && RowIdx < 15 &&
             RowIdx > 0) {
    if ((Mask & MaskCheck) != MaskTarget)
      return std::nullopt;

    CanDPP16RowShare = true;
  } else if (match(V: Idx, P: RowShare15Pred) && isThreadID(ST, V: Tid)) {
    RowIdx = 15;
    CanDPP16RowShare = true;
  }

  if (CanDPP16RowShare) {
    // Emit update.dpp with dpp_ctrl = ROW_SHARE0 + RowIdx, full row/bank
    // masks (0xF), and bound_ctrl = false.
    CallInst *UpdateDPP =
        B.CreateIntrinsic(ID: Intrinsic::amdgcn_update_dpp, Types: Val->getType(),
                          Args: {PoisonValue::get(T: Val->getType()), Val,
                           B.getInt32(C: AMDGPU::DPP::ROW_SHARE0 | RowIdx),
                           B.getInt32(C: 0xF), B.getInt32(C: 0xF), B.getFalse()});
    UpdateDPP->takeName(V: &II);
    UpdateDPP->copyMetadata(SrcInst: II);
    return IC.replaceInstUsesWith(I&: II, V: UpdateDPP);
  }

  // No valid DPP detected
  return std::nullopt;
}
638
/// Hoist a lane intrinsic (readlane/readfirstlane/permlane64) through its
/// operand instruction: (II (OpInst x)) -> (OpInst (II x)).
///
/// This is only done when both instructions are in the same block (so the
/// exec mask cannot change between them) and OpInst has no other users.
/// Returns the replacement instruction, or nullptr if the hoist is not safe
/// or not profitable.
Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                             IntrinsicInst &II) const {
  const auto IID = II.getIntrinsicID();
  assert(IID == Intrinsic::amdgcn_readlane ||
         IID == Intrinsic::amdgcn_readfirstlane ||
         IID == Intrinsic::amdgcn_permlane64);

  Instruction *OpInst = dyn_cast<Instruction>(Val: II.getOperand(i_nocapture: 0));

  // Only do this if both instructions are in the same block
  // (so the exec mask won't change) and the readlane is the only user of its
  // operand.
  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
    return nullptr;

  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);

  // If this is a readlane, check that the second operand is a constant, or is
  // defined before OpInst so we know it's safe to move this intrinsic higher.
  Value *LaneID = nullptr;
  if (IsReadLane) {
    LaneID = II.getOperand(i_nocapture: 1);

    // readlane take an extra operand for the lane ID, so we must check if that
    // LaneID value can be used at the point where we want to move the
    // intrinsic.
    if (auto *LaneIDInst = dyn_cast<Instruction>(Val: LaneID)) {
      if (!IC.getDominatorTree().dominates(Def: LaneIDInst, User: OpInst))
        return nullptr;
    }
  }

  // Hoist the intrinsic (II) through OpInst.
  //
  // (II (OpInst x)) -> (OpInst (II x))
  const auto DoIt = [&](unsigned OpIdx,
                        Function *NewIntrinsic) -> Instruction * {
    SmallVector<Value *, 2> Ops{OpInst->getOperand(i: OpIdx)};
    if (IsReadLane)
      Ops.push_back(Elt: LaneID);

    // Rewrite the intrinsic call.
    CallInst *NewII = rewriteCall(B&: IC.Builder, Old&: II, NewCallee&: *NewIntrinsic, Ops);

    // Rewrite OpInst so it takes the result of the intrinsic now.
    Instruction &NewOp = *OpInst->clone();
    NewOp.setOperand(i: OpIdx, Val: NewII);
    return &NewOp;
  };

  // TODO(?): Should we do more with permlane64?
  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Val: OpInst))
    return nullptr;

  // Unary ops: hoist through the sole operand, same intrinsic signature.
  if (isa<UnaryOperator>(Val: OpInst))
    return DoIt(0, II.getCalledFunction());

  // Casts: the intrinsic must be remangled to the cast's source type.
  if (isa<CastInst>(Val: OpInst)) {
    Value *Src = OpInst->getOperand(i: 0);
    Type *SrcTy = Src->getType();
    if (!isTypeLegal(Ty: SrcTy))
      return nullptr;

    Function *Remangled =
        Intrinsic::getOrInsertDeclaration(M: II.getModule(), id: IID, Tys: {SrcTy});
    return DoIt(0, Remangled);
  }

  // We can also hoist through binary operators if the other operand is uniform.
  if (isa<BinaryOperator>(Val: OpInst)) {
    // FIXME: If we had access to UniformityInfo here we could just check
    // if the operand is uniform.
    if (isTriviallyUniform(U: OpInst->getOperandUse(i: 0)))
      return DoIt(1, II.getCalledFunction());
    if (isTriviallyUniform(U: OpInst->getOperandUse(i: 1)))
      return DoIt(0, II.getCalledFunction());
  }

  return nullptr;
}
720
721std::optional<Instruction *>
722GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
723 Intrinsic::ID IID = II.getIntrinsicID();
724 switch (IID) {
725 case Intrinsic::amdgcn_implicitarg_ptr: {
726 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(F: *II.getFunction());
727
728 uint64_t CurrentOrNullBytes =
729 II.getAttributes().getRetDereferenceableOrNullBytes();
730 if (CurrentOrNullBytes != 0) {
731 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
732 // into dereferenceable(max(A, B))
733 uint64_t NewBytes = std::max(a: CurrentOrNullBytes, b: ImplicitArgBytes);
734 II.addRetAttr(
735 Attr: Attribute::getWithDereferenceableBytes(Context&: II.getContext(), Bytes: NewBytes));
736 II.removeRetAttr(Kind: Attribute::DereferenceableOrNull);
737 return &II;
738 }
739
740 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
741 uint64_t NewBytes = std::max(a: CurrentBytes, b: ImplicitArgBytes);
742 if (NewBytes != CurrentBytes) {
743 II.addRetAttr(
744 Attr: Attribute::getWithDereferenceableBytes(Context&: II.getContext(), Bytes: NewBytes));
745 return &II;
746 }
747
748 return std::nullopt;
749 }
750 case Intrinsic::amdgcn_rcp: {
751 Value *Src = II.getArgOperand(i: 0);
752 if (isa<PoisonValue>(Val: Src))
753 return IC.replaceInstUsesWith(I&: II, V: Src);
754
755 // TODO: Move to ConstantFolding/InstSimplify?
756 if (isa<UndefValue>(Val: Src)) {
757 Type *Ty = II.getType();
758 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
759 return IC.replaceInstUsesWith(I&: II, V: QNaN);
760 }
761
762 if (II.isStrictFP())
763 break;
764
765 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
766 const APFloat &ArgVal = C->getValueAPF();
767 APFloat Val(ArgVal.getSemantics(), 1);
768 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
769
770 // This is more precise than the instruction may give.
771 //
772 // TODO: The instruction always flushes denormal results (except for f16),
773 // should this also?
774 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
775 }
776
777 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
778 if (!FMF.allowContract())
779 break;
780 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
781 if (!SrcCI)
782 break;
783
784 auto IID = SrcCI->getIntrinsicID();
785 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
786 //
787 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
788 // relaxed.
789 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
790 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
791 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
792 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
793 break;
794
795 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
796 break;
797
798 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
799 M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, Tys: {SrcCI->getType()});
800
801 InnerFMF |= FMF;
802 II.setFastMathFlags(InnerFMF);
803
804 II.setCalledFunction(NewDecl);
805 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
806 }
807
808 break;
809 }
810 case Intrinsic::amdgcn_sqrt:
811 case Intrinsic::amdgcn_rsq:
812 case Intrinsic::amdgcn_tanh: {
813 Value *Src = II.getArgOperand(i: 0);
814 if (isa<PoisonValue>(Val: Src))
815 return IC.replaceInstUsesWith(I&: II, V: Src);
816
817 // TODO: Move to ConstantFolding/InstSimplify?
818 if (isa<UndefValue>(Val: Src)) {
819 Type *Ty = II.getType();
820 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
821 return IC.replaceInstUsesWith(I&: II, V: QNaN);
822 }
823
824 // f16 amdgcn.sqrt is identical to regular sqrt.
825 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
826 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
827 M: II.getModule(), id: Intrinsic::sqrt, Tys: {II.getType()});
828 II.setCalledFunction(NewDecl);
829 return &II;
830 }
831
832 break;
833 }
834 case Intrinsic::amdgcn_log:
835 case Intrinsic::amdgcn_exp2: {
836 const bool IsLog = IID == Intrinsic::amdgcn_log;
837 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
838 Value *Src = II.getArgOperand(i: 0);
839 Type *Ty = II.getType();
840
841 if (isa<PoisonValue>(Val: Src))
842 return IC.replaceInstUsesWith(I&: II, V: Src);
843
844 if (IC.getSimplifyQuery().isUndefValue(V: Src))
845 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
846
847 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
848 if (C->isInfinity()) {
849 // exp2(+inf) -> +inf
850 // log2(+inf) -> +inf
851 if (!C->isNegative())
852 return IC.replaceInstUsesWith(I&: II, V: C);
853
854 // exp2(-inf) -> 0
855 if (IsExp && C->isNegative())
856 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
857 }
858
859 if (II.isStrictFP())
860 break;
861
862 if (C->isNaN()) {
863 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
864 return IC.replaceInstUsesWith(I&: II, V: Quieted);
865 }
866
867 // f32 instruction doesn't handle denormals, f16 does.
868 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
869 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
870 : ConstantFP::get(Ty, V: 1.0);
871 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
872 }
873
874 if (IsLog && C->isNegative())
875 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
876
877 // TODO: Full constant folding matching hardware behavior.
878 }
879
880 break;
881 }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(i: 0);
    // Constant fold: split the value into significand and exponent.
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
      int Exp;
      APFloat Significand =
          frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
      }

      // Match instruction special case behavior.
      // APFloat's frexp reports IEK_NaN/IEK_Inf sentinels for nan/inf
      // inputs; the instruction's exponent result is 0 in those cases.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(I&: II,
                                    V: ConstantInt::getSigned(Ty: II.getType(), V: Exp));
    }

    // Poison and undef inputs propagate to the (mant or exp) result.
    if (isa<PoisonValue>(Val: Src))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (isa<UndefValue>(Val: Src)) {
      return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
    // With a constant test mask, rewrite to the generic llvm.is.fpclass so
    // the target-independent combines can handle it.
    if (CMask) {
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          M: II.getModule(), id: Intrinsic::is_fpclass, Tys: Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
                                           V: CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(V: Src1))
      return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
      return IC.replaceInstUsesWith(I&: II, V: CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    // Fold a scalar lane of the pack when its f16 value is known exactly:
    // constants are converted with round-toward-zero, and an fpext-from-half
    // operand round-trips back to the original half value.
    auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
      Type *HalfTy = Type::getHalfTy(C&: Arg->getContext());

      if (isa<PoisonValue>(Val: Arg))
        return PoisonValue::get(T: HalfTy);
      if (isa<UndefValue>(Val: Arg))
        return UndefValue::get(T: HalfTy);

      ConstantFP *CFP = nullptr;
      if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
        bool LosesInfo;
        APFloat Val(CFP->getValueAPF());
        Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
        return ConstantFP::get(Ty: HalfTy, V: Val);
      }

      // (fpext half %x) truncated back to half is exactly %x.
      Value *Src = nullptr;
      if (match(V: Arg, P: m_FPExt(Op: m_Value(V&: Src)))) {
        if (Src->getType()->isHalfTy())
          return Src;
      }

      return nullptr;
    };

    // Only fold when both lanes simplify; build the result vector by
    // inserting the two scalars into poison.
    if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(i: 0))) {
      if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(i: 1))) {
        Value *V = PoisonValue::get(T: II.getType());
        V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src0, Idx: (uint64_t)0);
        V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src1, Idx: (uint64_t)1);
        return IC.replaceInstUsesWith(I&: II, V);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);

    // TODO: Replace call with scalar operation if only one element is poison.
    // The poison check must come first: PoisonValue is a subclass of
    // UndefValue, and poison is the stronger result.
    if (isa<PoisonValue>(Val: Src0) && isa<PoisonValue>(Val: Src1))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
      return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
    }

    break;
  }
996 case Intrinsic::amdgcn_cvt_off_f32_i4: {
997 Value* Arg = II.getArgOperand(i: 0);
998 Type *Ty = II.getType();
999
1000 if (isa<PoisonValue>(Val: Arg))
1001 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: Ty));
1002
1003 if(IC.getSimplifyQuery().isUndefValue(V: Arg))
1004 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty));
1005
1006 ConstantInt *CArg = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1007 if (!CArg)
1008 break;
1009
1010 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1011 constexpr size_t ResValsSize = 16;
1012 static constexpr float ResVals[ResValsSize] = {
1013 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1014 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1015 Constant *Res =
1016 ConstantFP::get(Ty, V: ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1017 return IC.replaceInstUsesWith(I&: II, V: Res);
1018 }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(i: 0);
    if (isa<UndefValue>(Val: Src)) {
      return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      // Only the low log2(IntSize) bits of the width are used, so a width
      // that is a multiple of the bit size extracts nothing -> 0.
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      // As with the width, only the low bits of the offset are read.
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            I&: II, OpNum: 1,
            V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    // Both offset and width must be constant to emit plain shifts.
    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe.  If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      // Field ends below the top bit: shift left to discard the high bits,
      // then shift right (arithmetic for sbfe) to position and extend.
      Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
                                 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
      RightShift->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: RightShift);
    }

    // Field reaches the top bit: a single right shift suffices.
    Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
                               : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);

    RightShift->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    // Replace the sources of disabled export channels with poison.
    ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    // The compressed form packs two channels per source operand, so each of
    // its two sources is guarded by two enable bits.
    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(i: I + 2);
        // Avoid reporting a change if the operand is already poison.
        if (!isa<PoisonValue>(Val: Src)) {
          IC.replaceOperand(I&: II, OpNum: I + 2, V: PoisonValue::get(T: Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);
    Value *Src2 = II.getArgOperand(i: 2);

    // Poison in any operand propagates to the result.
    for (Value *Src : {Src0, Src1, Src2}) {
      if (isa<PoisonValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    if (II.isStrictFP())
      break;

    // med3 with a nan input acts like
    //   v_min_f32(v_min_f32(s0, s1), s2)
    //
    // Signalingness is ignored with ieee=0, so we fold to
    // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
    // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
    // returned signaling nan will not be quieted.

    // ieee=1
    // s0 snan: s2
    // s1 snan: s2
    // s2 snan: qnan

    // s0 qnan: min(s1, s2)
    // s1 qnan: min(s0, s2)
    // s2 qnan: min(s0, s1)

    // ieee=0
    // s0 _nan: min(s1, s2)
    // s1 _nan: min(s0, s2)
    // s2 _nan: min(s0, s1)

    // med3 behavior with infinity
    // s0 +inf: max(s1, s2)
    // s1 +inf: max(s0, s2)
    // s2 +inf: max(s0, s1)
    // s0 -inf: min(s1, s2)
    // s1 -inf: min(s0, s2)
    // s2 -inf: min(s0, s1)

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    Value *V = nullptr;
    const APFloat *ConstSrc0 = nullptr;
    const APFloat *ConstSrc1 = nullptr;
    const APFloat *ConstSrc2 = nullptr;

    // Each arm below handles one operand being a known nan/inf (or undef);
    // which min/max flavor to use depends on the known FP environment.
    if ((match(V: Src0, P: m_APFloat(Res&: ConstSrc0)) &&
         (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
        isa<UndefValue>(Val: Src0)) {
      const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
      switch (fpenvIEEEMode(I: II)) {
      case KnownIEEEMode::On:
        // TODO: If Src2 is snan, does it need quieting?
        if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
          return IC.replaceInstUsesWith(I&: II, V: Src2);

        V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src1, RHS: Src2)
                          : IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
        break;
      case KnownIEEEMode::Off:
        V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src1, RHS: Src2)
                          : IC.Builder.CreateMinimumNum(LHS: Src1, RHS: Src2);
        break;
      case KnownIEEEMode::Unknown:
        break;
      }
    } else if ((match(V: Src1, P: m_APFloat(Res&: ConstSrc1)) &&
                (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
               isa<UndefValue>(Val: Src1)) {
      const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
      switch (fpenvIEEEMode(I: II)) {
      case KnownIEEEMode::On:
        // TODO: If Src2 is snan, does it need quieting?
        if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
          return IC.replaceInstUsesWith(I&: II, V: Src2);

        V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src2)
                          : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
        break;
      case KnownIEEEMode::Off:
        V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src2)
                          : IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src2);
        break;
      case KnownIEEEMode::Unknown:
        break;
      }
    } else if ((match(V: Src2, P: m_APFloat(Res&: ConstSrc2)) &&
                (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
               isa<UndefValue>(Val: Src2)) {
      switch (fpenvIEEEMode(I: II)) {
      case KnownIEEEMode::On:
        // An snan in s2 produces the quieted nan (see table above).
        if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
          auto *Quieted = ConstantFP::get(Ty: II.getType(), V: ConstSrc2->makeQuiet());
          return IC.replaceInstUsesWith(I&: II, V: Quieted);
        }

        V = (ConstSrc2 && ConstSrc2->isPosInfinity())
                ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1)
                : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src1);
        break;
      case KnownIEEEMode::Off:
        V = (ConstSrc2 && ConstSrc2->isNegInfinity())
                ? IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src1)
                : IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src1);
        break;
      case KnownIEEEMode::Unknown:
        break;
      }
    }

    if (V) {
      // Carry the fast-math flags and name over to the replacement call.
      if (auto *CI = dyn_cast<CallInst>(Val: V)) {
        CI->copyFastMathFlags(I: &II);
        CI->takeName(V: &II);
      }
      return IC.replaceInstUsesWith(I&: II, V);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
      std::swap(a&: Src0, b&: Src1);
      Swap = true;
    }

    if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
      std::swap(a&: Src1, b&: Src2);
      Swap = true;
    }

    if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
      std::swap(a&: Src0, b&: Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(i: 0, v: Src0);
      II.setArgOperand(i: 1, v: Src1);
      II.setArgOperand(i: 2, v: Src2);
      return &II;
    }

    // All operands constant: fold via the median helper (fmed3AMDGCN).
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
          APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
                                       Src2: C2->getValueAPF());
          return IC.replaceInstUsesWith(I&: II,
                                        V: ConstantFP::get(Ty: II.getType(), V: Result));
        }
      }
    }

    // Narrowing to f16 requires the 16-bit med3 instruction.
    if (!ST->hasMed3_16())
      break;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (Value *X = matchFPExtFromF16(Arg: Src0)) {
      if (Value *Y = matchFPExtFromF16(Arg: Src1)) {
        if (Value *Z = matchFPExtFromF16(Arg: Src2)) {
          Value *NewCall = IC.Builder.CreateIntrinsic(
              ID: IID, Types: {X->getType()}, Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
          return new FPExtInst(NewCall, II.getType());
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);

    if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
        // Compare folded to false: the per-lane result mask is all zero.
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        // NOTE(review): if constant folding fails (CCmp == nullptr) control
        // still reaches the EXEC read below, which is only correct for a
        // known-true compare -- confirm folding cannot fail for the constant
        // operand kinds that reach here.
        Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
        MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
        Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: Intrinsic::read_register,
                                                       Types: II.getType(), Args);
        NewCall->addFnAttr(Kind: Attribute::Convergent);
        NewCall->takeName(V: &II);
        return IC.replaceInstUsesWith(I&: II, V: NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(i: 0, v: Src1);
      II.setArgOperand(i: 1, v: Src0);
      II.setArgOperand(
          i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
      return &II;
    }

    // The folds below only apply to the equality predicates.
    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(V: Src1, P: PatternMatch::m_One()) &&
          match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
         (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
          match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(Bitwidth: 1)) {
      IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
      IC.replaceOperand(I&: II, OpNum: 2,
                        V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
      return &II;
    }

    CmpPredicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    // -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(V: Src1, P: PatternMatch::m_Zero()) &&
        match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
                         Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
                                R: PatternMatch::m_Value(V&: SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(pred: SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        // Extend both operands, matching the signedness of the predicate.
        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
          if (CmpInst::isSigned(Pred: SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
            SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
            SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
      CallInst *NewCall = IC.Builder.CreateIntrinsic(
          ID: NewIID, Types: {II.getType(), SrcLHS->getType()}, Args);
      NewCall->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: NewCall);
    }

    break;
  }
1423 case Intrinsic::amdgcn_mbcnt_hi: {
1424 // exec_hi is all 0, so this is just a copy.
1425 if (ST->isWave32())
1426 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
1427 break;
1428 }
  case Intrinsic::amdgcn_ballot: {
    Value *Arg = II.getArgOperand(i: 0);
    if (isa<PoisonValue>(Val: Arg))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (auto *Src = dyn_cast<ConstantInt>(Val: Arg)) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
      }
      // Note ballot(i1 1) is deliberately not folded here: its value depends
      // on which lanes are active, so it is not simply all-ones.
    }
    if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
      // On wave32 the upper 32 result bits are zero, so shrink:
      // %b64 = call i64 ballot.i64(...)
      // =>
      // %b32 = call i32 ballot.i32(...)
      // %b64 = zext i32 %b32 to i64
      Value *Call = IC.Builder.CreateZExt(
          V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
                                      Types: {IC.Builder.getInt32Ty()},
                                      Args: {II.getArgOperand(i: 0)}),
          DestTy: II.getType());
      Call->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: Call);
    }
    break;
  }
1455 case Intrinsic::amdgcn_wavefrontsize: {
1456 if (ST->isWaveSizeKnown())
1457 return IC.replaceInstUsesWith(
1458 I&: II, V: ConstantInt::get(Ty: II.getType(), V: ST->getWavefrontSize()));
1459 break;
1460 }
1461 case Intrinsic::amdgcn_wqm_vote: {
1462 // wqm_vote is identity when the argument is constant.
1463 if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
1464 break;
1465
1466 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
1467 }
1468 case Intrinsic::amdgcn_kill: {
1469 const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1470 if (!C || !C->getZExtValue())
1471 break;
1472
1473 // amdgcn.kill(i1 1) is a no-op
1474 return IC.eraseInstFromFunction(I&: II);
1475 }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // The second operand is copied to m0, but is only actually used for
    // certain message types. For message types that are known to not use m0,
    // fold it to poison.
    using namespace AMDGPU::SendMsg;

    Value *M0Val = II.getArgOperand(i: 1);
    // Already poison; nothing left to simplify.
    if (isa<PoisonValue>(Val: M0Val))
      break;

    auto *MsgImm = cast<ConstantInt>(Val: II.getArgOperand(i: 0));
    uint16_t MsgId, OpId, StreamId;
    decodeMsg(Val: MsgImm->getZExtValue(), MsgId, OpId, StreamId, STI: *ST);

    if (!msgDoesNotUseM0(MsgId, STI: *ST))
      break;

    // Drop UB-implying attributes since we're replacing with poison.
    II.dropUBImplyingAttrsAndMetadata();
    IC.replaceOperand(I&: II, OpNum: 1, V: PoisonValue::get(T: M0Val->getType()));
    return nullptr;
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(i: 0);

    // Operand indices: row_mask is 3, bank_mask is 4, bound_ctrl is 5.
    auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
    auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
    auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
    // Only fold with bound_ctrl set and full row/bank masks, and skip if the
    // old value is already poison.
    if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<PoisonValue>(Val: Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(i: 0);
    if (isa<PoisonValue>(Val: VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4 /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5
    // For permlane16_var and permlanex16_var it should be 4
    unsigned int BcIdx = FiIdx + 1;

    // vdst_in is only read when both fi and bound_ctrl are clear; if either
    // is set, replace it with poison.
    ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_ds_bpermute: {
    // If the data argument is uniform these intrinsics return it unchanged.
    unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
    const Use &Src = II.getArgOperandUse(i: SrcIdx);
    if (isTriviallyUniform(U: Src))
      return IC.replaceInstUsesWith(I&: II, V: Src.get());

    // Shrink the demanded bits of readlane's lane-index operand.
    if (IID == Intrinsic::amdgcn_readlane &&
        simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
      return &II;

    // If the lane argument of bpermute is uniform, change it to readlane. This
    // generates better code and can enable further optimizations because
    // readlane is AlwaysUniform.
    if (IID == Intrinsic::amdgcn_ds_bpermute) {
      const Use &Lane = II.getArgOperandUse(i: 0);
      if (isTriviallyUniform(U: Lane)) {
        // bpermute's address operand is scaled by 4 relative to readlane's
        // lane index, hence the shift right by 2.
        Value *NewLane = IC.Builder.CreateLShr(LHS: Lane, RHS: 2);
        Function *NewDecl = Intrinsic::getOrInsertDeclaration(
            M: II.getModule(), id: Intrinsic::amdgcn_readlane, Tys: II.getType());
        II.setCalledFunction(NewDecl);
        II.setOperand(i_nocapture: 0, Val_nocapture: Src);
        II.setOperand(i_nocapture: 1, Val_nocapture: NewLane);
        return &II;
      }
    }

    // Try to hoist the lane intrinsic through its operand (not valid for
    // bpermute, whose result is lane-dependent data movement).
    if (IID != Intrinsic::amdgcn_ds_bpermute) {
      if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
        return Res;
    }

    return std::nullopt;
  }
1576 case Intrinsic::amdgcn_writelane: {
1577 // TODO: Fold bitcast like readlane.
1578 if (simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1579 return &II;
1580 return std::nullopt;
1581 }
  case Intrinsic::amdgcn_trig_preop: {
    // The intrinsic is declared with name mangling, but currently the
    // instruction only exists for f64
    if (!II.getType()->isDoubleTy())
      break;

    Value *Src = II.getArgOperand(i: 0);
    Value *Segment = II.getArgOperand(i: 1);
    if (isa<PoisonValue>(Val: Src) || isa<PoisonValue>(Val: Segment))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (isa<UndefValue>(Val: Segment))
      return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));

    // Sign bit is not used.
    Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Val: Src);
    if (StrippedSign != Src)
      return IC.replaceOperand(I&: II, OpNum: 0, V: StrippedSign);

    if (II.isStrictFP())
      break;

    const ConstantFP *CSrc = dyn_cast<ConstantFP>(Val: Src);
    if (!CSrc && !isa<UndefValue>(Val: Src))
      break;

    // The instruction ignores special cases, and literally just extracts the
    // exponents. Fold undef to nan, and index the table as normal.
    APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
                         : APFloat::getQNaN(Sem: II.getType()->getFltSemantics())
                               .bitcastToAPInt();

    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
    if (!Cseg) {
      if (isa<UndefValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
      break;
    }

    // Biased exponent: bits [62:52] of the f64 encoding.
    unsigned Exponent = FSrcInt.extractBitsAsZExtValue(numBits: 11, bitPosition: 52);
    // Only the low 5 bits of the segment operand are used.
    unsigned SegmentVal = Cseg->getValue().trunc(width: 5).getZExtValue();
    // Each segment selects a 53-bit window into the 2/PI bit string; large
    // exponents push the window further along.
    unsigned Shift = SegmentVal * 53;
    if (Exponent > 1077)
      Shift += Exponent - 1077;

    // 2.0/PI table.
    static const uint32_t TwoByPi[] = {
        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
        0x56033046};

    // Return 0 for outbound segment (hardware behavior).
    unsigned Idx = Shift >> 5;
    if (Idx + 2 >= std::size(TwoByPi)) {
      APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
      return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
    }

    // Assemble 64 bits of the window from three 32-bit table words, then keep
    // the top 53 bits as the integer significand.
    unsigned BShift = Shift & 0x1f;
    uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + 1]);
    uint64_t Tlo = Make_64(High: TwoByPi[Idx + 2], Low: 0);
    if (BShift)
      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
    Thi = Thi >> 11;
    APFloat Result = APFloat((double)Thi);

    // Rescale to the window position; very large exponents receive an extra
    // 2^128 factor (instruction special case).
    int Scale = -53 - Shift;
    if (Exponent >= 1968)
      Scale += 128;

    Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
    return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(i: 0);
    Value *Op1 = II.getArgOperand(i: 1);

    // Poison in either operand propagates.
    for (Value *Src : {Op0, Op1}) {
      if (isa<PoisonValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
        match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
      FMul->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(i: 0);
    Value *Op1 = II.getArgOperand(i: 1);
    Value *Op2 = II.getArgOperand(i: 2);

    // Poison in any operand propagates.
    for (Value *Src : {Op0, Op1, Op2}) {
      if (isa<PoisonValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
        match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0. Emit 0.0 + Op2 instead.
      auto *Zero = ConstantFP::getZero(Ty: II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
      FAdd->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          M: II.getModule(), id: Intrinsic::fma, Tys: II.getType()));
      return &II;
    }
    break;
  }
1716 case Intrinsic::amdgcn_is_shared:
1717 case Intrinsic::amdgcn_is_private: {
1718 Value *Src = II.getArgOperand(i: 0);
1719 if (isa<PoisonValue>(Val: Src))
1720 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1721 if (isa<UndefValue>(Val: Src))
1722 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1723
1724 if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
1725 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
1726 break;
1727 }
1728 case Intrinsic::amdgcn_make_buffer_rsrc: {
1729 Value *Src = II.getArgOperand(i: 0);
1730 if (isa<PoisonValue>(Val: Src))
1731 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1732 return std::nullopt;
1733 }
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    // Try to shrink the store data vector to only the components the
    // subtarget actually needs to write.
    if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
      break;

    // How demanded elements are computed depends on the subtarget's default
    // component behavior; bail if neither feature is present.
    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
    else
      break;

    // Image intrinsics carry the written components in a dmask operand at
    // index 1; buffer/tbuffer stores have no dmask (-1).
    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              IsLoad: false)) {
      return IC.eraseInstFromFunction(I&: II);
    }

    break;
  }
1771 case Intrinsic::amdgcn_prng_b32: {
1772 auto *Src = II.getArgOperand(i: 0);
1773 if (isa<UndefValue>(Val: Src)) {
1774 return IC.replaceInstUsesWith(I&: II, V: Src);
1775 }
1776 return std::nullopt;
1777 }
1778 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1779 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1780 Value *Src0 = II.getArgOperand(i: 0);
1781 Value *Src1 = II.getArgOperand(i: 1);
1782 uint64_t CBSZ = cast<ConstantInt>(Val: II.getArgOperand(i: 3))->getZExtValue();
1783 uint64_t BLGP = cast<ConstantInt>(Val: II.getArgOperand(i: 4))->getZExtValue();
1784 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
1785 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
1786
1787 auto getFormatNumRegs = [](unsigned FormatVal) {
1788 switch (FormatVal) {
1789 case AMDGPU::MFMAScaleFormats::FP6_E2M3:
1790 case AMDGPU::MFMAScaleFormats::FP6_E3M2:
1791 return 6u;
1792 case AMDGPU::MFMAScaleFormats::FP4_E2M1:
1793 return 4u;
1794 case AMDGPU::MFMAScaleFormats::FP8_E4M3:
1795 case AMDGPU::MFMAScaleFormats::FP8_E5M2:
1796 return 8u;
1797 default:
1798 llvm_unreachable("invalid format value");
1799 }
1800 };
1801
1802 bool MadeChange = false;
1803 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1804 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1805
1806 // Depending on the used format, fewer registers are required so shrink the
1807 // vector type.
1808 if (Src0Ty->getNumElements() > Src0NumElts) {
1809 Src0 = IC.Builder.CreateExtractVector(
1810 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
1811 Idx: uint64_t(0));
1812 MadeChange = true;
1813 }
1814
1815 if (Src1Ty->getNumElements() > Src1NumElts) {
1816 Src1 = IC.Builder.CreateExtractVector(
1817 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
1818 Idx: uint64_t(0));
1819 MadeChange = true;
1820 }
1821
1822 if (!MadeChange)
1823 return std::nullopt;
1824
1825 SmallVector<Value *, 10> Args(II.args());
1826 Args[0] = Src0;
1827 Args[1] = Src1;
1828
1829 CallInst *NewII = IC.Builder.CreateIntrinsic(
1830 ID: IID, Types: {Src0->getType(), Src1->getType()}, Args, FMFSource: &II);
1831 NewII->takeName(V: &II);
1832 return IC.replaceInstUsesWith(I&: II, V: NewII);
1833 }
1834 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
1835 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1836 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
1837 Value *Src0 = II.getArgOperand(i: 1);
1838 Value *Src1 = II.getArgOperand(i: 3);
1839 unsigned FmtA = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
1840 uint64_t FmtB = cast<ConstantInt>(Val: II.getArgOperand(i: 2))->getZExtValue();
1841 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
1842 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
1843
1844 bool MadeChange = false;
1845 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtA);
1846 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtB);
1847
1848 // Depending on the used format, fewer registers are required so shrink the
1849 // vector type.
1850 if (Src0Ty->getNumElements() > Src0NumElts) {
1851 Src0 = IC.Builder.CreateExtractVector(
1852 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
1853 Idx: IC.Builder.getInt64(C: 0));
1854 MadeChange = true;
1855 }
1856
1857 if (Src1Ty->getNumElements() > Src1NumElts) {
1858 Src1 = IC.Builder.CreateExtractVector(
1859 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
1860 Idx: IC.Builder.getInt64(C: 0));
1861 MadeChange = true;
1862 }
1863
1864 if (!MadeChange)
1865 return std::nullopt;
1866
1867 SmallVector<Value *, 13> Args(II.args());
1868 Args[1] = Src0;
1869 Args[3] = Src1;
1870
1871 CallInst *NewII = IC.Builder.CreateIntrinsic(
1872 ID: IID, Types: {II.getArgOperand(i: 5)->getType(), Src0->getType(), Src1->getType()},
1873 Args, FMFSource: &II);
1874 NewII->takeName(V: &II);
1875 return IC.replaceInstUsesWith(I&: II, V: NewII);
1876 }
1877 case Intrinsic::amdgcn_wave_shuffle: {
1878 if (!ST->hasDPP())
1879 return std::nullopt;
1880
1881 return tryWaveShuffleDPP(ST: *ST, IC, II);
1882 }
1883 }
1884 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1885 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1886 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1887 }
1888 return std::nullopt;
1889}
1890
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note the asymmetry with loads: simplifying an image or buffer *store*
/// intrinsic updates the definition of the intrinsic's vector data argument,
/// whereas simplifying image and buffer *loads* updates uses of the result.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  // The vector being narrowed is the result for loads, and operand 0 (the
  // stored data) for stores.
  auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
                                          : II.getOperand(i_nocapture: 0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  // A single-element access cannot be narrowed any further.
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      // Sentinel: "offset operand position unknown for this intrinsic; do not
      // fold leading unused components into the offset".
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        // The skipped front components are folded into the byte offset.
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;

    // Rebuild the dmask, keeping only bits whose corresponding packed vector
    // element is demanded. OrigLdStIdx tracks the element position within the
    // packed load/store vector for each set dmask bit.
    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.popcount();
  // Nothing demanded: the entire value is poison.
  if (!NewNumElts)
    return PoisonValue::get(T: IIVTy);

  // Already a dense prefix covering the whole vector; nothing to shrink,
  // though the dmask may still have been narrowed above.
  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
    return nullptr;

  // Shrink the data type (the first overloaded type), scalarizing when only
  // one element remains.
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    // For stores, compact the demanded elements of the data operand to the
    // front of a narrower vector.
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(Elt: OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
  }

  CallInst *NewCall =
      IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: OverloadTys, Args);
  NewCall->takeName(V: &II);
  NewCall->copyMetadata(SrcInst: II);

  if (IsLoad) {
    if (NewNumElts == 1) {
      // Scalar load: re-insert the result at its original lane; all other
      // lanes are poison.
      return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
                                            Idx: DemandedElts.countr_zero());
    }

    // Scatter the narrow load result back to the original lane positions.
    // Non-demanded lanes use index NewNumElts, which selects from the
    // implicit poison second shuffle operand.
    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(Elt: NewLoadIdx++);
      else
        EltMask.push_back(Elt: NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);

    return Shuffle;
  }

  return NewCall;
}
2054
2055Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
2056 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2057 APInt &UndefElts) const {
2058 auto *VT = dyn_cast<FixedVectorType>(Val: II.getType());
2059 if (!VT)
2060 return nullptr;
2061
2062 const unsigned FirstElt = DemandedElts.countr_zero();
2063 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2064 const unsigned MaskLen = LastElt - FirstElt + 1;
2065
2066 unsigned OldNumElts = VT->getNumElements();
2067 if (MaskLen == OldNumElts && MaskLen != 1)
2068 return nullptr;
2069
2070 Type *EltTy = VT->getElementType();
2071 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: MaskLen);
2072
2073 // Theoretically we should support these intrinsics for any legal type. Avoid
2074 // introducing cases that aren't direct register types like v3i16.
2075 if (!isTypeLegal(Ty: NewVT))
2076 return nullptr;
2077
2078 Value *Src = II.getArgOperand(i: 0);
2079
2080 // Make sure convergence tokens are preserved.
2081 // TODO: CreateIntrinsic should allow directly copying bundles
2082 SmallVector<OperandBundleDef, 2> OpBundles;
2083 II.getOperandBundlesAsDefs(Defs&: OpBundles);
2084
2085 Module *M = IC.Builder.GetInsertBlock()->getModule();
2086 Function *Remangled =
2087 Intrinsic::getOrInsertDeclaration(M, id: II.getIntrinsicID(), Tys: {NewVT});
2088
2089 if (MaskLen == 1) {
2090 Value *Extract = IC.Builder.CreateExtractElement(Vec: Src, Idx: FirstElt);
2091
2092 // TODO: Preserve callsite attributes?
2093 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2094
2095 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: II.getType()),
2096 NewElt: NewCall, Idx: FirstElt);
2097 }
2098
2099 SmallVector<int> ExtractMask(MaskLen, -1);
2100 for (unsigned I = 0; I != MaskLen; ++I) {
2101 if (DemandedElts[FirstElt + I])
2102 ExtractMask[I] = FirstElt + I;
2103 }
2104
2105 Value *Extract = IC.Builder.CreateShuffleVector(V: Src, Mask: ExtractMask);
2106
2107 // TODO: Preserve callsite attributes?
2108 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2109
2110 SmallVector<int> InsertMask(OldNumElts, -1);
2111 for (unsigned I = 0; I != MaskLen; ++I) {
2112 if (DemandedElts[FirstElt + I])
2113 InsertMask[FirstElt + I] = I;
2114 }
2115
2116 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2117 // call behind.
2118 return IC.Builder.CreateShuffleVector(V: NewCall, Mask: InsertMask);
2119}
2120
2121std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
2122 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2123 APInt &UndefElts2, APInt &UndefElts3,
2124 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2125 SimplifyAndSetOp) const {
2126 switch (II.getIntrinsicID()) {
2127 case Intrinsic::amdgcn_readfirstlane:
2128 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2129 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2130 case Intrinsic::amdgcn_raw_buffer_load:
2131 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2132 case Intrinsic::amdgcn_raw_buffer_load_format:
2133 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2134 case Intrinsic::amdgcn_raw_tbuffer_load:
2135 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2136 case Intrinsic::amdgcn_s_buffer_load:
2137 case Intrinsic::amdgcn_struct_buffer_load:
2138 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2139 case Intrinsic::amdgcn_struct_buffer_load_format:
2140 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2141 case Intrinsic::amdgcn_struct_tbuffer_load:
2142 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2143 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2144 default: {
2145 if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
2146 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
2147 }
2148 break;
2149 }
2150 }
2151 return std::nullopt;
2152}
2153