//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "llvm/ADT/FloatingPointMode.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
namespace {

// Row type for the TableGen-generated DMask intrinsic lookup table included
// below; each entry holds an image intrinsic ID.
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

// Pull in the generated searchable-table implementation for the struct above.
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "AMDGPUGenSearchableTables.inc"

} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(A: Src1, B: Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(A: Src0, B: Src2);
59
60 return maxnum(A: Src0, B: Src1);
61}
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(Bitwidth: 16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
79 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
80 losesInfo: &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
94 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(Bitwidth: 16))
98 return true;
99 }
100
101 return false;
102}
103
104// Convert a value to 16-bit.
105static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
106 Type *VTy = V.getType();
107 if (isa<FPExtInst, SExtInst, ZExtInst>(Val: &V))
108 return cast<Instruction>(Val: &V)->getOperand(i: 0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
///
/// OldIntr and InstToReplace may be the same instruction (the common case),
/// or InstToReplace may be a user of OldIntr (e.g. a trailing fptrunc being
/// folded into a D16 image intrinsic); both are erased.
///
/// Returns std::nullopt if the old intrinsic's overloaded signature cannot
/// be recovered, otherwise the result of erasing InstToReplace.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  // Recover the overloaded type signature of the old call so it can be
  // adjusted by Func; bail out if it cannot be determined.
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: NewIntr, Types: ArgTys, Args);
  NewCall->takeName(V: &OldIntr);
  // Preserve metadata and (where applicable) fast-math flags on the new call.
  NewCall->copyMetadata(SrcInst: OldIntr);
  if (isa<FPMathOperator>(Val: NewCall))
    NewCall->copyFastMathFlags(I: &OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);

  // When OldIntr != InstToReplace, both must go: erase InstToReplace first
  // (its uses were just replaced), then the now-dead OldIntr.
  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto *RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(I&: OldIntr);

  return RetValue;
}
152
/// Simplify an image-dimension intrinsic by switching to a cheaper variant or
/// narrowing operand/result types:
///  * _l  -> _lz      when the constant LOD is zero or negative,
///  * _mip -> non-mip when the constant mip level is zero,
///  * _b  -> no-bias  when the constant bias is zero,
///  * _o  -> no-offset when the constant offset is zero,
///  * D16 result folding when all uses truncate the result to half,
///  * A16/G16 folding when addresses/gradients fit in 16 bits.
static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
                                                     Dim: ImageDimIntr->Dim);
        // The _lz variant has no LOD operand, so drop it from the arg list.
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
                                                     Dim: ImageDimIntr->Dim);
        // The non-mip variant has no mip operand, so drop it.
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
                                                     Dim: ImageDimIntr->Dim);
        // The bias is also an overloaded type parameter of the intrinsic, so
        // remove both the argument and its type entry.
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          // Same intrinsic ID, but the narrower return type selects D16.
          return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
                                     Func: [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }

      // Only perform D16 folding if every user of the image sample is
      // an ExtractElementInst immediately followed by an FPTrunc to half.
      SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
          ExtractTruncPairs;
      bool AllHalfExtracts = true;

      for (User *U : II.users()) {
        auto *Ext = dyn_cast<ExtractElementInst>(Val: U);
        if (!Ext || !Ext->hasOneUse()) {
          AllHalfExtracts = false;
          break;
        }

        auto *Tr = dyn_cast<FPTruncInst>(Val: *Ext->user_begin());
        if (!Tr || !Tr->getType()->isHalfTy()) {
          AllHalfExtracts = false;
          break;
        }

        ExtractTruncPairs.emplace_back(Args&: Ext, Args&: Tr);
      }

      if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
        auto *VecTy = cast<VectorType>(Val: II.getType());
        Type *HalfVecTy =
            VecTy->getWithNewType(EltTy: Type::getHalfTy(C&: II.getContext()));

        // Obtain the original image sample intrinsic's signature
        // and replace its return type with the half-vector for D16 folding
        SmallVector<Type *, 8> SigTys;
        Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: SigTys);
        SigTys[0] = HalfVecTy;

        Module *M = II.getModule();
        Function *HalfDecl =
            Intrinsic::getOrInsertDeclaration(M, id: ImageDimIntr->Intr, Tys: SigTys);

        // Mutate the existing call in place rather than creating a new one.
        II.mutateType(Ty: HalfVecTy);
        II.setCalledFunction(HalfDecl);

        // Replace each (extractelement; fptrunc) pair with a direct
        // extractelement of the now-half-typed result.
        IRBuilder<> Builder(II.getContext());
        for (auto &[Ext, Tr] : ExtractTruncPairs) {
          Value *Idx = Ext->getIndexOperand();

          Builder.SetInsertPoint(Tr);

          Value *HalfExtract = Builder.CreateExtractElement(Vec: &II, Idx);
          HalfExtract->takeName(V: Tr);

          Tr->replaceAllUsesWith(V: HalfExtract);
        }

        // Erase in a second pass: each fptrunc before its extractelement.
        for (auto &[Ext, Tr] : ExtractTruncPairs) {
          IC.eraseInstFromFunction(I&: *Tr);
          IC.eraseInstFromFunction(I&: *Ext);
        }

        return &II;
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  // Scan gradient and coordinate operands; all must be narrowable for A16,
  // gradients alone suffice for G16.
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(i_nocapture: OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
      OnlyDerivatives = true;
  }

  // G16-only folding requires G16 support and distinct gradient operands.
  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
                               : Type::getInt16Ty(C&: II.getContext());

  return modifyIntrinsicCall(
      OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
        }

        // Narrow the gradient (and, for A16, coordinate) operands to 16 bits.
        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
        }
      });
}
390
391bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
392 const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
399 if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
400 match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
401 // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
405 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
406 if (isKnownNeverInfOrNaN(V: Op0, SQ) && isKnownNeverInfOrNaN(V: Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
412
413/// Match an fpext from half to float, or a constant we can convert.
414static Value *matchFPExtFromF16(Value *Arg) {
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
423 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
424 if (!LosesInfo)
425 return ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
432static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(V: UseV, EltNo: i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(BitPosition: i);
451 }
452
453 return DemandedElts;
454}
455
456// Trim elements of the end of the vector \p V, if they are
457// equal to the first element of the vector.
458static APInt defaultComponentBroadcast(Value *V) {
459 auto *VTy = cast<FixedVectorType>(Val: V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
462 Value *FirstComponent = findScalarElement(V, EltNo: 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
466 SVI->getShuffleMask(Result&: ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, EltNo: I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(BitPosition: I);
480 }
481
482 return DemandedElts;
483}
484
// Forward declaration of the demanded-elements memory-intrinsic simplifier so
// earlier combines can call it. DMaskIdx = -1 means there is no dmask operand;
// IsLoad distinguishes load-like from store-like intrinsics — see the
// definition for details.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(Val: V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(Val: V))
504 return AMDGPU::isArgPassedInSGPR(Arg: A);
505 if (const auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(IntrID: II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(Val: U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
518bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
519 IntrinsicInst &II,
520 unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(N: MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(I: &II, OpNo: LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(i: LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(Ty: LaneArg->getType(), V: Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(i: LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
545
546static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
547 Function &NewCallee, ArrayRef<Value *> Ops) {
548 SmallVector<OperandBundleDef, 2> OpBundles;
549 Old.getOperandBundlesAsDefs(Defs&: OpBundles);
550
551 CallInst *NewCall = B.CreateCall(Callee: &NewCallee, Args: Ops, OpBundles);
552 NewCall->takeName(V: &Old);
553 return NewCall;
554}
555
556// Return true for sequences of instructions that effectively assign
557// each lane to its thread ID
558static bool isThreadID(const GCNSubtarget &ST, Value *V) {
559 // Case 1:
560 // wave32: mbcnt_lo(-1, 0)
561 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
562 auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_ConstantInt<-1>(),
563 Op1: m_ConstantInt<0>());
564 auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
565 Op0: m_ConstantInt<-1>(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
566 Op0: m_ConstantInt<-1>(), Op1: m_ConstantInt<0>()));
567 if (ST.isWave32() && match(V, P: W32Pred))
568 return true;
569 if (ST.isWave64() && match(V, P: W64Pred))
570 return true;
571
572 return false;
573}
574
// Attempt to capture situations where the index argument matches
// a DPP pattern, and convert to a DPP-based mov
//
// II is expected to have a data operand at index 0 and a lane-index operand
// at index 1. Recognizes the three DPP16 ROW_SHARE index shapes and, on a
// match, replaces II with llvm.amdgcn.update_dpp.
static std::optional<Instruction *>
tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
  Value *Val = II.getArgOperand(i: 0);
  Value *Idx = II.getArgOperand(i: 1);
  auto &B = IC.Builder;

  // DPP16 Row Share requires known wave size, architecture support
  if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare())
    return std::nullopt;

  Value *Tid;
  uint64_t Mask;
  uint64_t RowIdx;
  bool CanDPP16RowShare = false;

  // wave32 requires Mask & 0x1F == 0x10
  // wave64 requires Mask & 0x3F == 0x30
  uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1;
  uint64_t MaskTarget = MaskCheck & 0xF0;

  // DPP16 Row Share 0: Idx = Tid & Mask
  auto RowShare0Pred = m_And(L: m_Value(V&: Tid), R: m_ConstantInt(V&: Mask));

  // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
  auto RowSharePred =
      m_Or(L: m_And(L: m_Value(V&: Tid), R: m_ConstantInt(V&: Mask)), R: m_ConstantInt(V&: RowIdx));

  // DPP16 Row Share 15: Idx = Tid | 0xF
  auto RowShare15Pred = m_Or(L: m_Value(V&: Tid), R: m_ConstantInt<0xF>());

  // Note: each match() binds Tid/Mask/RowIdx as a side effect; the extra
  // bounds/target checks below validate the bound constants.
  if (match(V: Idx, P: RowShare0Pred) && isThreadID(ST, V: Tid)) {
    if ((Mask & MaskCheck) != MaskTarget)
      return std::nullopt;

    RowIdx = 0;
    CanDPP16RowShare = true;
  } else if (match(V: Idx, P: RowSharePred) && isThreadID(ST, V: Tid) && RowIdx < 15 &&
             RowIdx > 0) {
    if ((Mask & MaskCheck) != MaskTarget)
      return std::nullopt;

    CanDPP16RowShare = true;
  } else if (match(V: Idx, P: RowShare15Pred) && isThreadID(ST, V: Tid)) {
    RowIdx = 15;
    CanDPP16RowShare = true;
  }

  if (CanDPP16RowShare) {
    // Emit update_dpp with dpp_ctrl = ROW_SHARE0 + RowIdx, row_mask = 0xF,
    // bank_mask = 0xF, bound_ctrl = false, and a poison "old" value.
    CallInst *UpdateDPP =
        B.CreateIntrinsic(ID: Intrinsic::amdgcn_update_dpp, Types: Val->getType(),
                          Args: {PoisonValue::get(T: Val->getType()), Val,
                                B.getInt32(C: AMDGPU::DPP::ROW_SHARE0 | RowIdx),
                                B.getInt32(C: 0xF), B.getInt32(C: 0xF), B.getFalse()});
    UpdateDPP->takeName(V: &II);
    UpdateDPP->copyMetadata(SrcInst: II);
    return IC.replaceInstUsesWith(I&: II, V: UpdateDPP);
  }

  // No valid DPP detected
  return std::nullopt;
}
638
/// Try to hoist a readlane/readfirstlane/permlane64 intrinsic above the
/// instruction that produces its data operand:
///   (II (OpInst x)) -> (OpInst (II x))
/// This is done for unary ops, casts (with remangling to the source type),
/// and binary ops whose other operand is trivially uniform. Returns the
/// rewritten OpInst clone, or nullptr if the transform does not apply.
Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                             IntrinsicInst &II) const {
  const auto IID = II.getIntrinsicID();
  assert(IID == Intrinsic::amdgcn_readlane ||
         IID == Intrinsic::amdgcn_readfirstlane ||
         IID == Intrinsic::amdgcn_permlane64);

  Instruction *OpInst = dyn_cast<Instruction>(Val: II.getOperand(i_nocapture: 0));

  // Only do this if both instructions are in the same block
  // (so the exec mask won't change) and the readlane is the only user of its
  // operand.
  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
    return nullptr;

  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);

  // If this is a readlane, check that the second operand is a constant, or is
  // defined before OpInst so we know it's safe to move this intrinsic higher.
  Value *LaneID = nullptr;
  if (IsReadLane) {
    LaneID = II.getOperand(i_nocapture: 1);

    // readlane take an extra operand for the lane ID, so we must check if that
    // LaneID value can be used at the point where we want to move the
    // intrinsic.
    if (auto *LaneIDInst = dyn_cast<Instruction>(Val: LaneID)) {
      if (!IC.getDominatorTree().dominates(Def: LaneIDInst, User: OpInst))
        return nullptr;
    }
  }

  // Hoist the intrinsic (II) through OpInst.
  //
  // (II (OpInst x)) -> (OpInst (II x))
  //
  // OpIdx selects which operand of OpInst the intrinsic is applied to;
  // NewIntrinsic allows substituting a remangled declaration when the
  // operand type differs from II's type.
  const auto DoIt = [&](unsigned OpIdx,
                        Function *NewIntrinsic) -> Instruction * {
    SmallVector<Value *, 2> Ops{OpInst->getOperand(i: OpIdx)};
    if (IsReadLane)
      Ops.push_back(Elt: LaneID);

    // Rewrite the intrinsic call.
    CallInst *NewII = rewriteCall(B&: IC.Builder, Old&: II, NewCallee&: *NewIntrinsic, Ops);

    // Rewrite OpInst so it takes the result of the intrinsic now.
    Instruction &NewOp = *OpInst->clone();
    NewOp.setOperand(i: OpIdx, Val: NewII);
    return &NewOp;
  };

  // TODO(?): Should we do more with permlane64?
  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Val: OpInst))
    return nullptr;

  // Unary ops keep the same intrinsic type.
  if (isa<UnaryOperator>(Val: OpInst))
    return DoIt(0, II.getCalledFunction());

  // Casts change the value type, so the intrinsic must be remangled for the
  // cast's source type (which must be legal for the target).
  if (isa<CastInst>(Val: OpInst)) {
    Value *Src = OpInst->getOperand(i: 0);
    Type *SrcTy = Src->getType();
    if (!isTypeLegal(Ty: SrcTy))
      return nullptr;

    Function *Remangled =
        Intrinsic::getOrInsertDeclaration(M: II.getModule(), id: IID, Tys: {SrcTy});
    return DoIt(0, Remangled);
  }

  // We can also hoist through binary operators if the other operand is uniform.
  if (isa<BinaryOperator>(Val: OpInst)) {
    // FIXME: If we had access to UniformityInfo here we could just check
    // if the operand is uniform.
    if (isTriviallyUniform(U: OpInst->getOperandUse(i: 0)))
      return DoIt(1, II.getCalledFunction());
    if (isTriviallyUniform(U: OpInst->getOperandUse(i: 1)))
      return DoIt(0, II.getCalledFunction());
  }

  return nullptr;
}
720
721std::optional<Instruction *>
722GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
723 Intrinsic::ID IID = II.getIntrinsicID();
724 switch (IID) {
725 case Intrinsic::amdgcn_rcp: {
726 Value *Src = II.getArgOperand(i: 0);
727 if (isa<PoisonValue>(Val: Src))
728 return IC.replaceInstUsesWith(I&: II, V: Src);
729
730 // TODO: Move to ConstantFolding/InstSimplify?
731 if (isa<UndefValue>(Val: Src)) {
732 Type *Ty = II.getType();
733 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
734 return IC.replaceInstUsesWith(I&: II, V: QNaN);
735 }
736
737 if (II.isStrictFP())
738 break;
739
740 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
741 const APFloat &ArgVal = C->getValueAPF();
742 APFloat Val(ArgVal.getSemantics(), 1);
743 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
744
745 // This is more precise than the instruction may give.
746 //
747 // TODO: The instruction always flushes denormal results (except for f16),
748 // should this also?
749 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
750 }
751
752 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
753 if (!FMF.allowContract())
754 break;
755 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
756 if (!SrcCI)
757 break;
758
759 auto IID = SrcCI->getIntrinsicID();
760 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
761 //
762 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
763 // relaxed.
764 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
765 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
766 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
767 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
768 break;
769
770 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
771 break;
772
773 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
774 M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, Tys: {SrcCI->getType()});
775
776 InnerFMF |= FMF;
777 II.setFastMathFlags(InnerFMF);
778
779 II.setCalledFunction(NewDecl);
780 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
781 }
782
783 break;
784 }
785 case Intrinsic::amdgcn_sqrt:
786 case Intrinsic::amdgcn_rsq:
787 case Intrinsic::amdgcn_tanh: {
788 Value *Src = II.getArgOperand(i: 0);
789 if (isa<PoisonValue>(Val: Src))
790 return IC.replaceInstUsesWith(I&: II, V: Src);
791
792 // TODO: Move to ConstantFolding/InstSimplify?
793 if (isa<UndefValue>(Val: Src)) {
794 Type *Ty = II.getType();
795 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
796 return IC.replaceInstUsesWith(I&: II, V: QNaN);
797 }
798
799 // f16 amdgcn.sqrt is identical to regular sqrt.
800 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
801 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
802 M: II.getModule(), id: Intrinsic::sqrt, Tys: {II.getType()});
803 II.setCalledFunction(NewDecl);
804 return &II;
805 }
806
807 break;
808 }
809 case Intrinsic::amdgcn_log:
810 case Intrinsic::amdgcn_exp2: {
811 const bool IsLog = IID == Intrinsic::amdgcn_log;
812 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
813 Value *Src = II.getArgOperand(i: 0);
814 Type *Ty = II.getType();
815
816 if (isa<PoisonValue>(Val: Src))
817 return IC.replaceInstUsesWith(I&: II, V: Src);
818
819 if (IC.getSimplifyQuery().isUndefValue(V: Src))
820 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
821
822 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
823 if (C->isInfinity()) {
824 // exp2(+inf) -> +inf
825 // log2(+inf) -> +inf
826 if (!C->isNegative())
827 return IC.replaceInstUsesWith(I&: II, V: C);
828
829 // exp2(-inf) -> 0
830 if (IsExp && C->isNegative())
831 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
832 }
833
834 if (II.isStrictFP())
835 break;
836
837 if (C->isNaN()) {
838 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
839 return IC.replaceInstUsesWith(I&: II, V: Quieted);
840 }
841
842 // f32 instruction doesn't handle denormals, f16 does.
843 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
844 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
845 : ConstantFP::get(Ty, V: 1.0);
846 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
847 }
848
849 if (IsLog && C->isNegative())
850 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
851
852 // TODO: Full constant folding matching hardware behavior.
853 }
854
855 break;
856 }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    // Constant-fold llvm.amdgcn.frexp.mant / llvm.amdgcn.frexp.exp, and
    // propagate poison/undef sources.
    Value *Src = II.getArgOperand(i: 0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
      int Exp;
      // Decompose the constant into significand and integral exponent.
      APFloat Significand =
          frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
      }

      // Match instruction special case behavior.
      // The hardware returns an exponent of 0 for nan/inf inputs, whereas
      // APFloat::frexp reports the IEK_NaN/IEK_Inf sentinels.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(I&: II,
                                    V: ConstantInt::getSigned(Ty: II.getType(), V: Exp));
    }

    if (isa<PoisonValue>(Val: Src))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (isa<UndefValue>(Val: Src)) {
      return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
    if (CMask) {
      // With a constant test mask, lower to the target-independent
      // llvm.is.fpclass so the generic combines can reason about it.
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          M: II.getModule(), id: Intrinsic::is_fpclass, Tys: Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
                                               V: CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(V: Src1))
      return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    // Any value may be chosen for the undef operand, so the result is true
    // exactly when some class bit is set in the mask.
    if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
      return IC.replaceInstUsesWith(I&: II, V: CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    // Try to simplify each packed half independently: poison/undef pass
    // through, constants fold with round-toward-zero, and an fpext from half
    // is undone since converting it back to half is exact.
    auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
      Type *HalfTy = Type::getHalfTy(C&: Arg->getContext());

      if (isa<PoisonValue>(Val: Arg))
        return PoisonValue::get(T: HalfTy);
      if (isa<UndefValue>(Val: Arg))
        return UndefValue::get(T: HalfTy);

      ConstantFP *CFP = nullptr;
      if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
        bool LosesInfo;
        APFloat Val(CFP->getValueAPF());
        // The instruction converts with round-toward-zero semantics.
        Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
        return ConstantFP::get(Ty: HalfTy, V: Val);
      }

      Value *Src = nullptr;
      if (match(V: Arg, P: m_FPExt(Op: m_Value(V&: Src)))) {
        if (Src->getType()->isHalfTy())
          return Src;
      }

      return nullptr;
    };

    // Only fold when both lanes simplify; rebuild the two-element result
    // vector from the simplified halves.
    if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(i: 0))) {
      if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(i: 1))) {
        Value *V = PoisonValue::get(T: II.getType());
        V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src0, Idx: (uint64_t)0);
        V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src1, Idx: (uint64_t)1);
        return IC.replaceInstUsesWith(I&: II, V);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // Only fold when both packed sources are poison (or both undef).
    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);

    // TODO: Replace call with scalar operation if only one element is poison.
    if (isa<PoisonValue>(Val: Src0) && isa<PoisonValue>(Val: Src1))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
      return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
    }

    break;
  }
971 case Intrinsic::amdgcn_cvt_off_f32_i4: {
972 Value* Arg = II.getArgOperand(i: 0);
973 Type *Ty = II.getType();
974
975 if (isa<PoisonValue>(Val: Arg))
976 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: Ty));
977
978 if(IC.getSimplifyQuery().isUndefValue(V: Arg))
979 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty));
980
981 ConstantInt *CArg = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
982 if (!CArg)
983 break;
984
985 // Tabulated 0.0625 * (sext (CArg & 0xf)).
986 constexpr size_t ResValsSize = 16;
987 static constexpr float ResVals[ResValsSize] = {
988 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
989 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
990 Constant *Res =
991 ConstantFP::get(Ty, V: ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
992 return IC.replaceInstUsesWith(I&: II, V: Res);
993 }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(i: 0);
    if (isa<UndefValue>(Val: Src)) {
      return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      // The hardware masks the width to the bit size, so a width that is a
      // multiple of the bit size extracts zero bits.
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      // The offset is likewise taken modulo the bit size by the hardware.
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            I&: II, OpNum: 1,
            V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      // Left-shift to discard the bits above the field, then shift right
      // (arithmetic for sbfe, logical for ubfe) to position and extend it.
      Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
                                 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
      RightShift->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: RightShift);
    }

    // The field reaches the top bit, so a single right shift suffices.
    Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
                               : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);

    RightShift->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    // The 'en' operand is a bitmask of enabled export channels; sources for
    // disabled channels are never read, so replace them with poison.
    ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      // In the compressed form each source operand covers two enable bits.
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(i: I + 2);
        if (!isa<PoisonValue>(Val: Src)) {
          IC.replaceOperand(I&: II, OpNum: I + 2, V: PoisonValue::get(T: Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
1084 case Intrinsic::amdgcn_fmed3: {
1085 Value *Src0 = II.getArgOperand(i: 0);
1086 Value *Src1 = II.getArgOperand(i: 1);
1087 Value *Src2 = II.getArgOperand(i: 2);
1088
1089 for (Value *Src : {Src0, Src1, Src2}) {
1090 if (isa<PoisonValue>(Val: Src))
1091 return IC.replaceInstUsesWith(I&: II, V: Src);
1092 }
1093
1094 if (II.isStrictFP())
1095 break;
1096
1097 // med3 with a nan input acts like
1098 // v_min_f32(v_min_f32(s0, s1), s2)
1099 //
1100 // Signalingness is ignored with ieee=0, so we fold to
1101 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1102 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1103 // returned signaling nan will not be quieted.
1104
1105 // ieee=1
1106 // s0 snan: s2
1107 // s1 snan: s2
1108 // s2 snan: qnan
1109
1110 // s0 qnan: min(s1, s2)
1111 // s1 qnan: min(s0, s2)
1112 // s2 qnan: min(s0, s1)
1113
1114 // ieee=0
1115 // s0 _nan: min(s1, s2)
1116 // s1 _nan: min(s0, s2)
1117 // s2 _nan: min(s0, s1)
1118
1119 // med3 behavior with infinity
1120 // s0 +inf: max(s1, s2)
1121 // s1 +inf: max(s0, s2)
1122 // s2 +inf: max(s0, s1)
1123 // s0 -inf: min(s1, s2)
1124 // s1 -inf: min(s0, s2)
1125 // s2 -inf: min(s0, s1)
1126
1127 // Checking for NaN before canonicalization provides better fidelity when
1128 // mapping other operations onto fmed3 since the order of operands is
1129 // unchanged.
1130 Value *V = nullptr;
1131 const APFloat *ConstSrc0 = nullptr;
1132 const APFloat *ConstSrc1 = nullptr;
1133 const APFloat *ConstSrc2 = nullptr;
1134
1135 if ((match(V: Src0, P: m_APFloat(Res&: ConstSrc0)) &&
1136 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1137 isa<UndefValue>(Val: Src0)) {
1138 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1139 switch (fpenvIEEEMode(I: II)) {
1140 case KnownIEEEMode::On:
1141 // TODO: If Src2 is snan, does it need quieting?
1142 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1143 return IC.replaceInstUsesWith(I&: II, V: Src2);
1144
1145 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src1, RHS: Src2)
1146 : IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
1147 break;
1148 case KnownIEEEMode::Off:
1149 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src1, RHS: Src2)
1150 : IC.Builder.CreateMinimumNum(LHS: Src1, RHS: Src2);
1151 break;
1152 case KnownIEEEMode::Unknown:
1153 break;
1154 }
1155 } else if ((match(V: Src1, P: m_APFloat(Res&: ConstSrc1)) &&
1156 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1157 isa<UndefValue>(Val: Src1)) {
1158 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1159 switch (fpenvIEEEMode(I: II)) {
1160 case KnownIEEEMode::On:
1161 // TODO: If Src2 is snan, does it need quieting?
1162 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1163 return IC.replaceInstUsesWith(I&: II, V: Src2);
1164
1165 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src2)
1166 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
1167 break;
1168 case KnownIEEEMode::Off:
1169 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src2)
1170 : IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src2);
1171 break;
1172 case KnownIEEEMode::Unknown:
1173 break;
1174 }
1175 } else if ((match(V: Src2, P: m_APFloat(Res&: ConstSrc2)) &&
1176 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1177 isa<UndefValue>(Val: Src2)) {
1178 switch (fpenvIEEEMode(I: II)) {
1179 case KnownIEEEMode::On:
1180 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1181 auto *Quieted = ConstantFP::get(Ty: II.getType(), V: ConstSrc2->makeQuiet());
1182 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1183 }
1184
1185 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1186 ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1)
1187 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src1);
1188 break;
1189 case KnownIEEEMode::Off:
1190 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1191 ? IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src1)
1192 : IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src1);
1193 break;
1194 case KnownIEEEMode::Unknown:
1195 break;
1196 }
1197 }
1198
1199 if (V) {
1200 if (auto *CI = dyn_cast<CallInst>(Val: V)) {
1201 CI->copyFastMathFlags(I: &II);
1202 CI->takeName(V: &II);
1203 }
1204 return IC.replaceInstUsesWith(I&: II, V);
1205 }
1206
1207 bool Swap = false;
1208 // Canonicalize constants to RHS operands.
1209 //
1210 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1211 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1212 std::swap(a&: Src0, b&: Src1);
1213 Swap = true;
1214 }
1215
1216 if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
1217 std::swap(a&: Src1, b&: Src2);
1218 Swap = true;
1219 }
1220
1221 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1222 std::swap(a&: Src0, b&: Src1);
1223 Swap = true;
1224 }
1225
1226 if (Swap) {
1227 II.setArgOperand(i: 0, v: Src0);
1228 II.setArgOperand(i: 1, v: Src1);
1229 II.setArgOperand(i: 2, v: Src2);
1230 return &II;
1231 }
1232
1233 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
1234 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
1235 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
1236 APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
1237 Src2: C2->getValueAPF());
1238 return IC.replaceInstUsesWith(I&: II,
1239 V: ConstantFP::get(Ty: II.getType(), V: Result));
1240 }
1241 }
1242 }
1243
1244 if (!ST->hasMed3_16())
1245 break;
1246
1247 // Repeat floating-point width reduction done for minnum/maxnum.
1248 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1249 if (Value *X = matchFPExtFromF16(Arg: Src0)) {
1250 if (Value *Y = matchFPExtFromF16(Arg: Src1)) {
1251 if (Value *Z = matchFPExtFromF16(Arg: Src2)) {
1252 Value *NewCall = IC.Builder.CreateIntrinsic(
1253 ID: IID, Types: {X->getType()}, Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
1254 return new FPExtInst(NewCall, II.getType());
1255 }
1256 }
1257 }
1258
1259 break;
1260 }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    // Simplify the wave-wide compare intrinsics: constant-fold, canonicalize
    // constants to the RHS, and absorb a redundant eq/ne-with-zero compare of
    // an extended compare result into the intrinsic's own predicate.
    const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);

    if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
        // An always-false compare produces an all-zero mask.
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
        MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
        Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateIntrinsic(ID: Intrinsic::read_register,
                                                       Types: II.getType(), Args);
        // Reading EXEC must not be moved across control-flow changes.
        NewCall->addFnAttr(Kind: Attribute::Convergent);
        NewCall->takeName(V: &II);
        return IC.replaceInstUsesWith(I&: II, V: NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(i: 0, v: Src1);
      II.setArgOperand(i: 1, v: Src0);
      II.setArgOperand(
          i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(V: Src1, P: PatternMatch::m_One()) &&
          match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
         (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
          match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(Bitwidth: 1)) {
      IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
      IC.replaceOperand(I&: II, OpNum: 2,
                        V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
      return &II;
    }

    CmpPredicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(V: Src1, P: PatternMatch::m_Zero()) &&
        match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
                          Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
                                 R: PatternMatch::m_Value(V&: SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(pred: SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          // Extend with the signedness matching the source predicate so the
          // comparison result is unchanged.
          IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
          if (CmpInst::isSigned(predicate: SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
            SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
            SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
      CallInst *NewCall = IC.Builder.CreateIntrinsic(
          ID: NewIID, Types: {II.getType(), SrcLHS->getType()}, Args);
      NewCall->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_mbcnt_hi: {
    // exec_hi is all 0, so this is just a copy.
    if (ST->isWave32())
      return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    // Simplify the ballot: poison propagates, ballot(false) is 0, and on
    // wave32 a 64-bit ballot is a zero-extended 32-bit ballot.
    Value *Arg = II.getArgOperand(i: 0);
    if (isa<PoisonValue>(Val: Arg))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (auto *Src = dyn_cast<ConstantInt>(Val: Arg)) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
      }
    }
    if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
      // %b64 = call i64 ballot.i64(...)
      // =>
      // %b32 = call i32 ballot.i32(...)
      // %b64 = zext i32 %b32 to i64
      Value *Call = IC.Builder.CreateZExt(
          V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
                                      Types: {IC.Builder.getInt32Ty()},
                                      Args: {II.getArgOperand(i: 0)}),
          DestTy: II.getType());
      Call->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: Call);
    }
    break;
  }
  case Intrinsic::amdgcn_wavefrontsize: {
    // Fold to a constant whenever the subtarget pins the wave size.
    if (ST->isWaveSizeKnown())
      return IC.replaceInstUsesWith(
          I&: II, V: ConstantInt::get(Ty: II.getType(), V: ST->getWavefrontSize()));
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
      break;

    return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(I&: II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(i: 0);

    auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
    auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
    auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
    // Bail unless the combination guarantees the old value is never used
    // (or it is already poison).
    if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<PoisonValue>(Val: Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(i: 0);
    if (isa<PoisonValue>(Val: VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4  /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5
    // For Permlane16_var and permlanex16_var it should be 4
    unsigned int BcIdx = FiIdx + 1;

    // vdst_in is only read when both fi and bound_ctrl are 0.
    ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_ds_bpermute: {
    // If the data argument is uniform these intrinsics return it unchanged.
    unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
    const Use &Src = II.getArgOperandUse(i: SrcIdx);
    if (isTriviallyUniform(U: Src))
      return IC.replaceInstUsesWith(I&: II, V: Src.get());

    if (IID == Intrinsic::amdgcn_readlane &&
        simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
      return &II;

    // If the lane argument of bpermute is uniform, change it to readlane. This
    // generates better code and can enable further optimizations because
    // readlane is AlwaysUniform.
    if (IID == Intrinsic::amdgcn_ds_bpermute) {
      const Use &Lane = II.getArgOperandUse(i: 0);
      if (isTriviallyUniform(U: Lane)) {
        // bpermute addresses are byte offsets; readlane takes a lane index,
        // hence the shift by 2.
        Value *NewLane = IC.Builder.CreateLShr(LHS: Lane, RHS: 2);
        Function *NewDecl = Intrinsic::getOrInsertDeclaration(
            M: II.getModule(), id: Intrinsic::amdgcn_readlane, Tys: II.getType());
        II.setCalledFunction(NewDecl);
        II.setOperand(i_nocapture: 0, Val_nocapture: Src);
        II.setOperand(i_nocapture: 1, Val_nocapture: NewLane);
        return &II;
      }
    }

    if (IID != Intrinsic::amdgcn_ds_bpermute) {
      if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
        return Res;
    }

    return std::nullopt;
  }
  case Intrinsic::amdgcn_writelane: {
    // TODO: Fold bitcast like readlane.
    if (simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
      return &II;
    return std::nullopt;
  }
  case Intrinsic::amdgcn_trig_preop: {
    // The intrinsic is declared with name mangling, but currently the
    // instruction only exists for f64
    if (!II.getType()->isDoubleTy())
      break;

    Value *Src = II.getArgOperand(i: 0);
    Value *Segment = II.getArgOperand(i: 1);
    if (isa<PoisonValue>(Val: Src) || isa<PoisonValue>(Val: Segment))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));

    if (isa<UndefValue>(Val: Segment))
      return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));

    // Sign bit is not used.
    Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Val: Src);
    if (StrippedSign != Src)
      return IC.replaceOperand(I&: II, OpNum: 0, V: StrippedSign);

    if (II.isStrictFP())
      break;

    const ConstantFP *CSrc = dyn_cast<ConstantFP>(Val: Src);
    if (!CSrc && !isa<UndefValue>(Val: Src))
      break;

    // The instruction ignores special cases, and literally just extracts the
    // exponents. Fold undef to nan, and index the table as normal.
    APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
                         : APFloat::getQNaN(Sem: II.getType()->getFltSemantics())
                               .bitcastToAPInt();

    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
    if (!Cseg) {
      if (isa<UndefValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
      break;
    }

    // Extract the raw 11-bit f64 exponent field; the segment index is taken
    // modulo 32 (low 5 bits), each segment selecting a 53-bit window.
    unsigned Exponent = FSrcInt.extractBitsAsZExtValue(numBits: 11, bitPosition: 52);
    unsigned SegmentVal = Cseg->getValue().trunc(width: 5).getZExtValue();
    unsigned Shift = SegmentVal * 53;
    if (Exponent > 1077)
      Shift += Exponent - 1077;

    // 2.0/PI table.
    static const uint32_t TwoByPi[] = {
        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
        0x56033046};

    // Return 0 for outbound segment (hardware behavior).
    unsigned Idx = Shift >> 5;
    if (Idx + 2 >= std::size(TwoByPi)) {
      APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
      return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
    }

    // Assemble 64 bits of the table starting at the bit offset, then keep the
    // top 53 bits as the integer significand.
    unsigned BShift = Shift & 0x1f;
    uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + 1]);
    uint64_t Tlo = Make_64(High: TwoByPi[Idx + 2], Low: 0);
    if (BShift)
      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
    Thi = Thi >> 11;
    APFloat Result = APFloat((double)Thi);

    // Rescale to account for the window position.
    int Scale = -53 - Shift;
    if (Exponent >= 1968)
      Scale += 128;

    Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
    return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(i: 0);
    Value *Op1 = II.getArgOperand(i: 1);

    // Poison in either operand propagates to the result.
    for (Value *Src : {Op0, Op1}) {
      if (isa<PoisonValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
        match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
      FMul->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(i: 0);
    Value *Op1 = II.getArgOperand(i: 1);
    Value *Op2 = II.getArgOperand(i: 2);

    // Poison in any operand propagates to the result.
    for (Value *Src : {Op0, Op1, Op2}) {
      if (isa<PoisonValue>(Val: Src))
        return IC.replaceInstUsesWith(I&: II, V: Src);
    }

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
        match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getZero(Ty: II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
      FAdd->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          M: II.getModule(), id: Intrinsic::fma, Tys: II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Propagate poison/undef, and fold the null pointer to false.
    Value *Src = II.getArgOperand(i: 0);
    if (isa<PoisonValue>(Val: Src))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
    if (isa<UndefValue>(Val: Src))
      return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));

    if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
      return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
    break;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc: {
    // A poison base pointer makes the whole resource poison.
    Value *Src = II.getArgOperand(i: 0);
    if (isa<PoisonValue>(Val: Src))
      return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
    return std::nullopt;
  }
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    // Shrink vector stores when trailing components are provably not needed,
    // based on the subtarget's default-component fill behavior.
    if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
      break;

    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
    else
      break;

    // Only image intrinsics listed in the dmask table carry a dmask operand.
    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              IsLoad: false)) {
      return IC.eraseInstFromFunction(I&: II);
    }

    break;
  }
  case Intrinsic::amdgcn_prng_b32: {
    // An undef seed yields an undef pseudo-random result.
    auto *Src = II.getArgOperand(i: 0);
    if (isa<UndefValue>(Val: Src)) {
      return IC.replaceInstUsesWith(I&: II, V: Src);
    }
    return std::nullopt;
  }
  case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
    // The cbsz/blgp immediates select the source matrix formats; narrower
    // formats occupy fewer registers, so shrink oversized source vectors.
    Value *Src0 = II.getArgOperand(i: 0);
    Value *Src1 = II.getArgOperand(i: 1);
    uint64_t CBSZ = cast<ConstantInt>(Val: II.getArgOperand(i: 3))->getZExtValue();
    uint64_t BLGP = cast<ConstantInt>(Val: II.getArgOperand(i: 4))->getZExtValue();
    auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
    auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());

    // Number of vector elements actually read for a given format encoding.
    auto getFormatNumRegs = [](unsigned FormatVal) {
      switch (FormatVal) {
      case AMDGPU::MFMAScaleFormats::FP6_E2M3:
      case AMDGPU::MFMAScaleFormats::FP6_E3M2:
        return 6u;
      case AMDGPU::MFMAScaleFormats::FP4_E2M1:
        return 4u;
      case AMDGPU::MFMAScaleFormats::FP8_E4M3:
      case AMDGPU::MFMAScaleFormats::FP8_E5M2:
        return 8u;
      default:
        llvm_unreachable("invalid format value");
      }
    };

    bool MadeChange = false;
    unsigned Src0NumElts = getFormatNumRegs(CBSZ);
    unsigned Src1NumElts = getFormatNumRegs(BLGP);

    // Depending on the used format, fewer registers are required so shrink the
    // vector type.
    if (Src0Ty->getNumElements() > Src0NumElts) {
      Src0 = IC.Builder.CreateExtractVector(
          DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
          Idx: uint64_t(0));
      MadeChange = true;
    }

    if (Src1Ty->getNumElements() > Src1NumElts) {
      Src1 = IC.Builder.CreateExtractVector(
          DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
          Idx: uint64_t(0));
      MadeChange = true;
    }

    if (!MadeChange)
      return std::nullopt;

    // Rebuild the call with the narrowed source vector types; the remaining
    // operands are carried over unchanged.
    SmallVector<Value *, 10> Args(II.args());
    Args[0] = Src0;
    Args[1] = Src1;

    CallInst *NewII = IC.Builder.CreateIntrinsic(
        ID: IID, Types: {Src0->getType(), Src1->getType()}, Args, FMFSource: &II);
    NewII->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: NewII);
  }
1786 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
1787 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1788 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
1789 Value *Src0 = II.getArgOperand(i: 1);
1790 Value *Src1 = II.getArgOperand(i: 3);
1791 unsigned FmtA = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
1792 uint64_t FmtB = cast<ConstantInt>(Val: II.getArgOperand(i: 2))->getZExtValue();
1793 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
1794 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
1795
1796 bool MadeChange = false;
1797 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtA);
1798 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtB);
1799
1800 // Depending on the used format, fewer registers are required so shrink the
1801 // vector type.
1802 if (Src0Ty->getNumElements() > Src0NumElts) {
1803 Src0 = IC.Builder.CreateExtractVector(
1804 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
1805 Idx: IC.Builder.getInt64(C: 0));
1806 MadeChange = true;
1807 }
1808
1809 if (Src1Ty->getNumElements() > Src1NumElts) {
1810 Src1 = IC.Builder.CreateExtractVector(
1811 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
1812 Idx: IC.Builder.getInt64(C: 0));
1813 MadeChange = true;
1814 }
1815
1816 if (!MadeChange)
1817 return std::nullopt;
1818
1819 SmallVector<Value *, 13> Args(II.args());
1820 Args[1] = Src0;
1821 Args[3] = Src1;
1822
1823 CallInst *NewII = IC.Builder.CreateIntrinsic(
1824 ID: IID, Types: {II.getArgOperand(i: 5)->getType(), Src0->getType(), Src1->getType()},
1825 Args, FMFSource: &II);
1826 NewII->takeName(V: &II);
1827 return IC.replaceInstUsesWith(I&: II, V: NewII);
1828 }
1829 case Intrinsic::amdgcn_wave_shuffle: {
1830 if (!ST->hasDPP())
1831 return std::nullopt;
1832
1833 return tryWaveShuffleDPP(ST: *ST, IC, II);
1834 }
1835 }
1836 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1837 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1838 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1839 }
1840 return std::nullopt;
1841}
1842
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Simplifying amdgcn image and buffer *store* intrinsics updates the
/// definition of the intrinsic's vector argument (operand 0), not uses of the
/// result the way image and buffer *loads* are simplified.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
///
/// \param DemandedElts Bitmask of vector components actually used.
/// \param DMaskIdx     Argument index of the image dmask operand, or -1 for
///                     buffer intrinsics (which have no dmask).
/// \param IsLoad       True for loads (shrink the result), false for stores
///                     (shrink the stored vector operand).
/// \return A replacement value for the intrinsic's uses, nullptr if nothing
///         was (or needs to be) changed, or poison if no element is demanded.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  // The vector being narrowed: the result for loads, operand 0 for stores.
  auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
                                               : II.getOperand(i_nocapture: 0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  // A scalar access cannot be narrowed further.
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      // Sentinel meaning "this intrinsic's offset operand is unknown/absent".
      static const unsigned InvalidOffsetIdx = 0xf;

      // Find which argument holds the byte offset so leading unused
      // components can be skipped by bumping the offset instead.
      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;

    // Rebuild the dmask keeping only the set bits whose corresponding
    // vector component is still demanded. The Nth set bit of the dmask
    // maps to the Nth vector element.
    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
  }

  // Number of components still demanded after the adjustments above.
  unsigned NewNumElts = DemandedElts.popcount();
  if (!NewNumElts)
    return PoisonValue::get(T: IIVTy);

  // Nothing to shrink if a full prefix (the whole vector) remains demanded,
  // but a narrowed dmask computed above may still be worth committing.
  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
    return nullptr;

  // Overload slot 0 is the data type; shrink it (scalar if one element left).
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    // Stores: compact the demanded components of the stored vector into the
    // (smaller) data operand of the new intrinsic.
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(Elt: OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
  }

  CallInst *NewCall =
      IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: OverloadTys, Args);
  NewCall->takeName(V: &II);
  NewCall->copyMetadata(SrcInst: II);

  if (IsLoad) {
    // Loads: widen the narrowed result back to the original vector type so
    // existing uses keep working; undemanded lanes become poison.
    if (NewNumElts == 1) {
      return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
                                            Idx: DemandedElts.countr_zero());
    }

    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(Elt: NewLoadIdx++);
      else
        // Index NewNumElts is out of range of the narrowed result and
        // therefore selects a poison lane in the shuffle.
        EltMask.push_back(Elt: NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);

    return Shuffle;
  }

  return NewCall;
}
2006
2007Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
2008 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2009 APInt &UndefElts) const {
2010 auto *VT = dyn_cast<FixedVectorType>(Val: II.getType());
2011 if (!VT)
2012 return nullptr;
2013
2014 const unsigned FirstElt = DemandedElts.countr_zero();
2015 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2016 const unsigned MaskLen = LastElt - FirstElt + 1;
2017
2018 unsigned OldNumElts = VT->getNumElements();
2019 if (MaskLen == OldNumElts && MaskLen != 1)
2020 return nullptr;
2021
2022 Type *EltTy = VT->getElementType();
2023 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: MaskLen);
2024
2025 // Theoretically we should support these intrinsics for any legal type. Avoid
2026 // introducing cases that aren't direct register types like v3i16.
2027 if (!isTypeLegal(Ty: NewVT))
2028 return nullptr;
2029
2030 Value *Src = II.getArgOperand(i: 0);
2031
2032 // Make sure convergence tokens are preserved.
2033 // TODO: CreateIntrinsic should allow directly copying bundles
2034 SmallVector<OperandBundleDef, 2> OpBundles;
2035 II.getOperandBundlesAsDefs(Defs&: OpBundles);
2036
2037 Module *M = IC.Builder.GetInsertBlock()->getModule();
2038 Function *Remangled =
2039 Intrinsic::getOrInsertDeclaration(M, id: II.getIntrinsicID(), Tys: {NewVT});
2040
2041 if (MaskLen == 1) {
2042 Value *Extract = IC.Builder.CreateExtractElement(Vec: Src, Idx: FirstElt);
2043
2044 // TODO: Preserve callsite attributes?
2045 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2046
2047 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: II.getType()),
2048 NewElt: NewCall, Idx: FirstElt);
2049 }
2050
2051 SmallVector<int> ExtractMask(MaskLen, -1);
2052 for (unsigned I = 0; I != MaskLen; ++I) {
2053 if (DemandedElts[FirstElt + I])
2054 ExtractMask[I] = FirstElt + I;
2055 }
2056
2057 Value *Extract = IC.Builder.CreateShuffleVector(V: Src, Mask: ExtractMask);
2058
2059 // TODO: Preserve callsite attributes?
2060 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2061
2062 SmallVector<int> InsertMask(OldNumElts, -1);
2063 for (unsigned I = 0; I != MaskLen; ++I) {
2064 if (DemandedElts[FirstElt + I])
2065 InsertMask[FirstElt + I] = I;
2066 }
2067
2068 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2069 // call behind.
2070 return IC.Builder.CreateShuffleVector(V: NewCall, Mask: InsertMask);
2071}
2072
2073std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
2074 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2075 APInt &UndefElts2, APInt &UndefElts3,
2076 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2077 SimplifyAndSetOp) const {
2078 switch (II.getIntrinsicID()) {
2079 case Intrinsic::amdgcn_readfirstlane:
2080 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2081 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2082 case Intrinsic::amdgcn_raw_buffer_load:
2083 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2084 case Intrinsic::amdgcn_raw_buffer_load_format:
2085 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2086 case Intrinsic::amdgcn_raw_tbuffer_load:
2087 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2088 case Intrinsic::amdgcn_s_buffer_load:
2089 case Intrinsic::amdgcn_struct_buffer_load:
2090 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2091 case Intrinsic::amdgcn_struct_buffer_load_format:
2092 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2093 case Intrinsic::amdgcn_struct_tbuffer_load:
2094 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2095 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2096 default: {
2097 if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
2098 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
2099 }
2100 break;
2101 }
2102 }
2103 return std::nullopt;
2104}
2105