//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "SIDefines.h"
21#include "llvm/ADT/FloatingPointMode.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Sequence.h"
24#include "llvm/Analysis/ConstantFolding.h"
25#include "llvm/Analysis/ValueTracking.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/Support/MathExtras.h"
30#include "llvm/Transforms/InstCombine/InstCombiner.h"
31#include <optional>
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38namespace {
39
/// Row type for the AMDGPUImageDMaskIntrinsicTable implementation that is
/// pulled in from AMDGPUGenSearchableTables.inc directly below.
struct AMDGPUImageDMaskIntrinsic {
  /// Intrinsic identifier of the entry (presumably an Intrinsic::ID value;
  /// confirm against the generated table).
  unsigned Intr;
};
43
44#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45#include "AMDGPUGenSearchableTables.inc"
46
47} // end anonymous namespace
48
49// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50//
51// A single NaN input is folded to minnum, so we rely on that folding for
52// handling NaNs.
53static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54 const APFloat &Src2) {
55 assert(!Src0.isNaN() && !Src1.isNaN() && !Src2.isNaN() &&
56 "nans handled separately");
57 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
58
59 if (Max3.bitwiseIsEqual(RHS: Src0))
60 return maxnum(A: Src1, B: Src2);
61
62 if (Max3.bitwiseIsEqual(RHS: Src1))
63 return maxnum(A: Src0, B: Src2);
64
65 return maxnum(A: Src0, B: Src1);
66}
67
68// Check if a value can be converted to a 16-bit value without losing
69// precision.
70// The value is expected to be either a float (IsFloat = true) or an unsigned
71// integer (IsFloat = false).
72static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
73 Type *VTy = V.getType();
74 if (VTy->isHalfTy() || VTy->isIntegerTy(BitWidth: 16)) {
75 // The value is already 16-bit, so we don't want to convert to 16-bit again!
76 return false;
77 }
78 if (IsFloat) {
79 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
80 // We need to check that if we cast the index down to a half, we do not
81 // lose precision.
82 APFloat FloatValue(ConstFloat->getValueAPF());
83 bool LosesInfo = true;
84 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
85 losesInfo: &LosesInfo);
86 return !LosesInfo;
87 }
88 } else {
89 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
90 // We need to check that if we cast the index down to an i16, we do not
91 // lose precision.
92 APInt IntValue(ConstInt->getValue());
93 return IntValue.getActiveBits() <= 16;
94 }
95 }
96
97 Value *CastSrc;
98 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
99 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
100 if (IsExt) {
101 Type *CastSrcTy = CastSrc->getType();
102 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(BitWidth: 16))
103 return true;
104 }
105
106 return false;
107}
108
109// Convert a value to 16-bit.
110static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
111 Type *VTy = V.getType();
112 if (isa<FPExtInst, SExtInst, ZExtInst>(Val: &V))
113 return cast<Instruction>(Val: &V)->getOperand(i: 0);
114 if (VTy->isIntegerTy())
115 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
116 if (VTy->isFloatingPointTy())
117 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
118
119 llvm_unreachable("Should never be called!");
120}
121
122/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
123/// modified arguments (based on OldIntr) and replaces InstToReplace with
124/// this newly created intrinsic call.
125static std::optional<Instruction *> modifyIntrinsicCall(
126 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
127 InstCombiner &IC,
128 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
129 Func) {
130 SmallVector<Type *, 4> OverloadTys;
131 if (!Intrinsic::isSignatureValid(F: OldIntr.getCalledFunction(), OverloadTys))
132 return std::nullopt;
133
134 SmallVector<Value *, 8> Args(OldIntr.args());
135
136 // Modify arguments and types
137 Func(Args, OverloadTys);
138
139 CallInst *NewCall =
140 IC.Builder.CreateIntrinsicWithoutFolding(ID: NewIntr, OverloadTypes: OverloadTys, Args);
141 NewCall->takeName(V: &OldIntr);
142 NewCall->copyMetadata(SrcInst: OldIntr);
143 if (isa<FPMathOperator>(Val: NewCall))
144 NewCall->copyFastMathFlags(I: &OldIntr);
145 // Copy attributes
146 AttributeList OldAttrList = OldIntr.getAttributes();
147 NewCall->setAttributes(OldAttrList);
148
149 // Erase and replace uses
150 if (!InstToReplace.getType()->isVoidTy())
151 IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
152
153 bool RemoveOldIntr = &OldIntr != &InstToReplace;
154
155 auto *RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
156 if (RemoveOldIntr)
157 IC.eraseInstFromFunction(I&: OldIntr);
158
159 return RetValue;
160}
161
162static std::optional<Instruction *>
163simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
164 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
165 IntrinsicInst &II, InstCombiner &IC) {
166 // Optimize _L to _LZ when _L is zero
167 if (const auto *LZMappingInfo =
168 AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
169 if (auto *ConstantLod =
170 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
171 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
172 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
173 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
174 Dim: ImageDimIntr->Dim);
175 return modifyIntrinsicCall(
176 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
177 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
178 });
179 }
180 }
181 }
182
183 // Optimize _mip away, when 'lod' is zero
184 if (const auto *MIPMappingInfo =
185 AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
186 if (auto *ConstantMip =
187 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
188 if (ConstantMip->isZero()) {
189 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
190 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
191 Dim: ImageDimIntr->Dim);
192 return modifyIntrinsicCall(
193 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
194 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
195 });
196 }
197 }
198 }
199
200 // Optimize _bias away when 'bias' is zero
201 if (const auto *BiasMappingInfo =
202 AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
203 if (auto *ConstantBias =
204 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
205 if (ConstantBias->isZero()) {
206 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
207 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
208 Dim: ImageDimIntr->Dim);
209 return modifyIntrinsicCall(
210 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
211 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
212 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
213 });
214 }
215 }
216 }
217
218 // Optimize _offset away when 'offset' is zero
219 if (const auto *OffsetMappingInfo =
220 AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
221 if (auto *ConstantOffset =
222 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
223 if (ConstantOffset->isZero()) {
224 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
225 AMDGPU::getImageDimIntrinsicByBaseOpcode(
226 BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
227 return modifyIntrinsicCall(
228 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
229 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
230 });
231 }
232 }
233 }
234
235 // Try to use D16
236 if (ST->hasD16Images()) {
237
238 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
239 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
240
241 if (BaseOpcode->HasD16) {
242
243 // If the only use of image intrinsic is a fptrunc (with conversion to
244 // half) then both fptrunc and image intrinsic will be replaced with image
245 // intrinsic with D16 flag.
246 if (II.hasOneUse()) {
247 Instruction *User = II.user_back();
248
249 if (User->getOpcode() == Instruction::FPTrunc &&
250 User->getType()->getScalarType()->isHalfTy()) {
251
252 return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
253 Func: [&](auto &Args, auto &ArgTys) {
254 // Change return type of image intrinsic.
255 // Set it to return type of fptrunc.
256 ArgTys[0] = User->getType();
257 });
258 }
259 }
260
261 // Only perform D16 folding if every user of the image sample is
262 // an ExtractElementInst immediately followed by an FPTrunc to half.
263 SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
264 ExtractTruncPairs;
265 bool AllHalfExtracts = true;
266
267 for (User *U : II.users()) {
268 auto *Ext = dyn_cast<ExtractElementInst>(Val: U);
269 if (!Ext || !Ext->hasOneUse()) {
270 AllHalfExtracts = false;
271 break;
272 }
273
274 auto *Tr = dyn_cast<FPTruncInst>(Val: *Ext->user_begin());
275 if (!Tr || !Tr->getType()->isHalfTy()) {
276 AllHalfExtracts = false;
277 break;
278 }
279
280 ExtractTruncPairs.emplace_back(Args&: Ext, Args&: Tr);
281 }
282
283 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
284 auto *VecTy = cast<VectorType>(Val: II.getType());
285 Type *HalfVecTy =
286 VecTy->getWithNewType(EltTy: Type::getHalfTy(C&: II.getContext()));
287
288 // Obtain the original image sample intrinsic's signature
289 // and replace its return type with the half-vector for D16 folding
290 SmallVector<Type *, 8> OverloadTys;
291 if (!Intrinsic::isSignatureValid(F: II.getCalledFunction(), OverloadTys))
292 return std::nullopt;
293
294 OverloadTys[0] = HalfVecTy;
295 Module *M = II.getModule();
296 Function *HalfDecl = Intrinsic::getOrInsertDeclaration(
297 M, id: ImageDimIntr->Intr, OverloadTys);
298
299 II.mutateType(Ty: HalfVecTy);
300 II.setCalledFunction(HalfDecl);
301
302 IRBuilder<> Builder(II.getContext());
303 for (auto &[Ext, Tr] : ExtractTruncPairs) {
304 Value *Idx = Ext->getIndexOperand();
305
306 Builder.SetInsertPoint(Tr);
307
308 Value *HalfExtract = Builder.CreateExtractElement(Vec: &II, Idx);
309 HalfExtract->takeName(V: Tr);
310
311 Tr->replaceAllUsesWith(V: HalfExtract);
312 }
313
314 for (auto &[Ext, Tr] : ExtractTruncPairs) {
315 IC.eraseInstFromFunction(I&: *Tr);
316 IC.eraseInstFromFunction(I&: *Ext);
317 }
318
319 return &II;
320 }
321 }
322 }
323
324 // Try to use A16 or G16
325 if (!ST->hasA16() && !ST->hasG16())
326 return std::nullopt;
327
328 // Address is interpreted as float if the instruction has a sampler or as
329 // unsigned int if there is no sampler.
330 bool HasSampler =
331 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
332 bool FloatCoord = false;
333 // true means derivatives can be converted to 16 bit, coordinates not
334 bool OnlyDerivatives = false;
335
336 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
337 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
338 Value *Coord = II.getOperand(i_nocapture: OperandIndex);
339 // If the values are not derived from 16-bit values, we cannot optimize.
340 if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
341 if (OperandIndex < ImageDimIntr->CoordStart ||
342 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
343 return std::nullopt;
344 }
345 // All gradients can be converted, so convert only them
346 OnlyDerivatives = true;
347 break;
348 }
349
350 assert(OperandIndex == ImageDimIntr->GradientStart ||
351 FloatCoord == Coord->getType()->isFloatingPointTy());
352 FloatCoord = Coord->getType()->isFloatingPointTy();
353 }
354
355 if (!OnlyDerivatives && !ST->hasA16())
356 OnlyDerivatives = true; // Only supports G16
357
358 // Check if there is a bias parameter and if it can be converted to f16
359 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
360 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
361 assert(HasSampler &&
362 "Only image instructions with a sampler can have a bias");
363 if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
364 OnlyDerivatives = true;
365 }
366
367 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
368 ImageDimIntr->CoordStart))
369 return std::nullopt;
370
371 Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
372 : Type::getInt16Ty(C&: II.getContext());
373
374 return modifyIntrinsicCall(
375 OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
376 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
377 if (!OnlyDerivatives) {
378 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
379
380 // Change the bias type
381 if (ImageDimIntr->NumBiasArgs != 0)
382 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
383 }
384
385 unsigned EndIndex =
386 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
387 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
388 OperandIndex < EndIndex; OperandIndex++) {
389 Args[OperandIndex] =
390 convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
391 }
392
393 // Convert the bias
394 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
395 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
396 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
397 }
398 });
399}
400
401bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
402 const Value *Op0, const Value *Op1,
403 InstCombiner &IC) const {
404 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
405 // infinity, gives +0.0. If we can prove we don't have one of the special
406 // cases then we can use a normal multiply instead.
407 // TODO: Create and use isKnownFiniteNonZero instead of just matching
408 // constants here.
409 if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
410 match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
411 // One operand is not zero or infinity or NaN.
412 return true;
413 }
414
415 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
416 if (isKnownNeverInfOrNaN(V: Op0, SQ) && isKnownNeverInfOrNaN(V: Op1, SQ)) {
417 // Neither operand is infinity or NaN.
418 return true;
419 }
420 return false;
421}
422
423/// Match an fpext from half to float, or a constant we can convert.
424static Value *matchFPExtFromF16(Value *Arg) {
425 Value *Src = nullptr;
426 ConstantFP *CFP = nullptr;
427 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: Src))))) {
428 if (Src->getType()->isHalfTy())
429 return Src;
430 } else if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
431 bool LosesInfo;
432 APFloat Val(CFP->getValueAPF());
433 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
434 if (!LosesInfo)
435 return ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
436 }
437 return nullptr;
438}
439
440// Trim all zero components from the end of the vector \p UseV and return
441// an appropriate bitset with known elements.
442static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
443 Instruction *I) {
444 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
445 unsigned VWidth = VTy->getNumElements();
446 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
447
448 for (int i = VWidth - 1; i > 0; --i) {
449 auto *Elt = findScalarElement(V: UseV, EltNo: i);
450 if (!Elt)
451 break;
452
453 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
454 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
455 break;
456 } else {
457 break;
458 }
459
460 DemandedElts.clearBit(BitPosition: i);
461 }
462
463 return DemandedElts;
464}
465
466// Trim elements of the end of the vector \p V, if they are
467// equal to the first element of the vector.
468static APInt defaultComponentBroadcast(Value *V) {
469 auto *VTy = cast<FixedVectorType>(Val: V->getType());
470 unsigned VWidth = VTy->getNumElements();
471 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
472 Value *FirstComponent = findScalarElement(V, EltNo: 0);
473
474 SmallVector<int> ShuffleMask;
475 if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
476 SVI->getShuffleMask(Result&: ShuffleMask);
477
478 for (int I = VWidth - 1; I > 0; --I) {
479 if (ShuffleMask.empty()) {
480 auto *Elt = findScalarElement(V, EltNo: I);
481 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
482 break;
483 } else {
484 // Detect identical elements in the shufflevector result, even though
485 // findScalarElement cannot tell us what that element is.
486 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
487 break;
488 }
489 DemandedElts.clearBit(BitPosition: I);
490 }
491
492 return DemandedElts;
493}
494
495static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
496 IntrinsicInst &II,
497 APInt DemandedElts,
498 int DMaskIdx = -1,
499 bool IsLoad = true);
500
501/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
502static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
503 return (SqrtOp->getType()->isFloatTy() &&
504 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
505 SqrtOp->getType()->isHalfTy();
506}
507
508/// Return true if we can easily prove that use U is uniform.
509static bool isTriviallyUniform(const Use &U) {
510 Value *V = U.get();
511 if (isa<Constant>(Val: V))
512 return true;
513 if (const auto *A = dyn_cast<Argument>(Val: V))
514 return AMDGPU::isArgPassedInSGPR(Arg: A);
515 if (const auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
516 if (!AMDGPU::isIntrinsicAlwaysUniform(IntrID: II->getIntrinsicID()))
517 return false;
518 // If II and U are in different blocks then there is a possibility of
519 // temporal divergence.
520 return II->getParent() == cast<Instruction>(Val: U.getUser())->getParent();
521 }
522 return false;
523}
524
525/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
526///
527/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
528bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
529 IntrinsicInst &II,
530 unsigned LaneArgIdx) const {
531 unsigned MaskBits = ST->getWavefrontSizeLog2();
532 APInt DemandedMask(32, maskTrailingOnes<unsigned>(N: MaskBits));
533
534 KnownBits Known(32);
535 if (IC.SimplifyDemandedBits(I: &II, OpNo: LaneArgIdx, DemandedMask, Known))
536 return true;
537
538 if (!Known.isConstant())
539 return false;
540
541 // Out of bounds indexes may appear in wave64 code compiled for wave32.
542 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
543 // manually fix it up.
544
545 Value *LaneArg = II.getArgOperand(i: LaneArgIdx);
546 Constant *MaskedConst =
547 ConstantInt::get(Ty: LaneArg->getType(), V: Known.getConstant() & DemandedMask);
548 if (MaskedConst != LaneArg) {
549 II.getOperandUse(i: LaneArgIdx).set(MaskedConst);
550 return true;
551 }
552
553 return false;
554}
555
556static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
557 Function &NewCallee, ArrayRef<Value *> Ops) {
558 SmallVector<OperandBundleDef, 2> OpBundles;
559 Old.getOperandBundlesAsDefs(Defs&: OpBundles);
560
561 CallInst *NewCall = B.CreateCall(Callee: &NewCallee, Args: Ops, OpBundles);
562 NewCall->takeName(V: &Old);
563 return NewCall;
564}
565
566// Return true for sequences of instructions that effectively assign
567// each lane to its thread ID
568static bool isThreadID(const GCNSubtarget &ST, Value *V) {
569 // Case 1:
570 // wave32: mbcnt_lo(-1, 0)
571 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
572 auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_ConstantInt<-1>(),
573 Op1: m_ConstantInt<0>());
574 auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
575 Op0: m_ConstantInt<-1>(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
576 Op0: m_ConstantInt<-1>(), Op1: m_ConstantInt<0>()));
577 if (ST.isWave32() && match(V, P: W32Pred))
578 return true;
579 if (ST.isWave64() && match(V, P: W64Pred))
580 return true;
581
582 return false;
583}
584
585Instruction *
586GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
587 IntrinsicInst &II) const {
588 const auto IID = II.getIntrinsicID();
589 assert(IID == Intrinsic::amdgcn_readlane ||
590 IID == Intrinsic::amdgcn_readfirstlane ||
591 IID == Intrinsic::amdgcn_permlane64);
592
593 Instruction *OpInst = dyn_cast<Instruction>(Val: II.getOperand(i_nocapture: 0));
594
595 // Only do this if both instructions are in the same block
596 // (so the exec mask won't change) and the readlane is the only user of its
597 // operand.
598 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
599 return nullptr;
600
601 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
602
603 // If this is a readlane, check that the second operand is a constant, or is
604 // defined before OpInst so we know it's safe to move this intrinsic higher.
605 Value *LaneID = nullptr;
606 if (IsReadLane) {
607 LaneID = II.getOperand(i_nocapture: 1);
608
609 // readlane take an extra operand for the lane ID, so we must check if that
610 // LaneID value can be used at the point where we want to move the
611 // intrinsic.
612 if (auto *LaneIDInst = dyn_cast<Instruction>(Val: LaneID)) {
613 if (!IC.getDominatorTree().dominates(Def: LaneIDInst, User: OpInst))
614 return nullptr;
615 }
616 }
617
618 // Hoist the intrinsic (II) through OpInst.
619 //
620 // (II (OpInst x)) -> (OpInst (II x))
621 const auto DoIt = [&](unsigned OpIdx,
622 Function *NewIntrinsic) -> Instruction * {
623 SmallVector<Value *, 2> Ops{OpInst->getOperand(i: OpIdx)};
624 if (IsReadLane)
625 Ops.push_back(Elt: LaneID);
626
627 // Rewrite the intrinsic call.
628 CallInst *NewII = rewriteCall(B&: IC.Builder, Old&: II, NewCallee&: *NewIntrinsic, Ops);
629
630 // Rewrite OpInst so it takes the result of the intrinsic now.
631 Instruction &NewOp = *OpInst->clone();
632 NewOp.setOperand(i: OpIdx, Val: NewII);
633 return &NewOp;
634 };
635
636 // TODO(?): Should we do more with permlane64?
637 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Val: OpInst))
638 return nullptr;
639
640 if (isa<UnaryOperator>(Val: OpInst))
641 return DoIt(0, II.getCalledFunction());
642
643 if (isa<CastInst>(Val: OpInst)) {
644 Value *Src = OpInst->getOperand(i: 0);
645 Type *SrcTy = Src->getType();
646 if (!isTypeLegal(Ty: SrcTy))
647 return nullptr;
648
649 Function *Remangled =
650 Intrinsic::getOrInsertDeclaration(M: II.getModule(), id: IID, OverloadTys: {SrcTy});
651 return DoIt(0, Remangled);
652 }
653
654 // We can also hoist through binary operators if the other operand is uniform.
655 if (isa<BinaryOperator>(Val: OpInst)) {
656 // FIXME: If we had access to UniformityInfo here we could just check
657 // if the operand is uniform.
658 if (isTriviallyUniform(U: OpInst->getOperandUse(i: 0)))
659 return DoIt(1, II.getCalledFunction());
660 if (isTriviallyUniform(U: OpInst->getOperandUse(i: 1)))
661 return DoIt(0, II.getCalledFunction());
662 }
663
664 return nullptr;
665}
666
667/// Evaluate V as a function of the lane ID and return its value on Lane, or
668/// std::nullopt if V is not a closed-form expression of the lane ID.
669static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
670 const GCNSubtarget &ST,
671 const DataLayout &DL,
672 unsigned Depth = 0) {
673 if (Depth >= MaxAnalysisRecursionDepth)
674 return std::nullopt;
675
676 // Poison/undef in the index expression: bail and let InstCombine fold the
677 // intrinsic the usual way.
678 if (isa<UndefValue>(Val: V))
679 return std::nullopt;
680
681 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val: V))
682 return CI->getZExtValue();
683
684 if (isThreadID(ST, V))
685 return Lane;
686
687 const BinaryOperator *BO = dyn_cast<BinaryOperator>(Val: V);
688 if (!BO)
689 return std::nullopt;
690
691 std::optional<unsigned> LHS =
692 evalLaneExpr(V: BO->getOperand(i_nocapture: 0), Lane, ST, DL, Depth: Depth + 1);
693 if (!LHS)
694 return std::nullopt;
695 std::optional<unsigned> RHS =
696 evalLaneExpr(V: BO->getOperand(i_nocapture: 1), Lane, ST, DL, Depth: Depth + 1);
697 if (!RHS)
698 return std::nullopt;
699
700 Type *Ty = BO->getType();
701 Constant *Ops[] = {ConstantInt::get(Ty, V: *LHS), ConstantInt::get(Ty, V: *RHS)};
702 auto *CI =
703 dyn_cast_or_null<ConstantInt>(Val: ConstantFoldInstOperands(I: BO, Ops, DL));
704 return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
705}
706
707/// Build the per-lane shuffle map by evaluating Index for every lane in the
708/// wave. Returns false if any lane index is non-constant or out of range.
709static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
710 SmallVectorImpl<uint8_t> &Ids,
711 const DataLayout &DL) {
712 unsigned WaveSize = ST.getWavefrontSize();
713 Ids.resize(N: WaveSize);
714 for (unsigned Lane : seq(Size: WaveSize)) {
715 std::optional<unsigned> Val = evalLaneExpr(V: Index, Lane, ST, DL);
716 if (!Val || *Val >= WaveSize)
717 return false;
718 Ids[Lane] = *Val;
719 }
720 return true;
721}
722
723/// Lanes are partitioned into groups of Period; each group is a translated
724/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
725template <unsigned Period>
726static bool hasPeriodicLayout(ArrayRef<uint8_t> Ids) {
727 static_assert(isPowerOf2_32(Value: Period), "Period must be a power of two");
728 for (unsigned I = Period, E = Ids.size(); I < E; ++I)
729 if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
730 return false;
731 return true;
732}
733
734/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
735/// in the same N-lane row, and the pattern repeats periodically across rows.
736template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
737 for (unsigned I = 0; I < N; ++I)
738 if (Ids[I] >= N)
739 return false;
740 return hasPeriodicLayout<N>(Ids);
741}
742
743static constexpr auto isQuadPattern = isRowPattern<4>;
744static constexpr auto isHalfRowPattern = isRowPattern<8>;
745static constexpr auto isFullRowPattern = isRowPattern<16>;
746
747/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
748/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
749/// [7:6]=Ids[3].
750static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
751 if (!isQuadPattern(Ids))
752 return std::nullopt;
753 return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
754}
755
756/// Match an N-lane reversal (mirror) pattern.
757template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
758 if (!isRowPattern<N>(Ids))
759 return false;
760 for (unsigned J = 0; J < N; ++J)
761 if (Ids[J] != (N - 1) - J)
762 return false;
763 return true;
764}
765
766static constexpr auto matchHalfRowMirrorPattern = matchMirrorPattern<8>;
767static constexpr auto matchFullRowMirrorPattern = matchMirrorPattern<16>;
768
769/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
770static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
771 if (Ids[0] == 0 || !isFullRowPattern(Ids))
772 return std::nullopt;
773 for (unsigned J = 1; J < 16; ++J)
774 if (Ids[J] != (Ids[0] + J) % 16)
775 return std::nullopt;
776 return 16u - Ids[0];
777}
778
779/// Match a row-share pattern: all 16 lanes of each row read the same source
780/// lane. Returns the shared source lane index in [0, 16).
781static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
782 if (!isFullRowPattern(Ids))
783 return std::nullopt;
784 if (!all_equal(Range: Ids.take_front(N: 16)))
785 return std::nullopt;
786 return Ids[0];
787}
788
789/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
790/// with Mask in [1, 15].
791static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
792 unsigned Mask = Ids[0];
793 if (Mask == 0 || !isFullRowPattern(Ids))
794 return std::nullopt;
795 for (unsigned J = 0; J < 16; ++J)
796 if (Ids[J] != (Mask ^ J))
797 return std::nullopt;
798 return Mask;
799}
800
801/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
802/// 24-bit selector (three bits per output lane).
803static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
804 if (!isHalfRowPattern(Ids))
805 return std::nullopt;
806 unsigned Selector = 0;
807 for (unsigned J = 0; J < 8; ++J)
808 Selector |= Ids[J] << (J * 3);
809 return Selector;
810}
811
812/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
813/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
814/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
815static uint64_t computePermlane16Masks(ArrayRef<uint8_t> Ids) {
816 uint64_t Sel = 0;
817 for (unsigned J = 0; J < 16; ++J)
818 Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
819 return Sel;
820}
821
822/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
823/// wave64 targets.
824static bool matchHalfWaveSwapPattern(ArrayRef<uint8_t> Ids) {
825 if (Ids.size() != 64)
826 return false;
827 for (unsigned J = 0; J < 64; ++J)
828 if (Ids[J] != (J ^ 32))
829 return false;
830 return true;
831}
832
833/// Match a cross-row permutation suitable for v_permlanex16: every lane in
834/// the low 16-lane half reads from the high half of its own row, and vice
835/// versa.
836static bool isCrossRowPattern(ArrayRef<uint8_t> Ids) {
837 if (!hasPeriodicLayout<32>(Ids))
838 return false;
839 for (unsigned J = 0; J < 16; ++J) {
840 if (Ids[J] < 16 || Ids[J] >= 32)
841 return false;
842 if (Ids[J + 16] != Ids[J] - 16)
843 return false;
844 }
845 return true;
846}
847
848/// Match a DS_SWIZZLE bitmask-mode permutation:
849/// dst_lane = ((src_lane & AND) | OR) ^ XOR
850/// with each mask being five bits. Returns the encoded swizzle immediate.
851/// The hardware applies the formula independently within each 32-lane group,
852/// so on wave64 the high group must replicate the low one (translated by 32).
853static std::optional<unsigned>
854matchDsSwizzleBitmaskPattern(ArrayRef<uint8_t> Ids) {
855 if (!hasPeriodicLayout<32>(Ids))
856 return std::nullopt;
857
858 // The formula is per-bit: output bit B depends only on input bit B. Probe
859 // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
860 // and XOR[B] carries the constant offset; otherwise it is a constant bit
861 // encoded in OR (with AND[B]=0, XOR[B]=0).
862 unsigned AndMask = 0, OrMask = 0, XorMask = 0;
863 for (unsigned B = 0; B < 5; ++B) {
864 unsigned Bit0 = (Ids[0] >> B) & 1;
865 unsigned Bit1 = (Ids[1u << B] >> B) & 1;
866 if (Bit0 != Bit1) {
867 AndMask |= 1u << B;
868 XorMask |= Bit0 << B;
869 } else {
870 OrMask |= Bit0 << B;
871 }
872 }
873
874 // The per-bit derivation assumes bit independence; verify the masks
875 // actually reproduce every lane in the 32-lane group.
876 for (unsigned I : seq(Size: 32u)) {
877 unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
878 if (Ids[I] != Expected)
879 return std::nullopt;
880 }
881
882 return AMDGPU::Swizzle::BITMASK_PERM_ENC |
883 AndMask << AMDGPU::Swizzle::BITMASK_AND_SHIFT |
884 OrMask << AMDGPU::Swizzle::BITMASK_OR_SHIFT |
885 XorMask << AMDGPU::Swizzle::BITMASK_XOR_SHIFT;
886}
887
888/// Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation
889/// of all 32 lanes within each 32-lane group by a constant N in [0, 31],
890/// i.e. dst_lane = (src_lane + N) % 32. On wave64, hasPeriodicLayout<32>
891/// ensures both 32-lane groups rotate by the same amount.
892static std::optional<unsigned>
893matchDsSwizzleRotatePattern(ArrayRef<uint8_t> Ids) {
894 if (!hasPeriodicLayout<32>(Ids))
895 return std::nullopt;
896
897 // Determine the rotation amount from lane 0: every lane must read from
898 // lane (I + N) % 32 where N = Ids[0] and 0 <= N <= 31.
899 unsigned N = Ids[0];
900 if (N >= 32)
901 return std::nullopt;
902
903 for (unsigned I = 0; I < 32; ++I)
904 if (Ids[I] != (I + N) % 32)
905 return std::nullopt;
906
907 return AMDGPU::Swizzle::ROTATE_MODE_ENC |
908 (N << AMDGPU::Swizzle::ROTATE_SIZE_SHIFT);
909}
910
911/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
912/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
913/// be folded into a consuming VALU op by GCNDPPCombine.
914static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
915 Type *Ty = Val->getType();
916 return B.CreateIntrinsic(ID: Intrinsic::amdgcn_update_dpp, OverloadTypes: {Ty},
917 Args: {PoisonValue::get(T: Ty), Val, B.getInt32(C: Ctrl),
918 B.getInt32(C: 0xF), B.getInt32(C: 0xF), B.getTrue()});
919}
920
921/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
922static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
923 return B.CreateIntrinsic(ID: Intrinsic::amdgcn_mov_dpp8, OverloadTypes: {Val->getType()},
924 Args: {Val, B.getInt32(C: Selector)});
925}
926
927/// Emit v_permlane16 with the precomputed lane-select halves.
928static Value *createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo,
929 uint32_t Hi) {
930 Type *Ty = Val->getType();
931 return B.CreateIntrinsic(ID: Intrinsic::amdgcn_permlane16, OverloadTypes: {Ty},
932 Args: {PoisonValue::get(T: Ty), Val, B.getInt32(C: Lo),
933 B.getInt32(C: Hi), B.getFalse(), B.getFalse()});
934}
935
936/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
937/// lane reads from the other 16-lane half of the same row.
938static Value *createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo,
939 uint32_t Hi) {
940 Type *Ty = Val->getType();
941 return B.CreateIntrinsic(ID: Intrinsic::amdgcn_permlanex16, OverloadTypes: {Ty},
942 Args: {PoisonValue::get(T: Ty), Val, B.getInt32(C: Lo),
943 B.getInt32(C: Hi), B.getFalse(), B.getFalse()});
944}
945
946/// Emit ds_swizzle with the given immediate, bitcasting/converting between
947/// pointer/float types and i32 as required by the intrinsic signature.
948static Value *createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset,
949 const DataLayout &DL) {
950 Type *OrigTy = Val->getType();
951 assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
952 "ds_swizzle only supports 32-bit operands");
953 IntegerType *I32Ty = B.getInt32Ty();
954 Value *Src = Val;
955 if (OrigTy->isPointerTy())
956 Src = B.CreatePtrToInt(V: Src, DestTy: I32Ty);
957 else if (OrigTy != I32Ty)
958 Src = B.CreateBitCast(V: Src, DestTy: I32Ty);
959 Value *Result = B.CreateIntrinsic(ID: Intrinsic::amdgcn_ds_swizzle, OverloadTypes: {},
960 Args: {Src, B.getInt32(C: Offset)});
961 if (OrigTy->isPointerTy())
962 return B.CreateIntToPtr(V: Result, DestTy: OrigTy);
963 if (OrigTy != I32Ty)
964 return B.CreateBitCast(V: Result, DestTy: OrigTy);
965 return Result;
966}
967
968/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
969static Value *createPermlane64(IRBuilderBase &B, Value *Val) {
970 return B.CreateIntrinsic(ID: Intrinsic::amdgcn_permlane64, OverloadTypes: {Val->getType()},
971 Args: {Val});
972}
973
974/// Given a shuffle map, try to emit the best hardware intrinsic.
975static Value *matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src,
976 ArrayRef<uint8_t> Ids,
977 const GCNSubtarget &ST,
978 const DataLayout &DL) {
979 // Identity shuffle (every lane reads itself) folds to the source value.
980 if (all_of(Range: enumerate(First&: Ids),
981 P: [](const auto &E) { return E.value() == E.index(); }))
982 return Src;
983
984 // Uniform shuffle (all lanes read the same value) is handled by cheaper
985 // broadcast/readlane intrinsics.
986 if (all_equal(Range&: Ids))
987 return nullptr;
988
989 if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
990 if (ST.hasDPP())
991 return createUpdateDpp(B, Val: Src, Ctrl: *QP);
992 return createDsSwizzle(B, Val: Src, Offset: AMDGPU::Swizzle::QUAD_PERM_ENC | *QP, DL);
993 }
994
995 if (ST.hasDPP()) {
996 if (matchHalfRowMirrorPattern(Ids))
997 return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_HALF_MIRROR);
998 if (matchFullRowMirrorPattern(Ids))
999 return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_MIRROR);
1000 if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
1001 return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
1002 }
1003
1004 // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
1005 if (ST.hasDPPRowShare()) {
1006 if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
1007 return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
1008 }
1009
1010 if (ST.hasDPP() && ST.hasGFX10Insts()) {
1011 if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
1012 return createUpdateDpp(B, Val: Src, Ctrl: AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
1013 }
1014
1015 if (ST.hasDPP8()) {
1016 if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
1017 return createMovDpp8(B, Val: Src, Selector: *Sel);
1018 }
1019
1020 if (ST.hasPermlane16Insts()) {
1021 if (isFullRowPattern(Ids)) {
1022 uint64_t Sel = computePermlane16Masks(Ids);
1023 return createPermlane16(B, Val: Src, Lo: Lo_32(Value: Sel), Hi: Hi_32(Value: Sel));
1024 }
1025 // Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
1026 if (isCrossRowPattern(Ids)) {
1027 uint64_t Sel = computePermlane16Masks(Ids);
1028 return createPermlaneX16(B, Val: Src, Lo: Lo_32(Value: Sel), Hi: Hi_32(Value: Sel));
1029 }
1030 }
1031
1032 // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1033 // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
1034 // is available on every target that has ds_swizzle.
1035 if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1036 return createDsSwizzle(B, Val: Src, Offset: *Imm, DL);
1037
1038 // DS_SWIZZLE rotate mode (GFX9+): handles cyclic 32-lane rotations that
1039 // bitmask mode cannot express (e.g. +1 mod 32 requires inter-bit carry).
1040 if (ST.hasDsSwizzleRotateMode()) {
1041 if (std::optional<unsigned> Imm = matchDsSwizzleRotatePattern(Ids))
1042 return createDsSwizzle(B, Val: Src, Offset: *Imm, DL);
1043 }
1044
1045 if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1046 return createPermlane64(B, Val: Src);
1047
1048 return nullptr;
1049}
1050
1051/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1052/// function of the lane ID into a hardware-specific lane permutation intrinsic.
1053static std::optional<Instruction *>
1054tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II,
1055 const GCNSubtarget &ST) {
1056 const DataLayout &DL = IC.getDataLayout();
1057 if (DL.getTypeSizeInBits(Ty: II.getType()) != 32)
1058 return std::nullopt;
1059
1060 if (!ST.isWaveSizeKnown())
1061 return std::nullopt;
1062
1063 unsigned WaveSize = ST.getWavefrontSize();
1064 bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
1065 Value *Src = II.getArgOperand(i: IsBpermute ? 1 : 0);
1066 Value *Index = II.getArgOperand(i: IsBpermute ? 0 : 1);
1067
1068 SmallVector<uint8_t, 64> Ids;
1069 if (IsBpermute) {
1070 Ids.resize(N: WaveSize);
1071 for (unsigned Lane : seq(Size: WaveSize)) {
1072 std::optional<unsigned> Val = evalLaneExpr(V: Index, Lane, ST, DL);
1073 if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
1074 return std::nullopt;
1075 Ids[Lane] = *Val >> 2;
1076 }
1077 } else {
1078 if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1079 return std::nullopt;
1080 }
1081
1082 Value *Result = matchShuffleToHWIntrinsic(B&: IC.Builder, Src, Ids, ST, DL);
1083 if (!Result)
1084 return std::nullopt;
1085
1086 return IC.replaceInstUsesWith(I&: II, V: Result);
1087}
1088std::optional<Instruction *>
1089GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
1090 Intrinsic::ID IID = II.getIntrinsicID();
1091 switch (IID) {
1092 case Intrinsic::amdgcn_implicitarg_ptr: {
1093 if (II.getFunction()->hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr"))
1094 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1095 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(F: *II.getFunction());
1096
1097 uint64_t CurrentOrNullBytes =
1098 II.getAttributes().getRetDereferenceableOrNullBytes();
1099 if (CurrentOrNullBytes != 0) {
1100 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1101 // into dereferenceable(max(A, B))
1102 uint64_t NewBytes = std::max(a: CurrentOrNullBytes, b: ImplicitArgBytes);
1103 II.addRetAttr(
1104 Attr: Attribute::getWithDereferenceableBytes(Context&: II.getContext(), Bytes: NewBytes));
1105 II.removeRetAttr(Kind: Attribute::DereferenceableOrNull);
1106 return &II;
1107 }
1108
1109 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1110 uint64_t NewBytes = std::max(a: CurrentBytes, b: ImplicitArgBytes);
1111 if (NewBytes != CurrentBytes) {
1112 II.addRetAttr(
1113 Attr: Attribute::getWithDereferenceableBytes(Context&: II.getContext(), Bytes: NewBytes));
1114 return &II;
1115 }
1116
1117 return std::nullopt;
1118 }
1119 case Intrinsic::amdgcn_rcp: {
1120 Value *Src = II.getArgOperand(i: 0);
1121 if (isa<PoisonValue>(Val: Src))
1122 return IC.replaceInstUsesWith(I&: II, V: Src);
1123
1124 // TODO: Move to ConstantFolding/InstSimplify?
1125 if (isa<UndefValue>(Val: Src)) {
1126 Type *Ty = II.getType();
1127 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
1128 return IC.replaceInstUsesWith(I&: II, V: QNaN);
1129 }
1130
1131 if (II.isStrictFP())
1132 break;
1133
1134 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
1135 const APFloat &ArgVal = C->getValueAPF();
1136 APFloat Val(ArgVal.getSemantics(), 1);
1137 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
1138
1139 // This is more precise than the instruction may give.
1140 //
1141 // TODO: The instruction always flushes denormal results (except for f16),
1142 // should this also?
1143 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
1144 }
1145
1146 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
1147 if (!FMF.allowContract())
1148 break;
1149 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
1150 if (!SrcCI)
1151 break;
1152
1153 auto IID = SrcCI->getIntrinsicID();
1154 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1155 //
1156 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1157 // relaxed.
1158 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
1159 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
1160 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1161 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
1162 break;
1163
1164 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1165 break;
1166
1167 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1168 M: SrcCI->getModule(), id: Intrinsic::amdgcn_rsq, OverloadTys: {SrcCI->getType()});
1169
1170 InnerFMF |= FMF;
1171 II.setFastMathFlags(InnerFMF);
1172
1173 II.setCalledFunction(NewDecl);
1174 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
1175 }
1176
1177 break;
1178 }
1179 case Intrinsic::amdgcn_sqrt:
1180 case Intrinsic::amdgcn_rsq:
1181 case Intrinsic::amdgcn_tanh: {
1182 Value *Src = II.getArgOperand(i: 0);
1183 if (isa<PoisonValue>(Val: Src))
1184 return IC.replaceInstUsesWith(I&: II, V: Src);
1185
1186 // TODO: Move to ConstantFolding/InstSimplify?
1187 if (isa<UndefValue>(Val: Src)) {
1188 Type *Ty = II.getType();
1189 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
1190 return IC.replaceInstUsesWith(I&: II, V: QNaN);
1191 }
1192
1193 // f16 amdgcn.sqrt is identical to regular sqrt.
1194 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1195 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1196 M: II.getModule(), id: Intrinsic::sqrt, OverloadTys: {II.getType()});
1197 II.setCalledFunction(NewDecl);
1198 return &II;
1199 }
1200
1201 break;
1202 }
1203 case Intrinsic::amdgcn_log:
1204 case Intrinsic::amdgcn_exp2: {
1205 const bool IsLog = IID == Intrinsic::amdgcn_log;
1206 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1207 Value *Src = II.getArgOperand(i: 0);
1208 Type *Ty = II.getType();
1209
1210 if (isa<PoisonValue>(Val: Src))
1211 return IC.replaceInstUsesWith(I&: II, V: Src);
1212
1213 if (IC.getSimplifyQuery().isUndefValue(V: Src))
1214 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
1215
1216 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
1217 if (C->isInfinity()) {
1218 // exp2(+inf) -> +inf
1219 // log2(+inf) -> +inf
1220 if (!C->isNegative())
1221 return IC.replaceInstUsesWith(I&: II, V: C);
1222
1223 // exp2(-inf) -> 0
1224 if (IsExp && C->isNegative())
1225 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
1226 }
1227
1228 if (II.isStrictFP())
1229 break;
1230
1231 if (C->isNaN()) {
1232 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
1233 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1234 }
1235
1236 // f32 instruction doesn't handle denormals, f16 does.
1237 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
1238 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
1239 : ConstantFP::get(Ty, V: 1.0);
1240 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
1241 }
1242
1243 if (IsLog && C->isNegative())
1244 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
1245
1246 // TODO: Full constant folding matching hardware behavior.
1247 }
1248
1249 break;
1250 }
1251 case Intrinsic::amdgcn_frexp_mant:
1252 case Intrinsic::amdgcn_frexp_exp: {
1253 Value *Src = II.getArgOperand(i: 0);
1254 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
1255 int Exp;
1256 APFloat Significand =
1257 frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
1258
1259 if (IID == Intrinsic::amdgcn_frexp_mant) {
1260 return IC.replaceInstUsesWith(
1261 I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
1262 }
1263
1264 // Match instruction special case behavior.
1265 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
1266 Exp = 0;
1267
1268 return IC.replaceInstUsesWith(I&: II,
1269 V: ConstantInt::getSigned(Ty: II.getType(), V: Exp));
1270 }
1271
1272 if (isa<PoisonValue>(Val: Src))
1273 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1274
1275 if (isa<UndefValue>(Val: Src)) {
1276 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1277 }
1278
1279 break;
1280 }
1281 case Intrinsic::amdgcn_class: {
1282 Value *Src0 = II.getArgOperand(i: 0);
1283 Value *Src1 = II.getArgOperand(i: 1);
1284 const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
1285 if (CMask) {
1286 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1287 M: II.getModule(), id: Intrinsic::is_fpclass, OverloadTys: Src0->getType()));
1288
1289 // Clamp any excess bits, as they're illegal for the generic intrinsic.
1290 II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
1291 V: CMask->getZExtValue() & fcAllFlags));
1292 return &II;
1293 }
1294
1295 // Propagate poison.
1296 if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
1297 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1298
1299 // llvm.amdgcn.class(_, undef) -> false
1300 if (IC.getSimplifyQuery().isUndefValue(V: Src1))
1301 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
1302
1303 // llvm.amdgcn.class(undef, mask) -> mask != 0
1304 if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
1305 Value *CmpMask = IC.Builder.CreateICmpNE(
1306 LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
1307 return IC.replaceInstUsesWith(I&: II, V: CmpMask);
1308 }
1309 break;
1310 }
1311 case Intrinsic::amdgcn_cvt_pkrtz: {
1312 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
1313 Type *HalfTy = Type::getHalfTy(C&: Arg->getContext());
1314
1315 if (isa<PoisonValue>(Val: Arg))
1316 return PoisonValue::get(T: HalfTy);
1317 if (isa<UndefValue>(Val: Arg))
1318 return UndefValue::get(T: HalfTy);
1319
1320 ConstantFP *CFP = nullptr;
1321 if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
1322 bool LosesInfo;
1323 APFloat Val(CFP->getValueAPF());
1324 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
1325 return ConstantFP::get(Ty: HalfTy, V: Val);
1326 }
1327
1328 Value *Src = nullptr;
1329 if (match(V: Arg, P: m_FPExt(Op: m_Value(V&: Src)))) {
1330 if (Src->getType()->isHalfTy())
1331 return Src;
1332 }
1333
1334 return nullptr;
1335 };
1336
1337 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(i: 0))) {
1338 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(i: 1))) {
1339 Value *V = PoisonValue::get(T: II.getType());
1340 V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src0, Idx: (uint64_t)0);
1341 V = IC.Builder.CreateInsertElement(Vec: V, NewElt: Src1, Idx: (uint64_t)1);
1342 return IC.replaceInstUsesWith(I&: II, V);
1343 }
1344 }
1345
1346 break;
1347 }
1348 case Intrinsic::amdgcn_cvt_pknorm_i16:
1349 case Intrinsic::amdgcn_cvt_pknorm_u16:
1350 case Intrinsic::amdgcn_cvt_pk_i16:
1351 case Intrinsic::amdgcn_cvt_pk_u16: {
1352 Value *Src0 = II.getArgOperand(i: 0);
1353 Value *Src1 = II.getArgOperand(i: 1);
1354
1355 // TODO: Replace call with scalar operation if only one element is poison.
1356 if (isa<PoisonValue>(Val: Src0) && isa<PoisonValue>(Val: Src1))
1357 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1358
1359 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
1360 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1361 }
1362
1363 break;
1364 }
1365 case Intrinsic::amdgcn_cvt_off_f32_i4: {
1366 Value* Arg = II.getArgOperand(i: 0);
1367 Type *Ty = II.getType();
1368
1369 if (isa<PoisonValue>(Val: Arg))
1370 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: Ty));
1371
1372 if(IC.getSimplifyQuery().isUndefValue(V: Arg))
1373 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty));
1374
1375 ConstantInt *CArg = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1376 if (!CArg)
1377 break;
1378
1379 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1380 constexpr size_t ResValsSize = 16;
1381 static constexpr float ResVals[ResValsSize] = {
1382 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1383 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1384 Constant *Res =
1385 ConstantFP::get(Ty, V: ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1386 return IC.replaceInstUsesWith(I&: II, V: Res);
1387 }
1388 case Intrinsic::amdgcn_ubfe:
1389 case Intrinsic::amdgcn_sbfe: {
1390 // Decompose simple cases into standard shifts.
1391 Value *Src = II.getArgOperand(i: 0);
1392 if (isa<UndefValue>(Val: Src)) {
1393 return IC.replaceInstUsesWith(I&: II, V: Src);
1394 }
1395
1396 unsigned Width;
1397 Type *Ty = II.getType();
1398 unsigned IntSize = Ty->getIntegerBitWidth();
1399
1400 ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
1401 if (CWidth) {
1402 Width = CWidth->getZExtValue();
1403 if ((Width & (IntSize - 1)) == 0) {
1404 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
1405 }
1406
1407 // Hardware ignores high bits, so remove those.
1408 if (Width >= IntSize) {
1409 return IC.replaceOperand(
1410 I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
1411 }
1412 }
1413
1414 unsigned Offset;
1415 ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
1416 if (COffset) {
1417 Offset = COffset->getZExtValue();
1418 if (Offset >= IntSize) {
1419 return IC.replaceOperand(
1420 I&: II, OpNum: 1,
1421 V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
1422 }
1423 }
1424
1425 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1426
1427 if (!CWidth || !COffset)
1428 break;
1429
1430 // The case of Width == 0 is handled above, which makes this transformation
1431 // safe. If Width == 0, then the ashr and lshr instructions become poison
1432 // value since the shift amount would be equal to the bit size.
1433 assert(Width != 0);
1434
1435 // TODO: This allows folding to undef when the hardware has specific
1436 // behavior?
1437 if (Offset + Width < IntSize) {
1438 Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
1439 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
1440 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
1441 RightShift->takeName(V: &II);
1442 return IC.replaceInstUsesWith(I&: II, V: RightShift);
1443 }
1444
1445 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
1446 : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
1447
1448 RightShift->takeName(V: &II);
1449 return IC.replaceInstUsesWith(I&: II, V: RightShift);
1450 }
1451 case Intrinsic::amdgcn_exp:
1452 case Intrinsic::amdgcn_exp_row:
1453 case Intrinsic::amdgcn_exp_compr: {
1454 ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
1455 unsigned EnBits = En->getZExtValue();
1456 if (EnBits == 0xf)
1457 break; // All inputs enabled.
1458
1459 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1460 bool Changed = false;
1461 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1462 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1463 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1464 Value *Src = II.getArgOperand(i: I + 2);
1465 if (!isa<PoisonValue>(Val: Src)) {
1466 IC.replaceOperand(I&: II, OpNum: I + 2, V: PoisonValue::get(T: Src->getType()));
1467 Changed = true;
1468 }
1469 }
1470 }
1471
1472 if (Changed) {
1473 return &II;
1474 }
1475
1476 break;
1477 }
1478 case Intrinsic::amdgcn_fmed3: {
1479 Value *Src0 = II.getArgOperand(i: 0);
1480 Value *Src1 = II.getArgOperand(i: 1);
1481 Value *Src2 = II.getArgOperand(i: 2);
1482
1483 for (Value *Src : {Src0, Src1, Src2}) {
1484 if (isa<PoisonValue>(Val: Src))
1485 return IC.replaceInstUsesWith(I&: II, V: Src);
1486 }
1487
1488 if (II.isStrictFP())
1489 break;
1490
1491 // med3 with a nan input acts like
1492 // v_min_f32(v_min_f32(s0, s1), s2)
1493 //
1494 // Signalingness is ignored with ieee=0, so we fold to
1495 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1496 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1497 // returned signaling nan will not be quieted.
1498
1499 // ieee=1
1500 // s0 snan: s2
1501 // s1 snan: s2
1502 // s2 snan: qnan
1503
1504 // s0 qnan: min(s1, s2)
1505 // s1 qnan: min(s0, s2)
1506 // s2 qnan: min(s0, s1)
1507
1508 // ieee=0
1509 // s0 _nan: min(s1, s2)
1510 // s1 _nan: min(s0, s2)
1511 // s2 _nan: min(s0, s1)
1512
1513 // med3 behavior with infinity
1514 // s0 +inf: max(s1, s2)
1515 // s1 +inf: max(s0, s2)
1516 // s2 +inf: max(s0, s1)
1517 // s0 -inf: min(s1, s2)
1518 // s1 -inf: min(s0, s2)
1519 // s2 -inf: min(s0, s1)
1520
1521 // Checking for NaN before canonicalization provides better fidelity when
1522 // mapping other operations onto fmed3 since the order of operands is
1523 // unchanged.
1524 Value *V = nullptr;
1525 const APFloat *ConstSrc0 = nullptr;
1526 const APFloat *ConstSrc1 = nullptr;
1527 const APFloat *ConstSrc2 = nullptr;
1528
1529 if ((match(V: Src0, P: m_APFloat(Res&: ConstSrc0)) &&
1530 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1531 isa<UndefValue>(Val: Src0)) {
1532 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1533 switch (fpenvIEEEMode(I: II)) {
1534 case KnownIEEEMode::On:
1535 // TODO: If Src2 is snan, does it need quieting?
1536 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1537 return IC.replaceInstUsesWith(I&: II, V: Src2);
1538
1539 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src1, RHS: Src2)
1540 : IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
1541 break;
1542 case KnownIEEEMode::Off:
1543 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src1, RHS: Src2)
1544 : IC.Builder.CreateMinimumNum(LHS: Src1, RHS: Src2);
1545 break;
1546 case KnownIEEEMode::Unknown:
1547 break;
1548 }
1549 } else if ((match(V: Src1, P: m_APFloat(Res&: ConstSrc1)) &&
1550 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1551 isa<UndefValue>(Val: Src1)) {
1552 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1553 switch (fpenvIEEEMode(I: II)) {
1554 case KnownIEEEMode::On:
1555 // TODO: If Src2 is snan, does it need quieting?
1556 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1557 return IC.replaceInstUsesWith(I&: II, V: Src2);
1558
1559 V = IsPosInfinity ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src2)
1560 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
1561 break;
1562 case KnownIEEEMode::Off:
1563 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src2)
1564 : IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src2);
1565 break;
1566 case KnownIEEEMode::Unknown:
1567 break;
1568 }
1569 } else if ((match(V: Src2, P: m_APFloat(Res&: ConstSrc2)) &&
1570 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1571 isa<UndefValue>(Val: Src2)) {
1572 switch (fpenvIEEEMode(I: II)) {
1573 case KnownIEEEMode::On:
1574 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1575 auto *Quieted = ConstantFP::get(Ty: II.getType(), V: ConstSrc2->makeQuiet());
1576 return IC.replaceInstUsesWith(I&: II, V: Quieted);
1577 }
1578
1579 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1580 ? IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1)
1581 : IC.Builder.CreateMinNum(LHS: Src0, RHS: Src1);
1582 break;
1583 case KnownIEEEMode::Off:
1584 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1585 ? IC.Builder.CreateMinimumNum(LHS: Src0, RHS: Src1)
1586 : IC.Builder.CreateMaximumNum(LHS: Src0, RHS: Src1);
1587 break;
1588 case KnownIEEEMode::Unknown:
1589 break;
1590 }
1591 }
1592
1593 if (V) {
1594 if (auto *CI = dyn_cast<CallInst>(Val: V)) {
1595 CI->copyFastMathFlags(I: &II);
1596 CI->takeName(V: &II);
1597 }
1598 return IC.replaceInstUsesWith(I&: II, V);
1599 }
1600
1601 bool Swap = false;
1602 // Canonicalize constants to RHS operands.
1603 //
1604 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1605 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1606 std::swap(a&: Src0, b&: Src1);
1607 Swap = true;
1608 }
1609
1610 if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
1611 std::swap(a&: Src1, b&: Src2);
1612 Swap = true;
1613 }
1614
1615 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
1616 std::swap(a&: Src0, b&: Src1);
1617 Swap = true;
1618 }
1619
1620 if (Swap) {
1621 II.setArgOperand(i: 0, v: Src0);
1622 II.setArgOperand(i: 1, v: Src1);
1623 II.setArgOperand(i: 2, v: Src2);
1624 return &II;
1625 }
1626
1627 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
1628 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
1629 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
1630 APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
1631 Src2: C2->getValueAPF());
1632 return IC.replaceInstUsesWith(I&: II,
1633 V: ConstantFP::get(Ty: II.getType(), V: Result));
1634 }
1635 }
1636 }
1637
1638 if (!ST->hasMed3_16())
1639 break;
1640
1641 // Repeat floating-point width reduction done for minnum/maxnum.
1642 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1643 if (Value *X = matchFPExtFromF16(Arg: Src0)) {
1644 if (Value *Y = matchFPExtFromF16(Arg: Src1)) {
1645 if (Value *Z = matchFPExtFromF16(Arg: Src2)) {
1646 Value *NewCall = IC.Builder.CreateIntrinsic(
1647 ID: IID, OverloadTypes: {X->getType()}, Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
1648 return new FPExtInst(NewCall, II.getType());
1649 }
1650 }
1651 }
1652
1653 break;
1654 }
1655 case Intrinsic::amdgcn_icmp:
1656 case Intrinsic::amdgcn_fcmp: {
1657 const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
1658 // Guard against invalid arguments.
1659 int64_t CCVal = CC->getZExtValue();
1660 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1661 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1662 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1663 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1664 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
1665 break;
1666
1667 Value *Src0 = II.getArgOperand(i: 0);
1668 Value *Src1 = II.getArgOperand(i: 1);
1669
1670 if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
1671 if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
1672 Constant *CCmp = ConstantFoldCompareInstOperands(
1673 Predicate: (ICmpInst::Predicate)CCVal, LHS: CSrc0, RHS: CSrc1, DL);
1674 if (CCmp && CCmp->isNullValue()) {
1675 return IC.replaceInstUsesWith(
1676 I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
1677 }
1678
1679 // The result of V_ICMP/V_FCMP assembly instructions (which this
1680 // intrinsic exposes) is one bit per thread, masked with the EXEC
1681 // register (which contains the bitmask of live threads). So a
1682 // comparison that always returns true is the same as a read of the
1683 // EXEC register. ballot(true) reads EXEC at the wave-size width, so
1684 // zext/trunc the result to the intrinsic's return type.
1685 Type *WaveTy = IC.Builder.getIntNTy(N: ST->getWavefrontSize());
1686 Value *Ballot = IC.Builder.CreateIntrinsic(
1687 ID: Intrinsic::amdgcn_ballot, OverloadTypes: WaveTy, Args: IC.Builder.getTrue());
1688 Value *Result = IC.Builder.CreateZExtOrTrunc(V: Ballot, DestTy: II.getType());
1689 return IC.replaceInstUsesWith(I&: II, V: Result);
1690 }
1691
1692 // Canonicalize constants to RHS.
1693 CmpInst::Predicate SwapPred =
1694 CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
1695 II.setArgOperand(i: 0, v: Src1);
1696 II.setArgOperand(i: 1, v: Src0);
1697 II.setArgOperand(
1698 i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
1699 return &II;
1700 }
1701
1702 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1703 break;
1704
1705 // Canonicalize compare eq with true value to compare != 0
1706 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1707 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1708 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1709 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1710 Value *ExtSrc;
1711 if (CCVal == CmpInst::ICMP_EQ &&
1712 ((match(V: Src1, P: PatternMatch::m_One()) &&
1713 match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
1714 (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
1715 match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
1716 ExtSrc->getType()->isIntegerTy(BitWidth: 1)) {
1717 IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
1718 IC.replaceOperand(I&: II, OpNum: 2,
1719 V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
1720 return &II;
1721 }
1722
1723 CmpPredicate SrcPred;
1724 Value *SrcLHS;
1725 Value *SrcRHS;
1726
1727 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1728 // intrinsic. The typical use is a wave vote function in the library, which
1729 // will be fed from a user code condition compared with 0. Fold in the
1730 // redundant compare.
1731
1732 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1733 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1734 //
1735 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1736 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1737 if (match(V: Src1, P: PatternMatch::m_Zero()) &&
1738 match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
1739 Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
1740 R: PatternMatch::m_Value(V&: SrcRHS))))) {
1741 if (CCVal == CmpInst::ICMP_EQ)
1742 SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
1743
1744 Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
1745 ? Intrinsic::amdgcn_fcmp
1746 : Intrinsic::amdgcn_icmp;
1747
1748 Type *Ty = SrcLHS->getType();
1749 if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
1750 // Promote to next legal integer type.
1751 unsigned Width = CmpType->getBitWidth();
1752 unsigned NewWidth = Width;
1753
1754 // Don't do anything for i1 comparisons.
1755 if (Width == 1)
1756 break;
1757
1758 if (Width <= 16)
1759 NewWidth = 16;
1760 else if (Width <= 32)
1761 NewWidth = 32;
1762 else if (Width <= 64)
1763 NewWidth = 64;
1764 else
1765 break; // Can't handle this.
1766
1767 if (Width != NewWidth) {
1768 IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
1769 if (CmpInst::isSigned(Pred: SrcPred)) {
1770 SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
1771 SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
1772 } else {
1773 SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
1774 SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
1775 }
1776 }
1777 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1778 break;
1779
1780 Value *Args[] = {SrcLHS, SrcRHS,
1781 ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
1782 Value *NewCall = IC.Builder.CreateIntrinsic(
1783 ID: NewIID, OverloadTypes: {II.getType(), SrcLHS->getType()}, Args);
1784 NewCall->takeName(V: &II);
1785 return IC.replaceInstUsesWith(I&: II, V: NewCall);
1786 }
1787
1788 break;
1789 }
1790 case Intrinsic::amdgcn_mbcnt_hi:
1791 // exec_hi is all 0, so this is just a copy.
1792 if (ST->isWave32())
1793 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
1794 [[fallthrough]];
1795 case Intrinsic::amdgcn_mbcnt_lo: {
1796 ConstantRange AccRange =
1797 computeConstantRange(V: II.getArgOperand(i: 1),
1798 /*ForSigned=*/false, SQ: IC.getSimplifyQuery());
1799 if (AccRange.isFullSet())
1800 return nullptr;
1801
1802 // TODO: Can raise lower bound by inspecting first argument.
1803 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1804 ConstantRange ComputedRange = AccRange.add(Other: MbcntRange);
1805 if (ComputedRange.isFullSet())
1806 return nullptr;
1807
1808 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1809 ComputedRange = ComputedRange.intersectWith(CR: *ExistingRange);
1810 if (ComputedRange == *ExistingRange)
1811 return nullptr;
1812 }
1813
1814 II.addRangeRetAttr(CR: ComputedRange);
1815 return nullptr;
1816 }
1817 case Intrinsic::amdgcn_ballot: {
1818 Value *Arg = II.getArgOperand(i: 0);
1819 if (isa<PoisonValue>(Val: Arg))
1820 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1821
1822 if (auto *Src = dyn_cast<ConstantInt>(Val: Arg)) {
1823 if (Src->isZero()) {
1824 // amdgcn.ballot(i1 0) is zero.
1825 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1826 }
1827 }
1828 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1829 // %b64 = call i64 ballot.i64(...)
1830 // =>
1831 // %b32 = call i32 ballot.i32(...)
1832 // %b64 = zext i32 %b32 to i64
1833 Value *Call = IC.Builder.CreateZExt(
1834 V: IC.Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot,
1835 OverloadTypes: {IC.Builder.getInt32Ty()},
1836 Args: {II.getArgOperand(i: 0)}),
1837 DestTy: II.getType());
1838 Call->takeName(V: &II);
1839 return IC.replaceInstUsesWith(I&: II, V: Call);
1840 }
1841 break;
1842 }
1843 case Intrinsic::amdgcn_wavefrontsize: {
1844 if (ST->isWaveSizeKnown())
1845 return IC.replaceInstUsesWith(
1846 I&: II, V: ConstantInt::get(Ty: II.getType(), V: ST->getWavefrontSize()));
1847 break;
1848 }
1849 case Intrinsic::amdgcn_wqm_vote: {
1850 // wqm_vote is identity when the argument is constant.
1851 if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
1852 break;
1853
1854 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
1855 }
1856 case Intrinsic::amdgcn_kill: {
1857 const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1858 if (!C || !C->getZExtValue())
1859 break;
1860
1861 // amdgcn.kill(i1 1) is a no-op
1862 return IC.eraseInstFromFunction(I&: II);
1863 }
1864 case Intrinsic::amdgcn_s_sendmsg:
1865 case Intrinsic::amdgcn_s_sendmsghalt: {
1866 // The second operand is copied to m0, but is only actually used for
1867 // certain message types. For message types that are known to not use m0,
1868 // fold it to poison.
1869 using namespace AMDGPU::SendMsg;
1870
1871 Value *M0Val = II.getArgOperand(i: 1);
1872 if (isa<PoisonValue>(Val: M0Val))
1873 break;
1874
1875 auto *MsgImm = cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1876 uint16_t MsgId, OpId, StreamId;
1877 decodeMsg(Val: MsgImm->getZExtValue(), MsgId, OpId, StreamId, STI: *ST);
1878
1879 if (!msgDoesNotUseM0(MsgId, STI: *ST))
1880 break;
1881
1882 // Drop UB-implying attributes since we're replacing with poison.
1883 II.dropUBImplyingAttrsAndMetadata();
1884 IC.replaceOperand(I&: II, OpNum: 1, V: PoisonValue::get(T: M0Val->getType()));
1885 return nullptr;
1886 }
1887 case Intrinsic::amdgcn_update_dpp: {
1888 Value *Old = II.getArgOperand(i: 0);
1889
1890 auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
1891 auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
1892 auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
1893 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1894 BM->getZExtValue() != 0xF || isa<PoisonValue>(Val: Old))
1895 break;
1896
1897 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1898 return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: Old->getType()));
1899 }
1900 case Intrinsic::amdgcn_permlane16:
1901 case Intrinsic::amdgcn_permlane16_var:
1902 case Intrinsic::amdgcn_permlanex16:
1903 case Intrinsic::amdgcn_permlanex16_var: {
1904 // Discard vdst_in if it's not going to be read.
1905 Value *VDstIn = II.getArgOperand(i: 0);
1906 if (isa<PoisonValue>(Val: VDstIn))
1907 break;
1908
1909 // FetchInvalid operand idx.
1910 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1911 IID == Intrinsic::amdgcn_permlanex16)
1912 ? 4 /* for permlane16 and permlanex16 */
1913 : 3; /* for permlane16_var and permlanex16_var */
1914
1915 // BoundCtrl operand idx.
1916 // For permlane16 and permlanex16 it should be 5
1917 // For Permlane16_var and permlanex16_var it should be 4
1918 unsigned int BcIdx = FiIdx + 1;
1919
1920 ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1921 ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1922 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1923 break;
1924
1925 return IC.replaceOperand(I&: II, OpNum: 0, V: PoisonValue::get(T: VDstIn->getType()));
1926 }
1927 case Intrinsic::amdgcn_wave_shuffle:
1928 return tryOptimizeShufflePattern(IC, II, ST: *ST);
1929 case Intrinsic::amdgcn_permlane64:
1930 case Intrinsic::amdgcn_readfirstlane:
1931 case Intrinsic::amdgcn_readlane:
1932 case Intrinsic::amdgcn_ds_bpermute: {
1933 // If the data argument is uniform these intrinsics return it unchanged.
1934 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1935 const Use &Src = II.getArgOperandUse(i: SrcIdx);
1936 if (isTriviallyUniform(U: Src))
1937 return IC.replaceInstUsesWith(I&: II, V: Src.get());
1938
1939 if (IID == Intrinsic::amdgcn_readlane &&
1940 simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1941 return &II;
1942
1943 // If the lane argument of bpermute is uniform, change it to readlane. This
1944 // generates better code and can enable further optimizations because
1945 // readlane is AlwaysUniform.
1946 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1947 const Use &Lane = II.getArgOperandUse(i: 0);
1948 if (isTriviallyUniform(U: Lane)) {
1949 Value *NewLane = IC.Builder.CreateLShr(LHS: Lane, RHS: 2);
1950 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1951 M: II.getModule(), id: Intrinsic::amdgcn_readlane, OverloadTys: II.getType());
1952 II.setCalledFunction(NewDecl);
1953 II.setOperand(i_nocapture: 0, Val_nocapture: Src);
1954 II.setOperand(i_nocapture: 1, Val_nocapture: NewLane);
1955 return &II;
1956 }
1957 }
1958
1959 if (IID == Intrinsic::amdgcn_ds_bpermute)
1960 return tryOptimizeShufflePattern(IC, II, ST: *ST);
1961
1962 if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1963 return Res;
1964
1965 return std::nullopt;
1966 }
1967 case Intrinsic::amdgcn_writelane: {
1968 // TODO: Fold bitcast like readlane.
1969 if (simplifyDemandedLaneMaskArg(IC, II, LaneArgIdx: 1))
1970 return &II;
1971 return std::nullopt;
1972 }
1973 case Intrinsic::amdgcn_trig_preop: {
1974 // The intrinsic is declared with name mangling, but currently the
1975 // instruction only exists for f64
1976 if (!II.getType()->isDoubleTy())
1977 break;
1978
1979 Value *Src = II.getArgOperand(i: 0);
1980 Value *Segment = II.getArgOperand(i: 1);
1981 if (isa<PoisonValue>(Val: Src) || isa<PoisonValue>(Val: Segment))
1982 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
1983
1984 if (isa<UndefValue>(Val: Segment))
1985 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
1986
1987 // Sign bit is not used.
1988 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Val: Src);
1989 if (StrippedSign != Src)
1990 return IC.replaceOperand(I&: II, OpNum: 0, V: StrippedSign);
1991
1992 if (II.isStrictFP())
1993 break;
1994
1995 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Val: Src);
1996 if (!CSrc && !isa<UndefValue>(Val: Src))
1997 break;
1998
1999 // The instruction ignores special cases, and literally just extracts the
2000 // exponents. Fold undef to nan, and index the table as normal.
2001 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
2002 : APFloat::getQNaN(Sem: II.getType()->getFltSemantics())
2003 .bitcastToAPInt();
2004
2005 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Val: Segment);
2006 if (!Cseg) {
2007 if (isa<UndefValue>(Val: Src))
2008 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
2009 break;
2010 }
2011
2012 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(numBits: 11, bitPosition: 52);
2013 unsigned SegmentVal = Cseg->getValue().trunc(width: 5).getZExtValue();
2014 unsigned Shift = SegmentVal * 53;
2015 if (Exponent > 1077)
2016 Shift += Exponent - 1077;
2017
2018 // 2.0/PI table.
2019 static const uint32_t TwoByPi[] = {
2020 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
2021 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
2022 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
2023 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
2024 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
2025 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
2026 0x56033046};
2027
2028 // Return 0 for outbound segment (hardware behavior).
2029 unsigned Idx = Shift >> 5;
2030 if (Idx + 2 >= std::size(TwoByPi)) {
2031 APFloat Zero = APFloat::getZero(Sem: II.getType()->getFltSemantics());
2032 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: II.getType(), V: Zero));
2033 }
2034
2035 unsigned BShift = Shift & 0x1f;
2036 uint64_t Thi = Make_64(High: TwoByPi[Idx], Low: TwoByPi[Idx + 1]);
2037 uint64_t Tlo = Make_64(High: TwoByPi[Idx + 2], Low: 0);
2038 if (BShift)
2039 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2040 Thi = Thi >> 11;
2041 APFloat Result = APFloat((double)Thi);
2042
2043 int Scale = -53 - Shift;
2044 if (Exponent >= 1968)
2045 Scale += 128;
2046
2047 Result = scalbn(X: Result, Exp: Scale, RM: RoundingMode::NearestTiesToEven);
2048 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Ty: Src->getType(), V: Result));
2049 }
2050 case Intrinsic::amdgcn_fmul_legacy: {
2051 Value *Op0 = II.getArgOperand(i: 0);
2052 Value *Op1 = II.getArgOperand(i: 1);
2053
2054 for (Value *Src : {Op0, Op1}) {
2055 if (isa<PoisonValue>(Val: Src))
2056 return IC.replaceInstUsesWith(I&: II, V: Src);
2057 }
2058
2059 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2060 // infinity, gives +0.0.
2061 // TODO: Move to InstSimplify?
2062 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
2063 match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
2064 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
2065
2066 // If we can prove we don't have one of the special cases then we can use a
2067 // normal fmul instruction instead.
2068 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
2069 auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
2070 FMul->takeName(V: &II);
2071 return IC.replaceInstUsesWith(I&: II, V: FMul);
2072 }
2073 break;
2074 }
2075 case Intrinsic::amdgcn_fma_legacy: {
2076 Value *Op0 = II.getArgOperand(i: 0);
2077 Value *Op1 = II.getArgOperand(i: 1);
2078 Value *Op2 = II.getArgOperand(i: 2);
2079
2080 for (Value *Src : {Op0, Op1, Op2}) {
2081 if (isa<PoisonValue>(Val: Src))
2082 return IC.replaceInstUsesWith(I&: II, V: Src);
2083 }
2084
2085 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2086 // infinity, gives +0.0.
2087 // TODO: Move to InstSimplify?
2088 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
2089 match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
2090 // It's tempting to just return Op2 here, but that would give the wrong
2091 // result if Op2 was -0.0.
2092 auto *Zero = ConstantFP::getZero(Ty: II.getType());
2093 auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
2094 FAdd->takeName(V: &II);
2095 return IC.replaceInstUsesWith(I&: II, V: FAdd);
2096 }
2097
2098 // If we can prove we don't have one of the special cases then we can use a
2099 // normal fma instead.
2100 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
2101 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2102 M: II.getModule(), id: Intrinsic::fma, OverloadTys: II.getType()));
2103 return &II;
2104 }
2105 break;
2106 }
2107 case Intrinsic::amdgcn_is_shared:
2108 case Intrinsic::amdgcn_is_private: {
2109 Value *Src = II.getArgOperand(i: 0);
2110 if (isa<PoisonValue>(Val: Src))
2111 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
2112 if (isa<UndefValue>(Val: Src))
2113 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
2114
2115 if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
2116 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
2117 break;
2118 }
2119 case Intrinsic::amdgcn_make_buffer_rsrc: {
2120 Value *Src = II.getArgOperand(i: 0);
2121 if (isa<PoisonValue>(Val: Src))
2122 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
2123 return std::nullopt;
2124 }
2125 case Intrinsic::amdgcn_raw_buffer_store_format:
2126 case Intrinsic::amdgcn_struct_buffer_store_format:
2127 case Intrinsic::amdgcn_raw_tbuffer_store:
2128 case Intrinsic::amdgcn_struct_tbuffer_store:
2129 case Intrinsic::amdgcn_image_store_1d:
2130 case Intrinsic::amdgcn_image_store_1darray:
2131 case Intrinsic::amdgcn_image_store_2d:
2132 case Intrinsic::amdgcn_image_store_2darray:
2133 case Intrinsic::amdgcn_image_store_2darraymsaa:
2134 case Intrinsic::amdgcn_image_store_2dmsaa:
2135 case Intrinsic::amdgcn_image_store_3d:
2136 case Intrinsic::amdgcn_image_store_cube:
2137 case Intrinsic::amdgcn_image_store_mip_1d:
2138 case Intrinsic::amdgcn_image_store_mip_1darray:
2139 case Intrinsic::amdgcn_image_store_mip_2d:
2140 case Intrinsic::amdgcn_image_store_mip_2darray:
2141 case Intrinsic::amdgcn_image_store_mip_3d:
2142 case Intrinsic::amdgcn_image_store_mip_cube: {
2143 if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
2144 break;
2145
2146 APInt DemandedElts;
2147 if (ST->hasDefaultComponentBroadcast())
2148 DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
2149 else if (ST->hasDefaultComponentZero())
2150 DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
2151 else
2152 break;
2153
2154 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID()) ? 1 : -1;
2155 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2156 IsLoad: false)) {
2157 return IC.eraseInstFromFunction(I&: II);
2158 }
2159
2160 break;
2161 }
2162 case Intrinsic::amdgcn_prng_b32: {
2163 auto *Src = II.getArgOperand(i: 0);
2164 if (isa<UndefValue>(Val: Src)) {
2165 return IC.replaceInstUsesWith(I&: II, V: Src);
2166 }
2167 return std::nullopt;
2168 }
2169 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2170 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2171 Value *Src0 = II.getArgOperand(i: 0);
2172 Value *Src1 = II.getArgOperand(i: 1);
2173 uint64_t CBSZ = cast<ConstantInt>(Val: II.getArgOperand(i: 3))->getZExtValue();
2174 uint64_t BLGP = cast<ConstantInt>(Val: II.getArgOperand(i: 4))->getZExtValue();
2175 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
2176 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
2177
2178 auto getFormatNumRegs = [](unsigned FormatVal) {
2179 switch (FormatVal) {
2180 case AMDGPU::MFMAScaleFormats::FP6_E2M3:
2181 case AMDGPU::MFMAScaleFormats::FP6_E3M2:
2182 return 6u;
2183 case AMDGPU::MFMAScaleFormats::FP4_E2M1:
2184 return 4u;
2185 case AMDGPU::MFMAScaleFormats::FP8_E4M3:
2186 case AMDGPU::MFMAScaleFormats::FP8_E5M2:
2187 return 8u;
2188 default:
2189 llvm_unreachable("invalid format value");
2190 }
2191 };
2192
2193 bool MadeChange = false;
2194 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2195 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2196
2197 // Depending on the used format, fewer registers are required so shrink the
2198 // vector type.
2199 if (Src0Ty->getNumElements() > Src0NumElts) {
2200 Src0 = IC.Builder.CreateExtractVector(
2201 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
2202 Idx: uint64_t(0));
2203 MadeChange = true;
2204 }
2205
2206 if (Src1Ty->getNumElements() > Src1NumElts) {
2207 Src1 = IC.Builder.CreateExtractVector(
2208 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
2209 Idx: uint64_t(0));
2210 MadeChange = true;
2211 }
2212
2213 if (!MadeChange)
2214 return std::nullopt;
2215
2216 SmallVector<Value *, 10> Args(II.args());
2217 Args[0] = Src0;
2218 Args[1] = Src1;
2219
2220 Value *NewII = IC.Builder.CreateIntrinsic(
2221 ID: IID, OverloadTypes: {Src0->getType(), Src1->getType()}, Args, FMFSource: &II);
2222 NewII->takeName(V: &II);
2223 return IC.replaceInstUsesWith(I&: II, V: NewII);
2224 }
2225 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2226 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2227 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2228 Value *Src0 = II.getArgOperand(i: 1);
2229 Value *Src1 = II.getArgOperand(i: 3);
2230 unsigned FmtA = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
2231 uint64_t FmtB = cast<ConstantInt>(Val: II.getArgOperand(i: 2))->getZExtValue();
2232 auto *Src0Ty = cast<FixedVectorType>(Val: Src0->getType());
2233 auto *Src1Ty = cast<FixedVectorType>(Val: Src1->getType());
2234
2235 bool MadeChange = false;
2236 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtA);
2237 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt: FmtB);
2238
2239 // Depending on the used format, fewer registers are required so shrink the
2240 // vector type.
2241 if (Src0Ty->getNumElements() > Src0NumElts) {
2242 Src0 = IC.Builder.CreateExtractVector(
2243 DstType: FixedVectorType::get(ElementType: Src0Ty->getElementType(), NumElts: Src0NumElts), SrcVec: Src0,
2244 Idx: IC.Builder.getInt64(C: 0));
2245 MadeChange = true;
2246 }
2247
2248 if (Src1Ty->getNumElements() > Src1NumElts) {
2249 Src1 = IC.Builder.CreateExtractVector(
2250 DstType: FixedVectorType::get(ElementType: Src1Ty->getElementType(), NumElts: Src1NumElts), SrcVec: Src1,
2251 Idx: IC.Builder.getInt64(C: 0));
2252 MadeChange = true;
2253 }
2254
2255 if (!MadeChange)
2256 return std::nullopt;
2257
2258 SmallVector<Value *, 13> Args(II.args());
2259 Args[1] = Src0;
2260 Args[3] = Src1;
2261
2262 Value *NewII = IC.Builder.CreateIntrinsic(
2263 ID: IID, OverloadTypes: {II.getArgOperand(i: 5)->getType(), Src0->getType(), Src1->getType()},
2264 Args, FMFSource: &II);
2265 NewII->takeName(V: &II);
2266 return IC.replaceInstUsesWith(I&: II, V: NewII);
2267 }
2268 }
2269 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2270 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
2271 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2272 }
2273 return std::nullopt;
2274}
2275
2276/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2277///
2278/// The result of simplifying amdgcn image and buffer store intrinsics is updating
2279/// definitions of the intrinsics vector argument, not Uses of the result like
2280/// image and buffer loads.
2281/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
2282/// struct returns.
2283static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
2284 IntrinsicInst &II,
2285 APInt DemandedElts,
2286 int DMaskIdx, bool IsLoad) {
2287
2288 auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
2289 : II.getOperand(i_nocapture: 0)->getType());
2290 unsigned VWidth = IIVTy->getNumElements();
2291 if (VWidth == 1)
2292 return nullptr;
2293 Type *EltTy = IIVTy->getElementType();
2294
2295 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
2296 IC.Builder.SetInsertPoint(&II);
2297
2298 // Assume the arguments are unchanged and later override them, if needed.
2299 SmallVector<Value *, 16> Args(II.args());
2300
2301 if (DMaskIdx < 0) {
2302 // Buffer case.
2303
2304 const unsigned ActiveBits = DemandedElts.getActiveBits();
2305 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2306
2307 // Start assuming the prefix of elements is demanded, but possibly clear
2308 // some other bits if there are trailing zeros (unused components at front)
2309 // and update offset.
2310 DemandedElts = (1 << ActiveBits) - 1;
2311
2312 if (UnusedComponentsAtFront > 0) {
2313 static const unsigned InvalidOffsetIdx = 0xf;
2314
2315 unsigned OffsetIdx;
2316 switch (II.getIntrinsicID()) {
2317 case Intrinsic::amdgcn_raw_buffer_load:
2318 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2319 OffsetIdx = 1;
2320 break;
2321 case Intrinsic::amdgcn_s_buffer_load:
2322 // If resulting type is vec3, there is no point in trimming the
2323 // load with updated offset, as the vec3 would most likely be widened to
2324 // vec4 anyway during lowering.
2325 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2326 OffsetIdx = InvalidOffsetIdx;
2327 else
2328 OffsetIdx = 1;
2329 break;
2330 case Intrinsic::amdgcn_struct_buffer_load:
2331 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2332 OffsetIdx = 2;
2333 break;
2334 default:
2335 // TODO: handle tbuffer* intrinsics.
2336 OffsetIdx = InvalidOffsetIdx;
2337 break;
2338 }
2339
2340 if (OffsetIdx != InvalidOffsetIdx) {
2341 // Clear demanded bits and update the offset.
2342 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2343 auto *Offset = Args[OffsetIdx];
2344 unsigned SingleComponentSizeInBits =
2345 IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
2346 unsigned OffsetAdd =
2347 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2348 auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
2349 Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
2350 }
2351 }
2352 } else {
2353 // Image case.
2354
2355 ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
2356 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2357
2358 // dmask 0 has special semantics, do not simplify.
2359 if (DMaskVal == 0)
2360 return nullptr;
2361
2362 // Mask off values that are undefined because the dmask doesn't cover them
2363 DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;
2364
2365 unsigned NewDMaskVal = 0;
2366 unsigned OrigLdStIdx = 0;
2367 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2368 const unsigned Bit = 1 << SrcIdx;
2369 if (!!(DMaskVal & Bit)) {
2370 if (!!DemandedElts[OrigLdStIdx])
2371 NewDMaskVal |= Bit;
2372 OrigLdStIdx++;
2373 }
2374 }
2375
2376 if (DMaskVal != NewDMaskVal)
2377 Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
2378 }
2379
2380 unsigned NewNumElts = DemandedElts.popcount();
2381 if (!NewNumElts)
2382 return PoisonValue::get(T: IIVTy);
2383
2384 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2385 if (DMaskIdx >= 0)
2386 II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
2387 return nullptr;
2388 }
2389
2390 // Validate function argument and return types, extracting overloaded types
2391 // along the way.
2392 SmallVector<Type *, 6> OverloadTys;
2393 if (!Intrinsic::isSignatureValid(F: II.getCalledFunction(), OverloadTys))
2394 return nullptr;
2395
2396 Type *NewTy =
2397 (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
2398 OverloadTys[0] = NewTy;
2399
2400 if (!IsLoad) {
2401 SmallVector<int, 8> EltMask;
2402 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2403 if (DemandedElts[OrigStoreIdx])
2404 EltMask.push_back(Elt: OrigStoreIdx);
2405
2406 if (NewNumElts == 1)
2407 Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
2408 else
2409 Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
2410 }
2411
2412 CallInst *NewCall = IC.Builder.CreateIntrinsicWithoutFolding(
2413 ID: II.getIntrinsicID(), OverloadTypes: OverloadTys, Args);
2414 NewCall->takeName(V: &II);
2415 NewCall->copyMetadata(SrcInst: II);
2416 AttributeList OldAttrList = II.getAttributes();
2417 NewCall->setAttributes(OldAttrList);
2418
2419 if (IsLoad) {
2420 if (NewNumElts == 1) {
2421 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
2422 Idx: DemandedElts.countr_zero());
2423 }
2424
2425 SmallVector<int, 8> EltMask;
2426 unsigned NewLoadIdx = 0;
2427 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2428 if (!!DemandedElts[OrigLoadIdx])
2429 EltMask.push_back(Elt: NewLoadIdx++);
2430 else
2431 EltMask.push_back(Elt: NewNumElts);
2432 }
2433
2434 auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
2435
2436 return Shuffle;
2437 }
2438
2439 return NewCall;
2440}
2441
2442Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
2443 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2444 APInt &UndefElts) const {
2445 auto *VT = dyn_cast<FixedVectorType>(Val: II.getType());
2446 if (!VT)
2447 return nullptr;
2448
2449 const unsigned FirstElt = DemandedElts.countr_zero();
2450 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2451 const unsigned MaskLen = LastElt - FirstElt + 1;
2452
2453 unsigned OldNumElts = VT->getNumElements();
2454 if (MaskLen == OldNumElts && MaskLen != 1)
2455 return nullptr;
2456
2457 Type *EltTy = VT->getElementType();
2458 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: MaskLen);
2459
2460 // Theoretically we should support these intrinsics for any legal type. Avoid
2461 // introducing cases that aren't direct register types like v3i16.
2462 if (!isTypeLegal(Ty: NewVT))
2463 return nullptr;
2464
2465 Value *Src = II.getArgOperand(i: 0);
2466
2467 // Make sure convergence tokens are preserved.
2468 // TODO: CreateIntrinsic should allow directly copying bundles
2469 SmallVector<OperandBundleDef, 2> OpBundles;
2470 II.getOperandBundlesAsDefs(Defs&: OpBundles);
2471
2472 Module *M = IC.Builder.GetInsertBlock()->getModule();
2473 Function *Remangled =
2474 Intrinsic::getOrInsertDeclaration(M, id: II.getIntrinsicID(), OverloadTys: {NewVT});
2475
2476 if (MaskLen == 1) {
2477 Value *Extract = IC.Builder.CreateExtractElement(Vec: Src, Idx: FirstElt);
2478
2479 // TODO: Preserve callsite attributes?
2480 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2481
2482 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: II.getType()),
2483 NewElt: NewCall, Idx: FirstElt);
2484 }
2485
2486 SmallVector<int> ExtractMask(MaskLen, -1);
2487 for (unsigned I = 0; I != MaskLen; ++I) {
2488 if (DemandedElts[FirstElt + I])
2489 ExtractMask[I] = FirstElt + I;
2490 }
2491
2492 Value *Extract = IC.Builder.CreateShuffleVector(V: Src, Mask: ExtractMask);
2493
2494 // TODO: Preserve callsite attributes?
2495 CallInst *NewCall = IC.Builder.CreateCall(Callee: Remangled, Args: {Extract}, OpBundles);
2496
2497 SmallVector<int> InsertMask(OldNumElts, -1);
2498 for (unsigned I = 0; I != MaskLen; ++I) {
2499 if (DemandedElts[FirstElt + I])
2500 InsertMask[FirstElt + I] = I;
2501 }
2502
2503 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2504 // call behind.
2505 return IC.Builder.CreateShuffleVector(V: NewCall, Mask: InsertMask);
2506}
2507
2508std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
2509 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2510 APInt &UndefElts2, APInt &UndefElts3,
2511 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2512 SimplifyAndSetOp) const {
2513 switch (II.getIntrinsicID()) {
2514 case Intrinsic::amdgcn_readfirstlane:
2515 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2516 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2517 case Intrinsic::amdgcn_raw_buffer_load:
2518 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2519 case Intrinsic::amdgcn_raw_buffer_load_format:
2520 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2521 case Intrinsic::amdgcn_raw_tbuffer_load:
2522 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2523 case Intrinsic::amdgcn_s_buffer_load:
2524 case Intrinsic::amdgcn_struct_buffer_load:
2525 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2526 case Intrinsic::amdgcn_struct_buffer_load_format:
2527 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2528 case Intrinsic::amdgcn_struct_tbuffer_load:
2529 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2530 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2531 default: {
2532 if (getAMDGPUImageDMaskIntrinsic(Intr: II.getIntrinsicID())) {
2533 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
2534 }
2535 break;
2536 }
2537 }
2538 return std::nullopt;
2539}
2540