//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass simplifies certain intrinsic calls when their arguments are
/// uniform. Its transforms can leave an instruction whose operand was
/// previously recognized as statically uniform no longer recognized as such.
/// However, the semantics of how programs execute don't (and must not, for
/// precisely this reason) care about static uniformity; they only ever care
/// about dynamic uniformity. And every downstream instruction that cares
/// about dynamic uniformity must be convergent (isel will introduce
/// v_readfirstlane for it if its operands can't be proven statically
/// uniform).
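///
/// For illustration, the simplest rewrite this pass performs (hypothetical
/// IR; %uniform stands for any value UniformityInfo proves uniform):
///
///   %r = call i32 @llvm.amdgcn.readlane(i32 %uniform, i32 %lane)
///
/// becomes a plain use of the operand: all uses of %r are replaced with
/// %uniform and the call is erased.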
19//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"

using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;

/// Wrapper for querying uniformity info that first checks locally tracked
/// instructions, since UniformityInfo is not updated for values this pass
/// creates.
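///
/// For example (a hypothetical instance of the ballot fold below): the fold
/// creates
///   %not = xor i1 %cond, true
/// via CreateNot. UniformityInfo has never seen that instruction, so the fold
/// records it in Tracker as uniform, and this wrapper checks Tracker before
/// falling back to UI.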
static bool
isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
                      const ValueMap<const Value *, bool> &Tracker) {
  Value *V = U.get();
  if (auto It = Tracker.find(V); It != Tracker.end())
    return !It->second; // Divergent if marked false.
  return UI.isDivergentUse(U);
}

/// Optimizes certain intrinsic calls if their operands can be proven uniform.
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
                                     const UniformityInfo &UI,
                                     ValueMap<const Value *, bool> &Tracker) {
  Intrinsic::ID IID = II.getIntrinsicID();
  // We deliberately do not simplify readfirstlane with a uniform argument, so
  // that frontends can use it to force a copy to SGPR and thereby prevent the
  // backend from generating unwanted waterfall loops.
  switch (IID) {
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_readlane: {
    Value *Src = II.getArgOperand(0);
    if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
    II.replaceAllUsesWith(Src);
    II.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_ballot: {
    Value *Src = II.getArgOperand(0);
    if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
      return false;
    LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
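
    // For a uniform i1 condition, every active lane computes the same value,
    // so the ballot is either 0 (condition false) or the full active-lane
    // mask (condition true). Comparisons of the ballot against zero therefore
    // collapse to the condition itself or its negation.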
    bool Changed = false;
    for (User *U : make_early_inc_range(II.users())) {
      if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
        Value *Op0 = ICmp->getOperand(0);
        Value *Op1 = ICmp->getOperand(1);
        ICmpInst::Predicate Pred = ICmp->getPredicate();
        Value *OtherOp = Op0 == &II ? Op1 : Op0;

        if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
          // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, true
          Instruction *NotOp =
              BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
          Tracker[NotOp] = true; // NOT preserves uniformity.
          LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
          ICmp->replaceAllUsesWith(NotOp);
          ICmp->eraseFromParent();
          Changed = true;
        } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
          // Case: (icmp ne %ballot, 0) -> %ballot_arg
          LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
                            << *Src << '\n');
          ICmp->replaceAllUsesWith(Src);
          ICmp->eraseFromParent();
          Changed = true;
        }
      }
    }
    // Erase the intrinsic if it has no remaining uses.
    if (II.use_empty())
      II.eraseFromParent();
    return Changed;
  }
  case Intrinsic::amdgcn_wave_shuffle: {
    Use &Val = II.getOperandUse(0);
    Use &Idx = II.getOperandUse(1);

    // As with readlane, if the value is uniform, just propagate it.
    if (!isDivergentUseWithNew(Val, UI, Tracker)) {
      II.replaceAllUsesWith(Val);
      II.eraseFromParent();
      return true;
    }

    // Otherwise, when the index is uniform, this is just a readlane operation.
    if (isDivergentUseWithNew(Idx, UI, Tracker))
      return false;

    // The readlane intrinsic we want to call has exactly the same function
    // signature, so we can rewrite the call in place.
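    //
    // Hypothetical IR, for illustration:
    //   %r = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %uniform_idx)
    //   ; becomes
    //   %r = call i32 @llvm.amdgcn.readlane(i32 %val, i32 %uniform_idx)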
    Module *Mod = II.getModule();
    II.setCalledFunction(Intrinsic::getOrInsertDeclaration(
        Mod, Intrinsic::amdgcn_readlane, {II.getType()}));
    return true;
  }
  default:
    return false;
  }
  return false;
}

/// Iterates over intrinsic calls in the function and tries to optimize them.
static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
  bool IsChanged = false;
  // Tracks uniformity of values this pass creates, which UniformityInfo does
  // not know about.
  ValueMap<const Value *, bool> Tracker;

  for (Instruction &I : make_early_inc_range(instructions(F))) {
    auto *II = dyn_cast<IntrinsicInst>(&I);
    if (!II)
      continue;
    IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
  }
  return IsChanged;
}

PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
  if (!runUniformIntrinsicCombine(F, UI))
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserve<UniformityInfoAnalysis>();
  return PA;
}

namespace {
class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
  static char ID;
  AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
    initializeAMDGPUUniformIntrinsicCombineLegacyPass(
        *PassRegistry::getPassRegistry());
  }

private:
  bool runOnFunction(Function &F) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};
} // namespace

char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
    AMDGPUUniformIntrinsicCombineLegacy::ID;

bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  const UniformityInfo &UI =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  return runUniformIntrinsicCombine(F, UI);
}

INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
                      "AMDGPU Uniform Intrinsic Combine", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
                    "AMDGPU Uniform Intrinsic Combine", false, false)

FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
  return new AMDGPUUniformIntrinsicCombineLegacy();
}