AMDGPUUniformIntrinsicCombine.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp]

1	//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This pass simplifies certain intrinsic calls when the arguments are uniform.
11	/// It's true that this pass has transforms that can lead to a situation where
12	/// some instruction whose operand was previously recognized as statically
13	/// uniform is later on no longer recognized as statically uniform. However, the
14	/// semantics of how programs execute don't (and must not, for this precise
15	/// reason) care about static uniformity, they only ever care about dynamic
16	/// uniformity. And every instruction that's downstream and cares about dynamic
17	/// uniformity must be convergent (and isel will introduce v_readfirstlane for
18	/// them if their operands can't be proven statically uniform).
19	//===----------------------------------------------------------------------===//
20
21	#include "AMDGPU.h"
22	#include "GCNSubtarget.h"
23	#include "llvm/Analysis/DomTreeUpdater.h"
24	#include "llvm/Analysis/LoopInfo.h"
25	#include "llvm/Analysis/ScalarEvolution.h"
26	#include "llvm/Analysis/TargetLibraryInfo.h"
27	#include "llvm/Analysis/UniformityAnalysis.h"
28	#include "llvm/CodeGen/TargetPassConfig.h"
29	#include "llvm/IR/IRBuilder.h"
30	#include "llvm/IR/InstIterator.h"
31	#include "llvm/IR/InstVisitor.h"
32	#include "llvm/IR/IntrinsicsAMDGPU.h"
33	#include "llvm/IR/PatternMatch.h"
34	#include "llvm/InitializePasses.h"
35	#include "llvm/Target/TargetMachine.h"
36	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
37
38	#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
39
40	using namespace llvm;
41	using namespace llvm::AMDGPU;
42	using namespace llvm::PatternMatch;
43
44	/// Wrapper for querying uniformity info that first checks locally tracked
45	/// instructions.
46	static bool
47	isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
48	const ValueMap<const Value , bool*> &Tracker) {
49	Value *V = U.get();
50	if (auto It = Tracker.find(Val: V); It != Tracker.end())
51	return !It ->second; // divergent if marked false
52	return UI.isDivergentUse(U);
53	}
54
55	/// Optimizes uniform intrinsics calls if their operand can be proven uniform.
56	static bool optimizeUniformIntrinsic(IntrinsicInst &II,
57	const UniformityInfo &UI,
58	ValueMap<const Value , bool*> &Tracker) {
59	llvm::Intrinsic::ID IID = II.getIntrinsicID();
60	/// We deliberately do not simplify readfirstlane with a uniform argument, so
61	/// that frontends can use it to force a copy to SGPR and thereby prevent the
62	/// backend from generating unwanted waterfall loops.
63	switch (IID) {
64	case Intrinsic::amdgcn_permlane64:
65	case Intrinsic::amdgcn_readlane: {
66	Value *Src = II.getArgOperand(i: `0`);
67	if (isDivergentUseWithNew(U: II.getOperandUse(i: `0`), UI, Tracker))
68	return false;
69	LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << `'\n'`);
70	II.replaceAllUsesWith(V: Src);
71	II.eraseFromParent();
72	return true;
73	}
74	case Intrinsic::amdgcn_ballot: {
75	Value *Src = II.getArgOperand(i: `0`);
76	if (isDivergentUseWithNew(U: II.getOperandUse(i: `0`), UI, Tracker))
77	return false;
78	LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << `'\n'`);
79
80	bool Changed = false;
81	for (User *U : make_early_inc_range(Range: II.users())) {
82	if (auto *ICmp = dyn_cast<ICmpInst>(Val: U)) {
83	Value *Op0 = ICmp->getOperand(i_nocapture: `0`);
84	Value *Op1 = ICmp->getOperand(i_nocapture: `1`);
85	ICmpInst::Predicate Pred = ICmp->getPredicate();
86	Value *OtherOp = Op0 == &II ? Op1 : Op0;
87
88	if (Pred == ICmpInst::ICMP_EQ && match(V: OtherOp, P: m_Zero())) {
89	// Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
90	Instruction *NotOp =
91	BinaryOperator::CreateNot(Op: Src, Name: "", InsertBefore: ICmp->getIterator());
92	Tracker [NotOp] = true; // NOT preserves uniformity
93	LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << `'\n'`);
94	ICmp->replaceAllUsesWith(V: NotOp);
95	Changed = true;
96	} else if (Pred == ICmpInst::ICMP_NE && match(V: OtherOp, P: m_Zero())) {
97	// Case: (icmp ne %ballot, 0) -> %ballot_arg
98	LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
99	<< *Src << `'\n'`);
100	ICmp->replaceAllUsesWith(V: Src);
101	Changed = true;
102	}
103	}
104	}
105	// Erase the intrinsic if it has no remaining uses.
106	if (II.use_empty())
107	II.eraseFromParent();
108	return Changed;
109	}
110	case Intrinsic::amdgcn_wave_shuffle: {
111	Use &Val = II.getOperandUse(i: `0`);
112	Use &Idx = II.getOperandUse(i: `1`);
113
114	// Like with readlane, if Value is uniform then just propagate it
115	if (!isDivergentUseWithNew(U: Val, UI, Tracker)) {
116	II.replaceAllUsesWith(V: Val);
117	II.eraseFromParent();
118	return true;
119	}
120
121	// Otherwise, when Index is uniform, this is just a readlane operation
122	if (isDivergentUseWithNew(U: Idx, UI, Tracker))
123	return false;
124
125	// The readlane intrinsic we want to call has the exact same function
126	// signature, so we can quickly modify the instruction in-place
127	Module *Mod = II.getModule();
128	II.setCalledFunction(Intrinsic::getOrInsertDeclaration(
129	M: Mod, id: Intrinsic::amdgcn_readlane, Tys: II.getType()));
130	return true;
131	}
132	default:
133	return false;
134	}
135	return false;
136	}
137
138	/// Iterates over intrinsic calls in the Function to optimize.
139	static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
140	bool IsChanged = false;
141	ValueMap<const Value , bool*> Tracker;
142
143	for (Instruction &I : make_early_inc_range(Range: instructions(F))) {
144	auto *II = dyn_cast<IntrinsicInst>(Val: &I);
145	if (!II)
146	continue;
147	IsChanged \|= optimizeUniformIntrinsic(II&: *II, UI, Tracker);
148	}
149	return IsChanged;
150	}
151
152	PreservedAnalyses
153	AMDGPUUniformIntrinsicCombinePass::run(Function &F,
154	FunctionAnalysisManager &AM) {
155	const auto &UI = AM.getResult<UniformityInfoAnalysis>(IR&: F);
156	if (!runUniformIntrinsicCombine(F, UI))
157	return PreservedAnalyses::all();
158
159	PreservedAnalyses PA;
160	PA.preserve<UniformityInfoAnalysis>();
161	return PA;
162	}
163
164	namespace {
165	class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
166	public:
167	static char ID;
168	AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass (ID) {}
169
170	private:
171	bool runOnFunction(Function &F) override;
172	void getAnalysisUsage(AnalysisUsage &AU) const override {
173	AU.setPreservesCFG();
174	AU.addRequired<UniformityInfoWrapperPass>();
175	AU.addRequired<TargetPassConfig>();
176	}
177	};
178	} // namespace
179
180	char AMDGPUUniformIntrinsicCombineLegacy::ID = `0`;
181	char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
182	AMDGPUUniformIntrinsicCombineLegacy::ID;
183
184	bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
185	if (skipFunction(F))
186	return false;
187	const UniformityInfo &UI =
188	getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
189	return runUniformIntrinsicCombine(F, UI);
190	}
191
192	INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
193	"AMDGPU Uniform Intrinsic Combine", false, false)
194	INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
195	INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
196	INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
197	"AMDGPU Uniform Intrinsic Combine", false, false)
198
199	FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
200	return new AMDGPUUniformIntrinsicCombineLegacy ();
201	}
202

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp