1//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Lower intrinsics that would otherwise require separate handling in both
10// SelectionDAG and GlobalISel.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPUTargetMachine.h"
16#include "GCNSubtarget.h"
17#include "llvm/IR/IRBuilder.h"
18#include "llvm/IR/IntrinsicInst.h"
19#include "llvm/IR/IntrinsicsAMDGPU.h"
20#include "llvm/InitializePasses.h"
21#include "llvm/Transforms/Utils/BasicBlockUtils.h"
22
23#define DEBUG_TYPE "amdgpu-lower-intrinsics"
24
25using namespace llvm;
26
27namespace {
28
29class AMDGPULowerIntrinsicsImpl {
30public:
31 Module &M;
32 const AMDGPUTargetMachine &TM;
33
34 AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
35 : M(M), TM(TM) {}
36
37 bool run();
38
39private:
40 bool visitBarrier(IntrinsicInst &I);
41};
42
43class AMDGPULowerIntrinsicsLegacy : public ModulePass {
44public:
45 static char ID;
46
47 AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
48
49 bool runOnModule(Module &M) override;
50
51 void getAnalysisUsage(AnalysisUsage &AU) const override {
52 AU.addRequired<TargetPassConfig>();
53 }
54};
55
56template <class T> static void forEachCall(Function &Intrin, T Callback) {
57 for (User *U : make_early_inc_range(Range: Intrin.users())) {
58 if (auto *CI = dyn_cast<IntrinsicInst>(Val: U))
59 Callback(CI);
60 }
61}
62
63} // anonymous namespace
64
65bool AMDGPULowerIntrinsicsImpl::run() {
66 bool Changed = false;
67
68 for (Function &F : M) {
69 switch (F.getIntrinsicID()) {
70 default:
71 continue;
72 case Intrinsic::amdgcn_s_barrier:
73 case Intrinsic::amdgcn_s_barrier_signal:
74 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
75 case Intrinsic::amdgcn_s_barrier_wait:
76 case Intrinsic::amdgcn_s_cluster_barrier:
77 forEachCall(Intrin&: F, Callback: [&](IntrinsicInst *II) { Changed |= visitBarrier(I&: *II); });
78 break;
79 }
80 }
81
82 return Changed;
83}
84
85// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
86// barrier intrinsics.
87bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
88 assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
89 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
90 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
91 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
92 I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);
93
94 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F: *I.getFunction());
95 bool IsSingleWaveWG = false;
96
97 if (TM.getOptLevel() > CodeGenOptLevel::None)
98 IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(F: *I.getFunction());
99
100 IRBuilder<> B(&I);
101
102 // Lower the s_cluster_barrier intrinsic first. There is no corresponding
103 // hardware instruction in any subtarget.
104 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
105 // The default cluster barrier expects one signal per workgroup. So we need
106 // a workgroup barrier first.
107 if (IsSingleWaveWG) {
108 B.CreateIntrinsicWithoutFolding(RetTy: B.getVoidTy(),
109 ID: Intrinsic::amdgcn_wave_barrier, Args: {})
110 ->copyMetadata(SrcInst: I);
111 } else {
112 Value *BarrierID_32 = B.getInt32(C: AMDGPU::Barrier::WORKGROUP);
113 Value *BarrierID_16 = B.getInt16(C: AMDGPU::Barrier::WORKGROUP);
114 CallInst *IsFirst = B.CreateIntrinsicWithoutFolding(
115 RetTy: B.getInt1Ty(), ID: Intrinsic::amdgcn_s_barrier_signal_isfirst,
116 Args: {BarrierID_32});
117 IsFirst->copyMetadata(SrcInst: I);
118 B.CreateIntrinsicWithoutFolding(
119 RetTy: B.getVoidTy(), ID: Intrinsic::amdgcn_s_barrier_wait, Args: {BarrierID_16})
120 ->copyMetadata(SrcInst: I);
121
122 Instruction *ThenTerm =
123 SplitBlockAndInsertIfThen(Cond: IsFirst, SplitBefore: I.getIterator(), Unreachable: false);
124 B.SetInsertPoint(ThenTerm);
125 }
126
127 // Now we can signal the cluster barrier from a single wave and wait for the
128 // barrier in all waves.
129 Value *BarrierID_32 = B.getInt32(C: AMDGPU::Barrier::CLUSTER);
130 Value *BarrierID_16 = B.getInt16(C: AMDGPU::Barrier::CLUSTER);
131 B.CreateIntrinsicWithoutFolding(
132 RetTy: B.getVoidTy(), ID: Intrinsic::amdgcn_s_barrier_signal, Args: {BarrierID_32})
133 ->copyMetadata(SrcInst: I);
134
135 B.SetInsertPoint(&I);
136 B.CreateIntrinsicWithoutFolding(
137 RetTy: B.getVoidTy(), ID: Intrinsic::amdgcn_s_barrier_wait, Args: {BarrierID_16})
138 ->copyMetadata(SrcInst: I);
139
140 I.eraseFromParent();
141 return true;
142 }
143
144 bool IsWorkgroupScope = false;
145
146 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
147 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
148 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
149 int BarrierID = cast<ConstantInt>(Val: I.getArgOperand(i: 0))->getSExtValue();
150 if (BarrierID == AMDGPU::Barrier::TRAP ||
151 BarrierID == AMDGPU::Barrier::WORKGROUP ||
152 (BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST &&
153 BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST))
154 IsWorkgroupScope = true;
155 } else {
156 assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
157 IsWorkgroupScope = true;
158 }
159
160 if (IsWorkgroupScope && IsSingleWaveWG) {
161 // Down-grade waits, remove split signals.
162 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
163 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
164 B.CreateIntrinsicWithoutFolding(RetTy: B.getVoidTy(),
165 ID: Intrinsic::amdgcn_wave_barrier, Args: {})
166 ->copyMetadata(SrcInst: I);
167 } else if (I.getIntrinsicID() ==
168 Intrinsic::amdgcn_s_barrier_signal_isfirst) {
169 // If we're the only wave of the workgroup, we're always first.
170 I.replaceAllUsesWith(V: B.getInt1(V: true));
171 }
172 I.eraseFromParent();
173 return true;
174 }
175
176 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
177 ST.hasSplitBarriers()) {
178 // Lower to split barriers.
179 Value *BarrierID_32 = B.getInt32(C: AMDGPU::Barrier::WORKGROUP);
180 Value *BarrierID_16 = B.getInt16(C: AMDGPU::Barrier::WORKGROUP);
181 B.CreateIntrinsicWithoutFolding(
182 RetTy: B.getVoidTy(), ID: Intrinsic::amdgcn_s_barrier_signal, Args: {BarrierID_32})
183 ->copyMetadata(SrcInst: I);
184 B.CreateIntrinsicWithoutFolding(
185 RetTy: B.getVoidTy(), ID: Intrinsic::amdgcn_s_barrier_wait, Args: {BarrierID_16})
186 ->copyMetadata(SrcInst: I);
187 I.eraseFromParent();
188 return true;
189 }
190
191 return false;
192}
193
194PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
195 ModuleAnalysisManager &MAM) {
196 AMDGPULowerIntrinsicsImpl Impl(M, TM);
197 if (!Impl.run())
198 return PreservedAnalyses::all();
199 return PreservedAnalyses::none();
200}
201
202bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
203 auto &TPC = getAnalysis<TargetPassConfig>();
204 const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
205
206 AMDGPULowerIntrinsicsImpl Impl(M, TM);
207 return Impl.run();
208}
209
210#define PASS_DESC "AMDGPU lower intrinsics"
211INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
212 false)
213INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
214INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
215 false)
216
217char AMDGPULowerIntrinsicsLegacy::ID = 0;
218
219ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
220 return new AMDGPULowerIntrinsicsLegacy;
221}
222