//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned rather than dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Function &F;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  AssumptionCache *const AC;
  UniformityInfo &UA;

  SmallVector<WeakTrackingVH, 8> DeadInsts;

public:
  AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
                           AssumptionCache *AC, UniformityInfo &UA)
      : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
  bool run();
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, DL, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

using ValueToValueMap = DenseMap<const Value *, Value *>;

class LiveRegOptimizer {
private:
  Module &Mod;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  /// The scalar type to convert to.
  Type *const ConvertToScalar;
  /// The set of visited Instructions.
  SmallPtrSet<Instruction *, 4> Visited;
  /// Map of Value -> Converted Value.
  ValueToValueMap ValMap;
  /// Map containing conversions from Optimal Type -> Original Type per BB.
  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;

public:
  /// Calculate and return the type to convert to given a problematic \p
  /// OriginalType. In some instances, we may widen the type (e.g. v2i8 ->
  /// i32).
  Type *calculateConvertType(Type *OriginalType);
  /// Convert the virtual register defined by \p V to the compatible vector of
  /// legal type.
  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
  /// Convert the virtual register defined by \p V back to the original type
  /// \p ConvertType, stripping away the MSBs in cases where there was an
  /// imperfect fit (e.g. v2i32 -> v7i8).
  Value *convertFromOptType(Type *ConvertType, Instruction *V,
                            BasicBlock::iterator &InstPt,
                            BasicBlock *InsertBlock);
  /// Check for problematic PHI nodes or cross-bb values based on the value
  /// defined by \p I, and coerce to legal types if necessary. For a
  /// problematic PHI node, we coerce all incoming values in a single
  /// invocation.
  bool optimizeLiveType(Instruction *I,
                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  // Whether or not the type should be replaced to avoid inefficient
  // legalization code.
  bool shouldReplace(Type *ITy) {
    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
    if (!VTy)
      return false;

    const auto *TLI = ST.getTargetLowering();

    Type *EltTy = VTy->getElementType();
    // If the element size is larger than the convert-to-scalar size, we can't
    // do any bit packing.
    if (!EltTy->isIntegerTy() ||
        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
      return false;

    // Only coerce illegal types.
    TargetLoweringBase::LegalizeKind LK =
        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
    return LK.first != TargetLoweringBase::TypeLegal;
  }
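
  // Illustrative examples (assuming the usual AMDGPU type legality; the
  // authoritative answer comes from the subtarget's TargetLowering): i8 is
  // typically promoted rather than legal, so shouldReplace(<4 x i8>) is
  // expected to return true, while shouldReplace(<2 x i32>) returns false
  // because i32 is already legal and needs no bit packing.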

  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }

  bool isCoercionProfitable(Instruction *II) {
    SmallPtrSet<Instruction *, 4> CVisited;
    SmallVector<Instruction *, 4> UserList;

    // Check users for profitable conditions (e.g. a cross-block user which can
    // natively handle the illegal vector).
    for (User *V : II->users())
      if (auto *UseInst = dyn_cast<Instruction>(V))
        UserList.push_back(UseInst);

    auto IsLookThru = [](Instruction *II) {
      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
      return isa<PHINode, ShuffleVectorInst, InsertElementInst,
                 ExtractElementInst, CastInst>(II);
    };

    while (!UserList.empty()) {
      auto CII = UserList.pop_back_val();
      if (!CVisited.insert(CII).second)
        continue;

      if (CII->getParent() == II->getParent() && !IsLookThru(II))
        continue;

      if (isOpLegal(CII))
        return true;

      if (IsLookThru(CII))
        for (User *V : CII->users())
          if (auto *UseInst = dyn_cast<Instruction>(V))
            UserList.push_back(UseInst);
    }
    return false;
  }
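
  // Illustrative example: a <4 x i8> produced in one block and consumed by a
  // store or an intrinsic in another block is considered profitable to coerce,
  // since that consumer can take the repacked value directly; a value whose
  // only users are same-block arithmetic is not considered profitable.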

  LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
      : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
        ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::run() {
  // "Optimize" the virtual regs that cross basic block boundaries. When
  // building the SelectionDAG, vectors of illegal types that cross basic
  // blocks will be scalarized and widened, with each scalar living in its
  // own register. To work around this, this optimization converts the
  // vectors to equivalent vectors of legal type (which are converted back
  // before uses in subsequent blocks), to pack the bits into fewer physical
  // registers (used in CopyToReg/CopyFromReg pairs).
  LiveRegOptimizer LRO(*F.getParent(), ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

  for (auto &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
    }

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  return Changed;
}

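// Worked examples derived from the rounding logic below: a <2 x i8> (16 bits)
// fits in a single i32; a <4 x i16> (64 bits) becomes <2 x i32>; a <3 x i16>
// (48 bits) is rounded up and also becomes <2 x i32>.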
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
  assert(OriginalType->getScalarSizeInBits() <=
         ConvertToScalar->getScalarSizeInBits());

  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL.getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)
    return IntegerType::get(Mod.getContext(), ConvertScalarSize);

  return VectorType::get(Type::getIntNTy(Mod.getContext(), ConvertScalarSize),
                         ConvertEltCount, false);
}

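// For example (illustrative IR; value names are made up): widening a
// <3 x i16> %v to its <2 x i32> convert type below pads it with a poison lane
// and then bitcasts:
//   %v.ext = shufflevector <3 x i16> %v, <3 x i16> poison,
//                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %v.bc = bitcast <4 x i16> %v.ext to <2 x i32>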
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                          BasicBlock::iterator &InsertPt) {
  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);

  IRBuilder<> Builder(V->getParent(), InsertPt);
  // If there is a bitsize match, we can fit the old vector into a new vector
  // of the desired type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, we must use a wider vector.
  assert(NewSize > OriginalSize);
  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

  SmallVector<int, 8> ShuffleMask;
  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
  for (unsigned I = 0; I < OriginalElementCount; I++)
    ShuffleMask.push_back(I);

  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}

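// For example (illustrative IR; value names are made up): converting a coerced
// <2 x i32> %c back to an original <3 x i16> below bitcasts to the wide
// element type and then drops the padding lane:
//   %c.bc = bitcast <2 x i32> %c to <4 x i16>
//   %res = shufflevector <4 x i16> %c.bc, <4 x i16> poison,
//                        <3 x i32> <i32 0, i32 1, i32 2>
// A coerced scalar (e.g. an i32 holding a <2 x i8>) is instead truncated to
// i16 and bitcast back to <2 x i8>.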
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                            BasicBlock::iterator &InsertPt,
                                            BasicBlock *InsertBB) {
  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL.getTypeSizeInBits(NewVTy);

  IRBuilder<> Builder(InsertBB, InsertPt);
  // If there is a bitsize match, we simply convert back to the original type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, then we must have used a wider value to
  // hold the bits.
  assert(OriginalSize > NewSize);
  // For wide scalars, we can just truncate the value.
  if (!V->getType()->isVectorTy()) {
    Instruction *Trunc = cast<Instruction>(
        Builder.CreateTrunc(V, IntegerType::get(Mod.getContext(), NewSize)));
    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
  }

  // For wider vectors, we must strip the MSBs to convert back to the original
  // type.
  VectorType *ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod.getContext(), NewVTy->getScalarSizeInBits()),
      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
  Instruction *Converted =
      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
}

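// Illustrative sketch of the overall transformation below, assuming <4 x i8>
// is an illegal type on the subtarget: a <4 x i8> defined in one block and
// used in another is bitcast to i32 right after its definition, carried
// across the block boundary (and through any PHIs) as i32, and converted back
// to <4 x i8> at the start of the using block before its original uses.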
bool LiveRegOptimizer::optimizeLiveType(
    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  SmallVector<Instruction *, 4> Worklist;
  SmallPtrSet<PHINode *, 4> PhiNodes;
  SmallPtrSet<Instruction *, 4> Defs;
  SmallPtrSet<Instruction *, 4> Uses;

  Worklist.push_back(cast<Instruction>(I));
  while (!Worklist.empty()) {
    Instruction *II = Worklist.pop_back_val();

    if (!Visited.insert(II).second)
      continue;

    if (!shouldReplace(II->getType()))
      continue;

    if (!isCoercionProfitable(II))
      continue;

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
      PhiNodes.insert(Phi);
      // Collect all the incoming values of problematic PHI nodes.
      for (Value *V : Phi->incoming_values()) {
        // Repeat the collection process for newly found PHI nodes.
        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
            Worklist.push_back(OpPhi);
          continue;
        }

        Instruction *IncInst = dyn_cast<Instruction>(V);
        // Other incoming value types (e.g. vector literals) are unhandled.
        if (!IncInst && !isa<ConstantAggregateZero>(V))
          return false;

        // Collect all other incoming values for coercion.
        if (IncInst)
          Defs.insert(IncInst);
      }
    }

    // Collect all relevant uses.
    for (User *V : II->users()) {
      // Repeat the collection process for problematic PHI nodes.
      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
          Worklist.push_back(OpPhi);
        continue;
      }

      Instruction *UseInst = cast<Instruction>(V);
      // Collect all uses of PHINodes and any use that crosses BB boundaries.
      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
        Uses.insert(UseInst);
        if (!isa<PHINode>(II))
          Defs.insert(II);
      }
    }
  }

  // Coerce and track the defs.
  for (Instruction *D : Defs) {
    if (!ValMap.contains(D)) {
      BasicBlock::iterator InsertPt = std::next(D->getIterator());
      Value *ConvertVal = convertToOptType(D, InsertPt);
      assert(ConvertVal);
      ValMap[D] = ConvertVal;
    }
  }

  // Construct new-typed PHI nodes.
  for (PHINode *Phi : PhiNodes) {
    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());
  }

  // Connect all the PHI nodes with their new incoming values.
  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));
      } else if (Value *Val = ValMap.lookup(IncVal))
        NewPhi->addIncoming(Val, Phi->getIncomingBlock(I));
      else
        MissingIncVal = true;
    }
    if (MissingIncVal) {
      Value *DeadVal = ValMap[Phi];
      // The coercion chain of the PHI is broken. Delete the Phi
      // from the ValMap and any connected / user Phis.
      SmallVector<Value *, 4> PHIWorklist;
      SmallPtrSet<Value *, 4> VisitedPhis;
      PHIWorklist.push_back(DeadVal);
      while (!PHIWorklist.empty()) {
        Value *NextDeadValue = PHIWorklist.pop_back_val();
        VisitedPhis.insert(NextDeadValue);
        auto OriginalPhi =
            llvm::find_if(PhiNodes, [this, &NextDeadValue](PHINode *CandPhi) {
              return ValMap[CandPhi] == NextDeadValue;
            });
        // This PHI may have already been removed from maps when
        // unwinding a previous Phi.
        if (OriginalPhi != PhiNodes.end())
          ValMap.erase(*OriginalPhi);

        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

        for (User *U : NextDeadValue->users()) {
          if (!VisitedPhis.contains(cast<PHINode>(U)))
            PHIWorklist.push_back(U);
        }
      }
    } else {
      DeadInsts.emplace_back(cast<Instruction>(Phi));
    }
  }
  // Coerce back to the original type and replace the uses.
  for (Instruction *U : Uses) {
    // Replace all converted operands for a use.
    for (auto [OpIdx, Op] : enumerate(U->operands())) {
      if (Value *Val = ValMap.lookup(Op)) {
        Value *NewVal = nullptr;
        if (BBUseValMap.contains(U->getParent()) &&
            BBUseValMap[U->getParent()].contains(Val))
          NewVal = BBUseValMap[U->getParent()][Val];
        else {
          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
          // We may pick up ops that were previously converted for users in
          // other blocks. If there is an originally typed definition of the
          // Op already in this block, simply reuse it.
          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
              U->getParent() == cast<Instruction>(Op)->getParent()) {
            NewVal = Op;
          } else {
            NewVal = convertFromOptType(Op->getType(),
                                        cast<Instruction>(ValMap[Op]), InsertPt,
                                        U->getParent());
            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
          }
        }
        assert(NewVal);
        U->setOperand(OpIdx, NewVal);
      }
    }
  }

  return true;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  Type *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL.getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL.getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA.isUniform(&LI);
}

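// Illustrative IR for the widening below (value names are made up): a uniform,
// naturally aligned i16 load from constant memory at a DWORD-aligned base plus
// an offset of 6 becomes a DWORD load at offset 4 followed by a shift and a
// truncate:
//   %wide = load i32, ptr addrspace(4) %gep4, align 4
//   %shr = lshr i32 %wide, 16
//   %val = trunc i32 %shr to i16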
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL.getTypeStoreSizeInBits(LI.getType());
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  Value *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt),
                      DL.typeSizeEqualsStoreSize(LI.getType()) ? IntNTy
                                                               : LI.getType()),
      LI.getType());
  LI.replaceAllUsesWith(NewVal);
  DeadInsts.emplace_back(&LI);

  return true;
}

PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
  UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);

  bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA = PreservedAnalyses::none();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class AMDGPULateCodeGenPrepareLegacy : public FunctionPass {
public:
  static char ID;

  AMDGPULateCodeGenPrepareLegacy() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override;
};

bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UniformityInfo &UI =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepareLegacy::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPrepareLegacyPass() {
  return new AMDGPULateCodeGenPrepareLegacy();
}