//===-- AMDGPULowerBufferFatPointers.cpp ---------------------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers operations on buffer fat pointers (addrspace 7) to
// operations on buffer resources (addrspace 8) and is needed for correct
// codegen.
//
// # Background
//
// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
// of a 128-bit buffer descriptor and a 32-bit offset into the buffer it
// describes. The buffer resource part needs to be a "raw" buffer resource
// (it must have a stride of 0 and bounds checks must be in raw buffer mode
// or disabled).
//
// When these requirements are met, a buffer resource can be treated as a
// typical (though quite wide) pointer that follows typical LLVM pointer
// semantics. This allows the frontend to reason about such buffers (which are
// often encountered in the context of SPIR-V kernels).
//
// However, because of their non-power-of-2 size, these fat pointers cannot be
// present during translation to MIR (though this restriction may be lifted
// during the transition to GlobalISel). Therefore, this pass is needed in order
// to correctly implement these fat pointers.
//
// The resource intrinsics take the resource part (the address space 8 pointer)
// and the offset part (the 32-bit integer) as separate arguments. In addition,
// many users of these buffers manipulate the offset while leaving the resource
// part alone. For these reasons, we typically want to keep the resource and
// offset parts in separate variables, combining them only when this is
// required, such as when inserting these values into aggregates or moving them
// to memory.
//
// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
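//
// For example (an illustrative sketch, not literal pass output), an offset
// adjustment on a fat pointer such as
// ```
// %q = getelementptr i8, ptr addrspace(7) %p, i32 4
// ```
// is conceptually rewritten in terms of the two parts as
// ```
// %q.off = add i32 %p.off, 4     ; %q.rsrc is just %p.rsrc, unchanged
// ```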
//
// # Implementation
//
// This pass proceeds in three main phases:
//
// ## Rewriting loads and stores of p7
//
// The first phase is to rewrite away all loads and stores of `ptr
// addrspace(7)`, including aggregates containing such pointers, to ones that
// use `i160`. This is handled by `StoreFatPtrsAsIntsVisitor`, which visits
// loads, stores, and allocas and, if the loaded or stored type contains `ptr
// addrspace(7)`, rewrites that type to one where the p7s are replaced by
// i160s, copying other parts of aggregates as needed. In the case of a store,
// each pointer is `ptrtoint`d to i160 before storing, and loaded integers are
// `inttoptr`d back. This same transformation is applied to vectors of
// pointers.
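//
// As a sketch (names are illustrative, and the slot is assumed to be in the
// usual private addrspace(5)), a store of a fat pointer
// ```
// store ptr addrspace(7) %p, ptr addrspace(5) %slot
// ```
// becomes
// ```
// %p.int = ptrtoint ptr addrspace(7) %p to i160
// store i160 %p.int, ptr addrspace(5) %slot
// ```
// and the matching load performs the inverse `inttoptr`.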
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we would
// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
// and `Nxi160` directly. Instead, that transposing action (where the vectors
// of resources and vectors of offsets are concatenated before being stored to
// memory) is handled through implementing `inttoptr` and `ptrtoint` only.
//
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
// ## Type remapping
//
// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
// to the corresponding struct type, which has a resource part and an offset
// part.
//
// This is done with a `BufferFatPtrToStructTypeMap` and a
// `FatPtrConstMaterializer`, usually by way of `setType`ing values. Constants
// are handled here because there isn't a good way to fix them up later.
//
// This has the downside of leaving the IR in an invalid state (for example,
// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
// but all such invalid states will be resolved by the third phase.
//
// Functions that don't take buffer fat pointers are modified in place. Those
// that do take such pointers have their basic blocks moved to a new function
// whose arguments and return values use {ptr addrspace(8), i32} in place of
// the fat pointers. This phase also records intrinsics so that they can be
// remangled or deleted later.
//
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
// produce or consume [vectors of] buffer fat pointers in terms of their
// resource and offset parts. This is accomplished through the
// `SplitPtrStructs` visitor.
//
// In the first pass through each function that is being lowered, the splitter
// inserts new instructions to implement the split-structures behavior, which is
// needed for correctness and performance. It records a list of "split users",
// instructions that are being replaced by operations on the resource and offset
// parts.
//
// Split users do not necessarily need to produce parts themselves (a
// `load float, ptr addrspace(7)` does not, for example), but, if they do not
// generate fat buffer pointers, they must RAUW in their replacement
// instructions during the initial visit.
//
// When these new instructions are created, they use the split parts recorded
// for their initial arguments in order to generate their replacements, creating
// a parallel set of instructions that does not refer to the original fat
// pointer values but instead to their resource and offset components.
//
// Instructions, such as `extractvalue`, that produce buffer fat pointers from
// sources that do not have split parts, have such parts generated using
// `extractvalue`. This is also the initial handling of PHI nodes, which
// are then cleaned up.
//
// ### Conditionals
//
// PHI nodes are initially given resource parts via `extractvalue`. However,
// this is not an efficient rewrite of such nodes, as, in most cases, the
// resource part in a conditional or loop remains constant throughout the loop
// and only the offset varies. Failing to optimize away these constant resources
// would cause additional registers to be sent around loops and might lead to
// waterfall loops being generated for buffer operations due to the
// "non-uniform" resource argument.
//
// Therefore, after all instructions have been visited, the pointer splitter
// post-processes all encountered conditionals. Given a PHI node or select,
// getPossibleRsrcRoots() collects all values that the resource parts of that
// conditional's input could come from as well as collecting all conditional
// instructions encountered during the search. If, after filtering out the
// initial node itself, the set of encountered conditionals is a subset of the
// potential roots and there is a single potential resource that isn't in the
// conditional set, that value is the only possible value the resource argument
// could have throughout the control flow.
//
// If that condition is met, then a PHI node can have its resource part changed
// to the singleton value and then be replaced by a PHI on the offsets.
// Otherwise, each PHI node is split into two, one for the resource part and one
// for the offset part, which replace the temporary `extractvalue` instructions
// that were added during the first pass.
//
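// As an illustrative sketch (the names are hypothetical, not literal pass
// output), a loop PHI over a fat pointer whose resource part never changes
// ```
// %p = phi ptr addrspace(7) [ %base, %entry ], [ %p.next, %loop ]
// ```
// can, after splitting and cleanup, become a single PHI over the offsets
// ```
// %p.off = phi i32 [ %base.off, %entry ], [ %p.next.off, %loop ]
// ```
// with the resource part taken directly from `%base.rsrc`.
//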
// Similar logic applies to `select`, where
// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
// can be split into `%z.rsrc = %x.rsrc` and
// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
// if both `%x` and `%y` have the same resource part, but two `select`
// operations will be needed if they do not.
//
// ### Final processing
//
// After conditionals have been cleaned up, the IR for each function is
// rewritten to remove all the old instructions that have been split up.
//
// Any instruction that used to produce a buffer fat pointer (and therefore now
// produces a resource-and-offset struct after type remapping) is
// replaced as follows:
// 1. All debug value annotations are cloned to reflect that the resource part
//    and offset parts are computed separately and constitute different
//    fragments of the underlying source language variable.
// 2. All uses that were themselves split are replaced by a `poison` of the
//    struct type, as they will themselves be erased soon. This rule, combined
//    with debug handling, should leave the use lists of split instructions
//    empty in almost all cases.
// 3. If a user of the original struct-valued result remains, the structure
//    needed for the new types to work is constructed out of the newly-defined
//    parts, and the original instruction is replaced by this structure
//    before being erased. Instructions requiring this construction include
//    `ret` and `insertvalue`.
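//
// For instance (a sketch with hypothetical names), if the only remaining user
// of `%p` is a `ret`, the pass materializes
// ```
// %p.0 = insertvalue {ptr addrspace(8), i32} poison, ptr addrspace(8) %p.rsrc, 0
// %p.1 = insertvalue {ptr addrspace(8), i32} %p.0, i32 %p.off, 1
// ret {ptr addrspace(8), i32} %p.1
// ```
// before erasing the original definition of `%p`.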
//
// # Consequences
//
// This pass does not alter the CFG.
//
// Alias analysis information will become coarser, as the LLVM alias analyzer
// cannot handle the buffer intrinsics. Specifically, while we can determine
// that the following two loads do not alias:
// ```
// %y = getelementptr i32, ptr addrspace(7) %x, i32 1
// %a = load i32, ptr addrspace(7) %x
// %b = load i32, ptr addrspace(7) %y
// ```
// we cannot (except through some code that runs during scheduling) determine
// that the rewritten loads below do not alias.
// ```
// %y.off = add i32 %x.off, 1
// %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32
//     %x.off, ...)
// %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8)
//     %x.rsrc, i32 %y.off, ...)
// ```
// However, existing alias information is preserved.
//===----------------------------------------------------------------------===//
195
196#include "AMDGPU.h"
197#include "AMDGPUTargetMachine.h"
198#include "GCNSubtarget.h"
199#include "SIDefines.h"
200#include "llvm/ADT/SetOperations.h"
201#include "llvm/ADT/SmallVector.h"
202#include "llvm/Analysis/ConstantFolding.h"
203#include "llvm/Analysis/Utils/Local.h"
204#include "llvm/CodeGen/TargetPassConfig.h"
205#include "llvm/IR/AttributeMask.h"
206#include "llvm/IR/Constants.h"
207#include "llvm/IR/DebugInfo.h"
208#include "llvm/IR/DerivedTypes.h"
209#include "llvm/IR/IRBuilder.h"
210#include "llvm/IR/InstIterator.h"
211#include "llvm/IR/InstVisitor.h"
212#include "llvm/IR/Instructions.h"
213#include "llvm/IR/Intrinsics.h"
214#include "llvm/IR/IntrinsicsAMDGPU.h"
215#include "llvm/IR/Metadata.h"
216#include "llvm/IR/Operator.h"
217#include "llvm/IR/PatternMatch.h"
218#include "llvm/IR/ReplaceConstant.h"
219#include "llvm/InitializePasses.h"
220#include "llvm/Pass.h"
221#include "llvm/Support/AtomicOrdering.h"
222#include "llvm/Support/Debug.h"
223#include "llvm/Support/ErrorHandling.h"
224#include "llvm/Transforms/Utils/Cloning.h"
225#include "llvm/Transforms/Utils/Local.h"
226#include "llvm/Transforms/Utils/ValueMapper.h"
227
228#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
229
230using namespace llvm;
231
232static constexpr unsigned BufferOffsetWidth = 32;
233
234namespace {
235/// Recursively replace instances of ptr addrspace(7) and vector<Nxptr
236/// addrspace(7)> with some other type as defined by the relevant subclass.
237class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper {
238 DenseMap<Type *, Type *> Map;
239
240 Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl<StructType *> &Seen);
241
242protected:
243 virtual Type *remapScalar(PointerType *PT) = 0;
244 virtual Type *remapVector(VectorType *VT) = 0;
245
246 const DataLayout &DL;
247
248public:
249 BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {}
250 Type *remapType(Type *SrcTy) override;
251 void clear() { Map.clear(); }
252};
253
/// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to
/// vector<Nxi160> in order to correctly handle loading/storing these values
/// from memory.
257class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase {
258 using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;
259
260protected:
261 Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); }
262 Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); }
263};
264
265/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset
266/// parts of the pointer) so that we can easily rewrite operations on these
267/// values that aren't loading them from or storing them to memory.
268class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase {
269 using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;
270
271protected:
272 Type *remapScalar(PointerType *PT) override;
273 Type *remapVector(VectorType *VT) override;
274};
275} // namespace
276
277// This code is adapted from the type remapper in lib/Linker/IRMover.cpp
278Type *BufferFatPtrTypeLoweringBase::remapTypeImpl(
279 Type *Ty, SmallPtrSetImpl<StructType *> &Seen) {
280 Type **Entry = &Map[Ty];
281 if (*Entry)
282 return *Entry;
283 if (auto *PT = dyn_cast<PointerType>(Val: Ty)) {
284 if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
285 return *Entry = remapScalar(PT);
286 }
287 }
288 if (auto *VT = dyn_cast<VectorType>(Val: Ty)) {
289 auto *PT = dyn_cast<PointerType>(Val: VT->getElementType());
290 if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
291 return *Entry = remapVector(VT);
292 }
293 return *Entry = Ty;
294 }
  // Whether the type is one that is structurally uniqued - that is, anything
  // other than a named struct (the only kind of type for which multiple
  // structurally identical instances can have distinct `Type*`s).
298 StructType *TyAsStruct = dyn_cast<StructType>(Val: Ty);
299 bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral();
300 // Base case for ints, floats, opaque pointers, and so on, which don't
301 // require recursion.
302 if (Ty->getNumContainedTypes() == 0 && IsUniqued)
303 return *Entry = Ty;
304 if (!IsUniqued) {
305 // Create a dummy type for recursion purposes.
306 if (!Seen.insert(Ptr: TyAsStruct).second) {
307 StructType *Placeholder = StructType::create(Context&: Ty->getContext());
308 return *Entry = Placeholder;
309 }
310 }
311 bool Changed = false;
312 SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr);
313 for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) {
314 Type *OldElem = Ty->getContainedType(i: I);
315 Type *NewElem = remapTypeImpl(Ty: OldElem, Seen);
316 ElementTypes[I] = NewElem;
317 Changed |= (OldElem != NewElem);
318 }
  // Recursive calls to remapTypeImpl() may have invalidated the pointer.
320 Entry = &Map[Ty];
321 if (!Changed) {
322 return *Entry = Ty;
323 }
324 if (auto *ArrTy = dyn_cast<ArrayType>(Val: Ty))
325 return *Entry = ArrayType::get(ElementType: ElementTypes[0], NumElements: ArrTy->getNumElements());
326 if (auto *FnTy = dyn_cast<FunctionType>(Val: Ty))
327 return *Entry = FunctionType::get(Result: ElementTypes[0],
328 Params: ArrayRef(ElementTypes).slice(N: 1),
329 isVarArg: FnTy->isVarArg());
330 if (auto *STy = dyn_cast<StructType>(Val: Ty)) {
331 // Genuine opaque types don't have a remapping.
332 if (STy->isOpaque())
333 return *Entry = Ty;
334 bool IsPacked = STy->isPacked();
335 if (IsUniqued)
336 return *Entry = StructType::get(Context&: Ty->getContext(), Elements: ElementTypes, isPacked: IsPacked);
337 SmallString<16> Name(STy->getName());
338 STy->setName("");
339 Type **RecursionEntry = &Map[Ty];
340 if (*RecursionEntry) {
341 auto *Placeholder = cast<StructType>(Val: *RecursionEntry);
342 Placeholder->setBody(Elements: ElementTypes, isPacked: IsPacked);
343 Placeholder->setName(Name);
344 return *Entry = Placeholder;
345 }
346 return *Entry = StructType::create(Context&: Ty->getContext(), Elements: ElementTypes, Name,
347 isPacked: IsPacked);
348 }
349 llvm_unreachable("Unknown type of type that contains elements");
350}
351
352Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) {
353 SmallPtrSet<StructType *, 2> Visited;
354 return remapTypeImpl(Ty: SrcTy, Seen&: Visited);
355}
356
357Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) {
358 LLVMContext &Ctx = PT->getContext();
359 return StructType::get(elt1: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::BUFFER_RESOURCE),
360 elts: IntegerType::get(C&: Ctx, NumBits: BufferOffsetWidth));
361}
362
363Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) {
364 ElementCount EC = VT->getElementCount();
365 LLVMContext &Ctx = VT->getContext();
366 Type *RsrcVec =
367 VectorType::get(ElementType: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::BUFFER_RESOURCE), EC);
368 Type *OffVec = VectorType::get(ElementType: IntegerType::get(C&: Ctx, NumBits: BufferOffsetWidth), EC);
369 return StructType::get(elt1: RsrcVec, elts: OffVec);
370}
371
372static bool isBufferFatPtrOrVector(Type *Ty) {
373 if (auto *PT = dyn_cast<PointerType>(Val: Ty->getScalarType()))
374 return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER;
375 return false;
376}
377
378// True if the type is {ptr addrspace(8), i32} or a struct containing vectors of
379// those types. Used to quickly skip instructions we don't need to process.
380static bool isSplitFatPtr(Type *Ty) {
381 auto *ST = dyn_cast<StructType>(Val: Ty);
382 if (!ST)
383 return false;
384 if (!ST->isLiteral() || ST->getNumElements() != 2)
385 return false;
386 auto *MaybeRsrc =
387 dyn_cast<PointerType>(Val: ST->getElementType(N: 0)->getScalarType());
388 auto *MaybeOff =
389 dyn_cast<IntegerType>(Val: ST->getElementType(N: 1)->getScalarType());
390 return MaybeRsrc && MaybeOff &&
391 MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE &&
392 MaybeOff->getBitWidth() == BufferOffsetWidth;
393}
394
395// True if the result type or any argument types are buffer fat pointers.
396static bool isBufferFatPtrConst(Constant *C) {
397 Type *T = C->getType();
398 return isBufferFatPtrOrVector(Ty: T) || any_of(Range: C->operands(), P: [](const Use &U) {
399 return isBufferFatPtrOrVector(Ty: U.get()->getType());
400 });
401}
402
403namespace {
404/// Convert [vectors of] buffer fat pointers to integers when they are read from
405/// or stored to memory. This ensures that these pointers will have the same
406/// memory layout as before they are lowered, even though they will no longer
407/// have their previous layout in registers/in the program (they'll be broken
408/// down into resource and offset parts). This has the downside of imposing
409/// marshalling costs when reading or storing these values, but since placing
410/// such pointers into memory is an uncommon operation at best, we feel that
411/// this cost is acceptable for better performance in the common case.
412class StoreFatPtrsAsIntsVisitor
413 : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
414 BufferFatPtrToIntTypeMap *TypeMap;
415
416 ValueToValueMapTy ConvertedForStore;
417
418 IRBuilder<> IRB;
419
  // Convert all the buffer fat pointers within the input value to integers
  // so that it can be stored in memory.
  Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
  // Convert all the i160s that need to be buffer fat pointers (as specified
  // by the To type) into those pointers to preserve the semantics of the rest
  // of the program.
  Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);
427
428public:
429 StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
430 : TypeMap(TypeMap), IRB(Ctx) {}
431 bool processFunction(Function &F);
432
433 bool visitInstruction(Instruction &I) { return false; }
434 bool visitAllocaInst(AllocaInst &I);
435 bool visitLoadInst(LoadInst &LI);
436 bool visitStoreInst(StoreInst &SI);
437 bool visitGetElementPtrInst(GetElementPtrInst &I);
438};
439} // namespace
440
441Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
442 const Twine &Name) {
443 if (From == To)
444 return V;
445 ValueToValueMapTy::iterator Find = ConvertedForStore.find(Val: V);
446 if (Find != ConvertedForStore.end())
447 return Find->second;
448 if (isBufferFatPtrOrVector(Ty: From)) {
449 Value *Cast = IRB.CreatePtrToInt(V, DestTy: To, Name: Name + ".int");
450 ConvertedForStore[V] = Cast;
451 return Cast;
452 }
453 if (From->getNumContainedTypes() == 0)
454 return V;
455 // Structs, arrays, and other compound types.
456 Value *Ret = PoisonValue::get(T: To);
457 if (auto *AT = dyn_cast<ArrayType>(Val: From)) {
458 Type *FromPart = AT->getArrayElementType();
459 Type *ToPart = cast<ArrayType>(Val: To)->getElementType();
460 for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
461 Value *Field = IRB.CreateExtractValue(Agg: V, Idxs: I);
462 Value *NewField =
463 fatPtrsToInts(V: Field, From: FromPart, To: ToPart, Name: Name + "." + Twine(I));
464 Ret = IRB.CreateInsertValue(Agg: Ret, Val: NewField, Idxs: I);
465 }
466 } else {
467 for (auto [Idx, FromPart, ToPart] :
468 enumerate(First: From->subtypes(), Rest: To->subtypes())) {
469 Value *Field = IRB.CreateExtractValue(Agg: V, Idxs: Idx);
470 Value *NewField =
471 fatPtrsToInts(V: Field, From: FromPart, To: ToPart, Name: Name + "." + Twine(Idx));
472 Ret = IRB.CreateInsertValue(Agg: Ret, Val: NewField, Idxs: Idx);
473 }
474 }
475 ConvertedForStore[V] = Ret;
476 return Ret;
477}
478
479Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
480 const Twine &Name) {
481 if (From == To)
482 return V;
483 if (isBufferFatPtrOrVector(Ty: To)) {
484 Value *Cast = IRB.CreateIntToPtr(V, DestTy: To, Name: Name + ".ptr");
485 return Cast;
486 }
487 if (From->getNumContainedTypes() == 0)
488 return V;
489 // Structs, arrays, and other compound types.
490 Value *Ret = PoisonValue::get(T: To);
491 if (auto *AT = dyn_cast<ArrayType>(Val: From)) {
492 Type *FromPart = AT->getArrayElementType();
493 Type *ToPart = cast<ArrayType>(Val: To)->getElementType();
494 for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
495 Value *Field = IRB.CreateExtractValue(Agg: V, Idxs: I);
496 Value *NewField =
497 intsToFatPtrs(V: Field, From: FromPart, To: ToPart, Name: Name + "." + Twine(I));
498 Ret = IRB.CreateInsertValue(Agg: Ret, Val: NewField, Idxs: I);
499 }
500 } else {
501 for (auto [Idx, FromPart, ToPart] :
502 enumerate(First: From->subtypes(), Rest: To->subtypes())) {
503 Value *Field = IRB.CreateExtractValue(Agg: V, Idxs: Idx);
504 Value *NewField =
505 intsToFatPtrs(V: Field, From: FromPart, To: ToPart, Name: Name + "." + Twine(Idx));
506 Ret = IRB.CreateInsertValue(Agg: Ret, Val: NewField, Idxs: Idx);
507 }
508 }
509 return Ret;
510}
511
512bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
513 bool Changed = false;
  // The visitors will mutate GEPs and allocas in place. Loads and stores are
  // replaced (and the originals erased), so iterate with early increment to
  // avoid invalidation.
516 for (Instruction &I : make_early_inc_range(Range: instructions(F))) {
517 Changed |= visit(I);
518 }
519 ConvertedForStore.clear();
520 return Changed;
521}
522
523bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
524 Type *Ty = I.getAllocatedType();
525 Type *NewTy = TypeMap->remapType(SrcTy: Ty);
526 if (Ty == NewTy)
527 return false;
528 I.setAllocatedType(NewTy);
529 return true;
530}
531
532bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
533 Type *Ty = I.getSourceElementType();
534 Type *NewTy = TypeMap->remapType(SrcTy: Ty);
535 if (Ty == NewTy)
536 return false;
537 // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so
538 // make sure GEPs don't have different semantics with the new type.
539 I.setSourceElementType(NewTy);
540 I.setResultElementType(TypeMap->remapType(SrcTy: I.getResultElementType()));
541 return true;
542}
543
544bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
545 Type *Ty = LI.getType();
546 Type *IntTy = TypeMap->remapType(SrcTy: Ty);
547 if (Ty == IntTy)
548 return false;
549
550 IRB.SetInsertPoint(&LI);
551 auto *NLI = cast<LoadInst>(Val: LI.clone());
552 NLI->mutateType(Ty: IntTy);
553 NLI = IRB.Insert(I: NLI);
554 copyMetadataForLoad(Dest&: *NLI, Source: LI);
555 NLI->takeName(V: &LI);
556
557 Value *CastBack = intsToFatPtrs(V: NLI, From: IntTy, To: Ty, Name: NLI->getName());
558 LI.replaceAllUsesWith(V: CastBack);
559 LI.eraseFromParent();
560 return true;
561}
562
563bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
564 Value *V = SI.getValueOperand();
565 Type *Ty = V->getType();
566 Type *IntTy = TypeMap->remapType(SrcTy: Ty);
567 if (Ty == IntTy)
568 return false;
569
570 IRB.SetInsertPoint(&SI);
571 Value *IntV = fatPtrsToInts(V, From: Ty, To: IntTy, Name: V->getName());
572 for (auto *Dbg : at::getAssignmentMarkers(Inst: &SI))
573 Dbg->setValue(IntV);
574
575 SI.setOperand(i_nocapture: 0, Val_nocapture: IntV);
576 return true;
577}
578
579/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered
580/// buffer fat pointer constant.
581static std::pair<Constant *, Constant *>
582splitLoweredFatBufferConst(Constant *C) {
583 assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer");
584 return std::make_pair(x: C->getAggregateElement(Elt: 0u), y: C->getAggregateElement(Elt: 1u));
585}
586
587namespace {
588/// Handle the remapping of ptr addrspace(7) constants.
589class FatPtrConstMaterializer final : public ValueMaterializer {
590 BufferFatPtrToStructTypeMap *TypeMap;
  // An internal mapper used to recurse into the arguments of constants.
  // While the documentation for `ValueMapper` specifies not to use it
  // recursively, examination of the logic in mapValue() shows that it can
  // safely be used recursively when handling constants, just as mapValue()
  // itself does internally.
596 ValueMapper InternalMapper;
597
598 Constant *materializeBufferFatPtrConst(Constant *C);
599
600public:
601 // UnderlyingMap is the value map this materializer will be filling.
602 FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap,
603 ValueToValueMapTy &UnderlyingMap)
604 : TypeMap(TypeMap),
605 InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {}
606 virtual ~FatPtrConstMaterializer() = default;
607
608 Value *materialize(Value *V) override;
609};
610} // namespace
611
612Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
613 Type *SrcTy = C->getType();
614 auto *NewTy = dyn_cast<StructType>(Val: TypeMap->remapType(SrcTy));
615 if (C->isNullValue())
616 return ConstantAggregateZero::getNullValue(Ty: NewTy);
617 if (isa<PoisonValue>(Val: C)) {
618 return ConstantStruct::get(T: NewTy,
619 V: {PoisonValue::get(T: NewTy->getElementType(N: 0)),
620 PoisonValue::get(T: NewTy->getElementType(N: 1))});
621 }
622 if (isa<UndefValue>(Val: C)) {
623 return ConstantStruct::get(T: NewTy,
624 V: {UndefValue::get(T: NewTy->getElementType(N: 0)),
625 UndefValue::get(T: NewTy->getElementType(N: 1))});
626 }
627
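  // Vector constants are split into a vector of resource parts and a vector of
  // offset parts, preserving splats when the input is a splat.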
628 if (auto *VC = dyn_cast<ConstantVector>(Val: C)) {
629 if (Constant *S = VC->getSplatValue()) {
630 Constant *NewS = InternalMapper.mapConstant(C: *S);
631 if (!NewS)
632 return nullptr;
633 auto [Rsrc, Off] = splitLoweredFatBufferConst(C: NewS);
634 auto EC = VC->getType()->getElementCount();
635 return ConstantStruct::get(T: NewTy, V: {ConstantVector::getSplat(EC, Elt: Rsrc),
636 ConstantVector::getSplat(EC, Elt: Off)});
637 }
638 SmallVector<Constant *> Rsrcs;
639 SmallVector<Constant *> Offs;
640 for (Value *Op : VC->operand_values()) {
641 auto *NewOp = dyn_cast_or_null<Constant>(Val: InternalMapper.mapValue(V: *Op));
642 if (!NewOp)
643 return nullptr;
644 auto [Rsrc, Off] = splitLoweredFatBufferConst(C: NewOp);
645 Rsrcs.push_back(Elt: Rsrc);
646 Offs.push_back(Elt: Off);
647 }
648 Constant *RsrcVec = ConstantVector::get(V: Rsrcs);
649 Constant *OffVec = ConstantVector::get(V: Offs);
650 return ConstantStruct::get(T: NewTy, V: {RsrcVec, OffVec});
651 }
652
653 if (isa<GlobalValue>(Val: C))
654 report_fatal_error(reason: "Global values containing ptr addrspace(7) (buffer "
655 "fat pointer) values are not supported");
656
657 if (isa<ConstantExpr>(Val: C))
658 report_fatal_error(reason: "Constant exprs containing ptr addrspace(7) (buffer "
659 "fat pointer) values should have been expanded earlier");
660
661 return nullptr;
662}
663
664Value *FatPtrConstMaterializer::materialize(Value *V) {
665 Constant *C = dyn_cast<Constant>(Val: V);
666 if (!C)
667 return nullptr;
668 // Structs and other types that happen to contain fat pointers get remapped
669 // by the mapValue() logic.
670 if (!isBufferFatPtrConst(C))
671 return nullptr;
672 return materializeBufferFatPtrConst(C);
673}
674
675using PtrParts = std::pair<Value *, Value *>;
676namespace {
677// The visitor returns the resource and offset parts for an instruction if they
678// can be computed, or (nullptr, nullptr) for cases that don't have a meaningful
679// value mapping.
680class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
681 ValueToValueMapTy RsrcParts;
682 ValueToValueMapTy OffParts;
683
684 // Track instructions that have been rewritten into a user of the component
685 // parts of their ptr addrspace(7) input. Instructions that produced
686 // ptr addrspace(7) parts should **not** be RAUW'd before being added to this
687 // set, as that replacement will be handled in a post-visit step. However,
688 // instructions that yield values that aren't fat pointers (ex. ptrtoint)
689 // should RAUW themselves with new instructions that use the split parts
690 // of their arguments during processing.
691 DenseSet<Instruction *> SplitUsers;
692
693 // Nodes that need a second look once we've computed the parts for all other
694 // instructions to see if, for example, we really need to phi on the resource
695 // part.
696 SmallVector<Instruction *> Conditionals;
697 // Temporary instructions produced while lowering conditionals that should be
698 // killed.
699 SmallVector<Instruction *> ConditionalTemps;
700
701 // Subtarget info, needed for determining what cache control bits to set.
702 const TargetMachine *TM;
703 const GCNSubtarget *ST = nullptr;
704
705 IRBuilder<> IRB;
706
707 // Copy metadata between instructions if applicable.
708 void copyMetadata(Value *Dest, Value *Src);
709
710 // Get the resource and offset parts of the value V, inserting appropriate
711 // extractvalue calls if needed.
712 PtrParts getPtrParts(Value *V);
713
  // Given an instruction that could produce multiple resource parts (a PHI or
  // select), collect the set of possible values that its resource part could
  // have come from (the `Roots`) and the set of conditional instructions
  // visited during the search (`Seen`). If, after removing the root of the
  // search from `Seen` and `Roots`, `Seen` is a subset of `Roots` and
  // `Roots - Seen` contains one element, the resource part of that element can
  // replace the resource part of all other elements in `Seen`.
721 void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots,
722 SmallPtrSetImpl<Value *> &Seen);
723 void processConditionals();
724
  // If an instruction has been split into resource and offset parts,
  // delete that instruction. If any of its uses have not themselves been split
  // into parts (for example, an insertvalue), construct the struct value
  // that the type remapping says the dying instruction should produce
  // and use that.
730 // Also, kill the temporary extractvalue operations produced by the two-stage
731 // lowering of PHIs and conditionals.
732 void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs);
733
734 void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx);
735 void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
736 void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
737 Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty,
738 Align Alignment, AtomicOrdering Order,
739 bool IsVolatile, SyncScope::ID SSID);
740
741public:
742 SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
743 : TM(TM), IRB(Ctx) {}
744
745 void processFunction(Function &F);
746
747 PtrParts visitInstruction(Instruction &I);
748 PtrParts visitLoadInst(LoadInst &LI);
749 PtrParts visitStoreInst(StoreInst &SI);
750 PtrParts visitAtomicRMWInst(AtomicRMWInst &AI);
751 PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI);
752 PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP);
753
754 PtrParts visitPtrToIntInst(PtrToIntInst &PI);
755 PtrParts visitIntToPtrInst(IntToPtrInst &IP);
756 PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I);
757 PtrParts visitICmpInst(ICmpInst &Cmp);
758 PtrParts visitFreezeInst(FreezeInst &I);
759
760 PtrParts visitExtractElementInst(ExtractElementInst &I);
761 PtrParts visitInsertElementInst(InsertElementInst &I);
762 PtrParts visitShuffleVectorInst(ShuffleVectorInst &I);
763
764 PtrParts visitPHINode(PHINode &PHI);
765 PtrParts visitSelectInst(SelectInst &SI);
766
767 PtrParts visitIntrinsicInst(IntrinsicInst &II);
768};
769} // namespace
770
771void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) {
772 auto *DestI = dyn_cast<Instruction>(Val: Dest);
773 auto *SrcI = dyn_cast<Instruction>(Val: Src);
774
775 if (!DestI || !SrcI)
776 return;
777
778 DestI->copyMetadata(SrcInst: *SrcI);
779}
780
781PtrParts SplitPtrStructs::getPtrParts(Value *V) {
782 assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts "
783 "of something that wasn't rewritten");
784 auto *RsrcEntry = &RsrcParts[V];
785 auto *OffEntry = &OffParts[V];
786 if (*RsrcEntry && *OffEntry)
787 return {*RsrcEntry, *OffEntry};
788
789 if (auto *C = dyn_cast<Constant>(Val: V)) {
790 auto [Rsrc, Off] = splitLoweredFatBufferConst(C);
791 return {*RsrcEntry = Rsrc, *OffEntry = Off};
792 }
793
794 IRBuilder<>::InsertPointGuard Guard(IRB);
795 if (auto *I = dyn_cast<Instruction>(Val: V)) {
796 LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
797 auto [Rsrc, Off] = visit(I&: *I);
798 if (Rsrc && Off)
799 return {*RsrcEntry = Rsrc, *OffEntry = Off};
800 // We'll be creating the new values after the relevant instruction.
801 // This instruction generates a value and so isn't a terminator.
802 IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
803 IRB.SetCurrentDebugLocation(I->getDebugLoc());
804 } else if (auto *A = dyn_cast<Argument>(Val: V)) {
805 IRB.SetInsertPointPastAllocas(A->getParent());
806 IRB.SetCurrentDebugLocation(DebugLoc());
807 }
808 Value *Rsrc = IRB.CreateExtractValue(Agg: V, Idxs: 0, Name: V->getName() + ".rsrc");
809 Value *Off = IRB.CreateExtractValue(Agg: V, Idxs: 1, Name: V->getName() + ".off");
810 return {*RsrcEntry = Rsrc, *OffEntry = Off};
811}
812
813/// Returns the instruction that defines the resource part of the value V.
814/// Note that this is not getUnderlyingObject(), since that looks through
815/// operations like ptrmask which might modify the resource part.
816///
817/// We can limit ourselves to just looking through GEPs followed by looking
818/// through addrspacecasts because only those two operations preserve the
819/// resource part, and because operations on an `addrspace(8)` (which is the
820/// legal input to this addrspacecast) would produce a different resource part.
821static Value *rsrcPartRoot(Value *V) {
822 while (auto *GEP = dyn_cast<GEPOperator>(Val: V))
823 V = GEP->getPointerOperand();
824 while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(Val: V))
825 V = ASC->getPointerOperand();
826 return V;
827}
828
829void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
830 SmallPtrSetImpl<Value *> &Roots,
831 SmallPtrSetImpl<Value *> &Seen) {
832 if (auto *PHI = dyn_cast<PHINode>(Val: I)) {
833 if (!Seen.insert(Ptr: I).second)
834 return;
835 for (Value *In : PHI->incoming_values()) {
836 In = rsrcPartRoot(V: In);
837 Roots.insert(Ptr: In);
838 if (isa<PHINode, SelectInst>(Val: In))
839 getPossibleRsrcRoots(I: cast<Instruction>(Val: In), Roots, Seen);
840 }
841 } else if (auto *SI = dyn_cast<SelectInst>(Val: I)) {
842 if (!Seen.insert(Ptr: SI).second)
843 return;
844 Value *TrueVal = rsrcPartRoot(V: SI->getTrueValue());
845 Value *FalseVal = rsrcPartRoot(V: SI->getFalseValue());
846 Roots.insert(Ptr: TrueVal);
847 Roots.insert(Ptr: FalseVal);
848 if (isa<PHINode, SelectInst>(Val: TrueVal))
849 getPossibleRsrcRoots(I: cast<Instruction>(Val: TrueVal), Roots, Seen);
850 if (isa<PHINode, SelectInst>(Val: FalseVal))
851 getPossibleRsrcRoots(I: cast<Instruction>(Val: FalseVal), Roots, Seen);
852 } else {
    llvm_unreachable("getPossibleRsrcRoots() only works on phi and select");
854 }
855}
856
857void SplitPtrStructs::processConditionals() {
858 SmallDenseMap<Instruction *, Value *> FoundRsrcs;
859 SmallPtrSet<Value *, 4> Roots;
860 SmallPtrSet<Value *, 4> Seen;
861 for (Instruction *I : Conditionals) {
862 // These have to exist by now because we've visited these nodes.
863 Value *Rsrc = RsrcParts[I];
864 Value *Off = OffParts[I];
865 assert(Rsrc && Off && "must have visited conditionals by now");
866
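    // Look for a single resource value that reaches this conditional along
    // every path; if one exists, only the offsets actually need a PHI/select.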
867 std::optional<Value *> MaybeRsrc;
868 auto MaybeFoundRsrc = FoundRsrcs.find(Val: I);
869 if (MaybeFoundRsrc != FoundRsrcs.end()) {
870 MaybeRsrc = MaybeFoundRsrc->second;
871 } else {
872 IRBuilder<>::InsertPointGuard Guard(IRB);
873 Roots.clear();
874 Seen.clear();
875 getPossibleRsrcRoots(I, Roots, Seen);
876 LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n");
877#ifndef NDEBUG
878 for (Value *V : Roots)
879 LLVM_DEBUG(dbgs() << "Root: " << *V << "\n");
880 for (Value *V : Seen)
881 LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n");
882#endif
883 // If we are our own possible root, then we shouldn't block our
884 // replacement with a valid incoming value.
885 Roots.erase(Ptr: I);
886 // We don't want to block the optimization for conditionals that don't
887 // refer to themselves but did see themselves during the traversal.
888 Seen.erase(Ptr: I);
889
890 if (set_is_subset(S1: Seen, S2: Roots)) {
891 auto Diff = set_difference(S1: Roots, S2: Seen);
892 if (Diff.size() == 1) {
893 Value *RootVal = *Diff.begin();
894 // Handle the case where previous loops already looked through
895 // an addrspacecast.
896 if (isSplitFatPtr(Ty: RootVal->getType()))
897 MaybeRsrc = std::get<0>(in: getPtrParts(V: RootVal));
898 else
899 MaybeRsrc = RootVal;
900 }
901 }
902 }
903
904 if (auto *PHI = dyn_cast<PHINode>(Val: I)) {
905 Value *NewRsrc;
906 StructType *PHITy = cast<StructType>(Val: PHI->getType());
907 IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef());
908 IRB.SetCurrentDebugLocation(PHI->getDebugLoc());
909 if (MaybeRsrc) {
910 NewRsrc = *MaybeRsrc;
911 } else {
912 Type *RsrcTy = PHITy->getElementType(N: 0);
913 auto *RsrcPHI = IRB.CreatePHI(Ty: RsrcTy, NumReservedValues: PHI->getNumIncomingValues());
914 RsrcPHI->takeName(V: Rsrc);
915 for (auto [V, BB] : llvm::zip(t: PHI->incoming_values(), u: PHI->blocks())) {
916 Value *VRsrc = std::get<0>(in: getPtrParts(V));
917 RsrcPHI->addIncoming(V: VRsrc, BB);
918 }
919 copyMetadata(Dest: RsrcPHI, Src: PHI);
920 NewRsrc = RsrcPHI;
921 }
922
923 Type *OffTy = PHITy->getElementType(N: 1);
924 auto *NewOff = IRB.CreatePHI(Ty: OffTy, NumReservedValues: PHI->getNumIncomingValues());
925 NewOff->takeName(V: Off);
926 for (auto [V, BB] : llvm::zip(t: PHI->incoming_values(), u: PHI->blocks())) {
927 assert(OffParts.count(V) && "An offset part had to be created by now");
928 Value *VOff = std::get<1>(in: getPtrParts(V));
929 NewOff->addIncoming(V: VOff, BB);
930 }
931 copyMetadata(Dest: NewOff, Src: PHI);
932
      // Note: We don't eraseFromParent() the temporaries because we don't want
      // to put the correction maps in an inconsistent state. That'll be handled
      // during the rest of the killing. Also, `ValueToValueMapTy` guarantees
      // that references in that map will be updated as well.
937 ConditionalTemps.push_back(Elt: cast<Instruction>(Val: Rsrc));
938 ConditionalTemps.push_back(Elt: cast<Instruction>(Val: Off));
939 Rsrc->replaceAllUsesWith(V: NewRsrc);
940 Off->replaceAllUsesWith(V: NewOff);
941
942 // Save on recomputing the cycle traversals in known-root cases.
943 if (MaybeRsrc)
944 for (Value *V : Seen)
945 FoundRsrcs[cast<Instruction>(Val: V)] = NewRsrc;
946 } else if (isa<SelectInst>(Val: I)) {
947 if (MaybeRsrc) {
948 ConditionalTemps.push_back(Elt: cast<Instruction>(Val: Rsrc));
949 Rsrc->replaceAllUsesWith(V: *MaybeRsrc);
950 for (Value *V : Seen)
951 FoundRsrcs[cast<Instruction>(Val: V)] = *MaybeRsrc;
952 }
953 } else {
954 llvm_unreachable("Only PHIs and selects go in the conditionals list");
955 }
956 }
957}
958
959void SplitPtrStructs::killAndReplaceSplitInstructions(
960 SmallVectorImpl<Instruction *> &Origs) {
961 for (Instruction *I : ConditionalTemps)
962 I->eraseFromParent();
963
964 for (Instruction *I : Origs) {
965 if (!SplitUsers.contains(V: I))
966 continue;
967
968 SmallVector<DbgValueInst *> Dbgs;
969 findDbgValues(DbgValues&: Dbgs, V: I);
970 for (auto *Dbg : Dbgs) {
971 IRB.SetInsertPoint(Dbg);
972 auto &DL = I->getDataLayout();
973 assert(isSplitFatPtr(I->getType()) &&
974 "We should've RAUW'd away loads, stores, etc. at this point");
975 auto *OffDbg = cast<DbgValueInst>(Val: Dbg->clone());
976 copyMetadata(Dest: OffDbg, Src: Dbg);
977 auto [Rsrc, Off] = getPtrParts(V: I);
978
979 int64_t RsrcSz = DL.getTypeSizeInBits(Ty: Rsrc->getType());
980 int64_t OffSz = DL.getTypeSizeInBits(Ty: Off->getType());
981
982 std::optional<DIExpression *> RsrcExpr =
983 DIExpression::createFragmentExpression(Expr: Dbg->getExpression(), OffsetInBits: 0,
984 SizeInBits: RsrcSz);
985 std::optional<DIExpression *> OffExpr =
986 DIExpression::createFragmentExpression(Expr: Dbg->getExpression(), OffsetInBits: RsrcSz,
987 SizeInBits: OffSz);
988 if (OffExpr) {
989 OffDbg->setExpression(*OffExpr);
990 OffDbg->replaceVariableLocationOp(OldValue: I, NewValue: Off);
991 IRB.Insert(I: OffDbg);
992 } else {
993 OffDbg->deleteValue();
994 }
995 if (RsrcExpr) {
996 Dbg->setExpression(*RsrcExpr);
997 Dbg->replaceVariableLocationOp(OldValue: I, NewValue: Rsrc);
998 } else {
999 Dbg->replaceVariableLocationOp(OldValue: I, NewValue: UndefValue::get(T: I->getType()));
1000 }
1001 }
1002
1003 Value *Poison = PoisonValue::get(T: I->getType());
1004 I->replaceUsesWithIf(New: Poison, ShouldReplace: [&](const Use &U) -> bool {
1005 if (const auto *UI = dyn_cast<Instruction>(Val: U.getUser()))
1006 return SplitUsers.contains(V: UI);
1007 return false;
1008 });
1009
1010 if (I->use_empty()) {
1011 I->eraseFromParent();
1012 continue;
1013 }
1014 IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
1015 IRB.SetCurrentDebugLocation(I->getDebugLoc());
1016 auto [Rsrc, Off] = getPtrParts(V: I);
1017 Value *Struct = PoisonValue::get(T: I->getType());
1018 Struct = IRB.CreateInsertValue(Agg: Struct, Val: Rsrc, Idxs: 0);
1019 Struct = IRB.CreateInsertValue(Agg: Struct, Val: Off, Idxs: 1);
1020 copyMetadata(Dest: Struct, Src: I);
1021 Struct->takeName(V: I);
1022 I->replaceAllUsesWith(V: Struct);
1023 I->eraseFromParent();
1024 }
1025}
1026
1027void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) {
1028 LLVMContext &Ctx = Intr->getContext();
1029 Intr->addParamAttr(ArgNo: RsrcArgIdx, Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: A));
1030}
1031
1032void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order,
1033 SyncScope::ID SSID) {
1034 switch (Order) {
1035 case AtomicOrdering::Release:
1036 case AtomicOrdering::AcquireRelease:
1037 case AtomicOrdering::SequentiallyConsistent:
1038 IRB.CreateFence(Ordering: AtomicOrdering::Release, SSID);
1039 break;
1040 default:
1041 break;
1042 }
1043}
1044
1045void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order,
1046 SyncScope::ID SSID) {
1047 switch (Order) {
1048 case AtomicOrdering::Acquire:
1049 case AtomicOrdering::AcquireRelease:
1050 case AtomicOrdering::SequentiallyConsistent:
1051 IRB.CreateFence(Ordering: AtomicOrdering::Acquire, SSID);
1052 break;
1053 default:
1054 break;
1055 }
1056}
1057
1058Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
1059 Type *Ty, Align Alignment,
1060 AtomicOrdering Order, bool IsVolatile,
1061 SyncScope::ID SSID) {
1062 IRB.SetInsertPoint(I);
1063
1064 auto [Rsrc, Off] = getPtrParts(V: Ptr);
1065 SmallVector<Value *, 5> Args;
1066 if (Arg)
1067 Args.push_back(Elt: Arg);
1068 Args.push_back(Elt: Rsrc);
1069 Args.push_back(Elt: Off);
1070 insertPreMemOpFence(Order, SSID);
  // soffset is always 0 for these cases, where we always want any offset to be
  // part of bounds checking and we don't know which parts of the GEPs are
  // uniform.
1074 Args.push_back(Elt: IRB.getInt32(C: 0));
1075
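  // Compute the cache-policy (cpol) operand for the buffer intrinsic from the
  // operation's atomicity, volatility, and metadata.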
1076 uint32_t Aux = 0;
1077 bool IsInvariant =
1078 (isa<LoadInst>(Val: I) && I->getMetadata(KindID: LLVMContext::MD_invariant_load));
1079 bool IsNonTemporal = I->getMetadata(KindID: LLVMContext::MD_nontemporal);
1080 // Atomic loads and stores need glc, atomic read-modify-write doesn't.
1081 bool IsOneWayAtomic =
1082 !isa<AtomicRMWInst>(Val: I) && Order != AtomicOrdering::NotAtomic;
1083 if (IsOneWayAtomic)
1084 Aux |= AMDGPU::CPol::GLC;
1085 if (IsNonTemporal && !IsInvariant)
1086 Aux |= AMDGPU::CPol::SLC;
1087 if (isa<LoadInst>(Val: I) && ST->getGeneration() == AMDGPUSubtarget::GFX10)
1088 Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0);
1089 if (IsVolatile)
1090 Aux |= AMDGPU::CPol::VOLATILE;
1091 Args.push_back(Elt: IRB.getInt32(C: Aux));
1092
1093 Intrinsic::ID IID = Intrinsic::not_intrinsic;
1094 if (isa<LoadInst>(Val: I))
1095 IID = Order == AtomicOrdering::NotAtomic
1096 ? Intrinsic::amdgcn_raw_ptr_buffer_load
1097 : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
1098 else if (isa<StoreInst>(Val: I))
1099 IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
1100 else if (auto *RMW = dyn_cast<AtomicRMWInst>(Val: I)) {
1101 switch (RMW->getOperation()) {
1102 case AtomicRMWInst::Xchg:
1103 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap;
1104 break;
1105 case AtomicRMWInst::Add:
1106 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add;
1107 break;
1108 case AtomicRMWInst::Sub:
1109 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub;
1110 break;
1111 case AtomicRMWInst::And:
1112 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and;
1113 break;
1114 case AtomicRMWInst::Or:
1115 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or;
1116 break;
1117 case AtomicRMWInst::Xor:
1118 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor;
1119 break;
1120 case AtomicRMWInst::Max:
1121 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax;
1122 break;
1123 case AtomicRMWInst::Min:
1124 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin;
1125 break;
1126 case AtomicRMWInst::UMax:
1127 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax;
1128 break;
1129 case AtomicRMWInst::UMin:
1130 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin;
1131 break;
1132 case AtomicRMWInst::FAdd:
1133 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd;
1134 break;
1135 case AtomicRMWInst::FMax:
1136 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax;
1137 break;
1138 case AtomicRMWInst::FMin:
1139 IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
1140 break;
1141 case AtomicRMWInst::FSub: {
1142 report_fatal_error(reason: "atomic floating point subtraction not supported for "
1143 "buffer resources and should've been expanded away");
1144 break;
1145 }
1146 case AtomicRMWInst::Nand:
1147 report_fatal_error(reason: "atomic nand not supported for buffer resources and "
1148 "should've been expanded away");
1149 break;
1150 case AtomicRMWInst::UIncWrap:
1151 case AtomicRMWInst::UDecWrap:
1152 report_fatal_error(reason: "wrapping increment/decrement not supported for "
1153 "buffer resources and should've ben expanded away");
1154 break;
1155 case AtomicRMWInst::BAD_BINOP:
1156 llvm_unreachable("Not sure how we got a bad binop");
1157 }
1158 }
1159
1160 auto *Call = IRB.CreateIntrinsic(ID: IID, Types: Ty, Args);
1161 copyMetadata(Dest: Call, Src: I);
1162 setAlign(Intr: Call, A: Alignment, RsrcArgIdx: Arg ? 1 : 0);
1163 Call->takeName(V: I);
1164
1165 insertPostMemOpFence(Order, SSID);
1166 // The "no moving p7 directly" rewrites ensure that this load or store won't
1167 // itself need to be split into parts.
1168 SplitUsers.insert(V: I);
1169 I->replaceAllUsesWith(V: Call);
1170 return Call;
1171}
1172
1173PtrParts SplitPtrStructs::visitInstruction(Instruction &I) {
1174 return {nullptr, nullptr};
1175}
1176
1177PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) {
1178 if (!isSplitFatPtr(Ty: LI.getPointerOperandType()))
1179 return {nullptr, nullptr};
1180 handleMemoryInst(I: &LI, Arg: nullptr, Ptr: LI.getPointerOperand(), Ty: LI.getType(),
1181 Alignment: LI.getAlign(), Order: LI.getOrdering(), IsVolatile: LI.isVolatile(),
1182 SSID: LI.getSyncScopeID());
1183 return {nullptr, nullptr};
1184}
1185
1186PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) {
1187 if (!isSplitFatPtr(Ty: SI.getPointerOperandType()))
1188 return {nullptr, nullptr};
1189 Value *Arg = SI.getValueOperand();
1190 handleMemoryInst(I: &SI, Arg, Ptr: SI.getPointerOperand(), Ty: Arg->getType(),
1191 Alignment: SI.getAlign(), Order: SI.getOrdering(), IsVolatile: SI.isVolatile(),
1192 SSID: SI.getSyncScopeID());
1193 return {nullptr, nullptr};
1194}
1195
1196PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) {
1197 if (!isSplitFatPtr(Ty: AI.getPointerOperand()->getType()))
1198 return {nullptr, nullptr};
1199 Value *Arg = AI.getValOperand();
1200 handleMemoryInst(I: &AI, Arg, Ptr: AI.getPointerOperand(), Ty: Arg->getType(),
1201 Alignment: AI.getAlign(), Order: AI.getOrdering(), IsVolatile: AI.isVolatile(),
1202 SSID: AI.getSyncScopeID());
1203 return {nullptr, nullptr};
1204}
1205
// Unlike load, store, and RMW, cmpxchg needs special handling to account
// for the boolean success value in its return type.
1208PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) {
1209 Value *Ptr = AI.getPointerOperand();
1210 if (!isSplitFatPtr(Ty: Ptr->getType()))
1211 return {nullptr, nullptr};
1212 IRB.SetInsertPoint(&AI);
1213
1214 Type *Ty = AI.getNewValOperand()->getType();
1215 AtomicOrdering Order = AI.getMergedOrdering();
1216 SyncScope::ID SSID = AI.getSyncScopeID();
1217 bool IsNonTemporal = AI.getMetadata(KindID: LLVMContext::MD_nontemporal);
1218
1219 auto [Rsrc, Off] = getPtrParts(V: Ptr);
1220 insertPreMemOpFence(Order, SSID);
1221
1222 uint32_t Aux = 0;
1223 if (IsNonTemporal)
1224 Aux |= AMDGPU::CPol::SLC;
1225 if (AI.isVolatile())
1226 Aux |= AMDGPU::CPol::VOLATILE;
1227 auto *Call =
1228 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Types: Ty,
1229 Args: {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc,
1230 Off, IRB.getInt32(C: 0), IRB.getInt32(C: Aux)});
1231 copyMetadata(Dest: Call, Src: &AI);
1232 setAlign(Intr: Call, A: AI.getAlign(), RsrcArgIdx: 2);
1233 Call->takeName(V: &AI);
1234 insertPostMemOpFence(Order, SSID);
1235
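  // Rebuild the {old value, success} struct that cmpxchg returns. For a
  // non-weak cmpxchg, the success bit is recomputed by comparing the value the
  // intrinsic returned against the expected operand.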
1236 Value *Res = PoisonValue::get(T: AI.getType());
1237 Res = IRB.CreateInsertValue(Agg: Res, Val: Call, Idxs: 0);
1238 if (!AI.isWeak()) {
1239 Value *Succeeded = IRB.CreateICmpEQ(LHS: Call, RHS: AI.getCompareOperand());
1240 Res = IRB.CreateInsertValue(Agg: Res, Val: Succeeded, Idxs: 1);
1241 }
1242 SplitUsers.insert(V: &AI);
1243 AI.replaceAllUsesWith(V: Res);
1244 return {nullptr, nullptr};
1245}
1246
1247PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
1248 using namespace llvm::PatternMatch;
1249 Value *Ptr = GEP.getPointerOperand();
1250 if (!isSplitFatPtr(Ty: Ptr->getType()))
1251 return {nullptr, nullptr};
1252 IRB.SetInsertPoint(&GEP);
1253
1254 auto [Rsrc, Off] = getPtrParts(V: Ptr);
1255 const DataLayout &DL = GEP.getDataLayout();
1256 bool InBounds = GEP.isInBounds();
1257
1258 // In order to call emitGEPOffset() and thus not have to reimplement it,
1259 // we need the GEP result to have ptr addrspace(7) type.
1260 Type *FatPtrTy = IRB.getPtrTy(AddrSpace: AMDGPUAS::BUFFER_FAT_POINTER);
1261 if (auto *VT = dyn_cast<VectorType>(Val: Off->getType()))
1262 FatPtrTy = VectorType::get(ElementType: FatPtrTy, EC: VT->getElementCount());
1263 GEP.mutateType(Ty: FatPtrTy);
1264 Value *OffAccum = emitGEPOffset(Builder: &IRB, DL, GEP: &GEP);
1265 GEP.mutateType(Ty: Ptr->getType());
1266 if (match(V: OffAccum, P: m_Zero())) { // Constant-zero offset
1267 SplitUsers.insert(V: &GEP);
1268 return {Rsrc, Off};
1269 }
1270
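  // When the GEP is inbounds and the accumulated offset is a known
  // non-negative constant, the offset add below is marked nuw.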
1271 bool HasNonNegativeOff = false;
1272 if (auto *CI = dyn_cast<ConstantInt>(Val: OffAccum)) {
1273 HasNonNegativeOff = !CI->isNegative();
1274 }
1275 Value *NewOff;
1276 if (match(V: Off, P: m_Zero())) {
1277 NewOff = OffAccum;
1278 } else {
1279 NewOff = IRB.CreateAdd(LHS: Off, RHS: OffAccum, Name: "",
1280 /*hasNUW=*/HasNUW: InBounds && HasNonNegativeOff,
1281 /*hasNSW=*/HasNSW: false);
1282 }
1283 copyMetadata(Dest: NewOff, Src: &GEP);
1284 NewOff->takeName(V: &GEP);
1285 SplitUsers.insert(V: &GEP);
1286 return {Rsrc, NewOff};
1287}
1288
1289PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
1290 Value *Ptr = PI.getPointerOperand();
1291 if (!isSplitFatPtr(Ty: Ptr->getType()))
1292 return {nullptr, nullptr};
1293 IRB.SetInsertPoint(&PI);
1294
1295 Type *ResTy = PI.getType();
1296 unsigned Width = ResTy->getScalarSizeInBits();
1297
1298 auto [Rsrc, Off] = getPtrParts(V: Ptr);
1299 const DataLayout &DL = PI.getDataLayout();
1300 unsigned FatPtrWidth = DL.getPointerSizeInBits(AS: AMDGPUAS::BUFFER_FAT_POINTER);
1301
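  // Results no wider than 32 bits only need the offset; wider results
  // reconstruct the full fat-pointer bit pattern as (rsrc << 32) | off.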
1302 Value *Res;
1303 if (Width <= BufferOffsetWidth) {
1304 Res = IRB.CreateIntCast(V: Off, DestTy: ResTy, /*isSigned=*/false,
1305 Name: PI.getName() + ".off");
1306 } else {
1307 Value *RsrcInt = IRB.CreatePtrToInt(V: Rsrc, DestTy: ResTy, Name: PI.getName() + ".rsrc");
1308 Value *Shl = IRB.CreateShl(
1309 LHS: RsrcInt,
1310 RHS: ConstantExpr::getIntegerValue(Ty: ResTy, V: APInt(Width, BufferOffsetWidth)),
1311 Name: "", HasNUW: Width >= FatPtrWidth, HasNSW: Width > FatPtrWidth);
1312 Value *OffCast = IRB.CreateIntCast(V: Off, DestTy: ResTy, /*isSigned=*/false,
1313 Name: PI.getName() + ".off");
1314 Res = IRB.CreateOr(LHS: Shl, RHS: OffCast);
1315 }
1316
1317 copyMetadata(Dest: Res, Src: &PI);
1318 Res->takeName(V: &PI);
1319 SplitUsers.insert(V: &PI);
1320 PI.replaceAllUsesWith(V: Res);
1321 return {nullptr, nullptr};
1322}
1323
1324PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
1325 if (!isSplitFatPtr(Ty: IP.getType()))
1326 return {nullptr, nullptr};
1327 IRB.SetInsertPoint(&IP);
1328 const DataLayout &DL = IP.getDataLayout();
1329 unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AS: AMDGPUAS::BUFFER_RESOURCE);
1330 Value *Int = IP.getOperand(i_nocapture: 0);
1331 Type *IntTy = Int->getType();
1332 Type *RsrcIntTy = IntTy->getWithNewBitWidth(NewBitWidth: RsrcPtrWidth);
1333 unsigned Width = IntTy->getScalarSizeInBits();
1334
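  // The resource part comes from the bits above the 32-bit offset; the offset
  // part is the low 32 bits of the integer.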
1335 auto *RetTy = cast<StructType>(Val: IP.getType());
1336 Type *RsrcTy = RetTy->getElementType(N: 0);
1337 Type *OffTy = RetTy->getElementType(N: 1);
1338 Value *RsrcPart = IRB.CreateLShr(
1339 LHS: Int,
1340 RHS: ConstantExpr::getIntegerValue(Ty: IntTy, V: APInt(Width, BufferOffsetWidth)));
1341 Value *RsrcInt = IRB.CreateIntCast(V: RsrcPart, DestTy: RsrcIntTy, /*isSigned=*/false);
1342 Value *Rsrc = IRB.CreateIntToPtr(V: RsrcInt, DestTy: RsrcTy, Name: IP.getName() + ".rsrc");
1343 Value *Off =
1344 IRB.CreateIntCast(V: Int, DestTy: OffTy, /*IsSigned=*/isSigned: false, Name: IP.getName() + ".off");
1345
1346 copyMetadata(Dest: Rsrc, Src: &IP);
1347 SplitUsers.insert(V: &IP);
1348 return {Rsrc, Off};
1349}

PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *In = I.getPointerOperand();
  // No-op casts preserve parts
  if (In->getType() == I.getType()) {
    auto [Rsrc, Off] = getPtrParts(In);
    SplitUsers.insert(&I);
    return {Rsrc, Off};
  }
  if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE)
    report_fatal_error("Only buffer resources (addrspace 8) can be cast to "
                       "buffer fat pointers (addrspace 7)");
  Type *OffTy = cast<StructType>(I.getType())->getElementType(1);
  Value *ZeroOff = Constant::getNullValue(OffTy);
  SplitUsers.insert(&I);
  return {In, ZeroOff};
}
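
// To illustrate (names invented): after lowering,
//   %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
// contributes no new instructions; the resource part of %fat is simply %rsrc
// and the offset part is the constant `i32 0`.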

PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
  Value *Lhs = Cmp.getOperand(0);
  if (!isSplitFatPtr(Lhs->getType()))
    return {nullptr, nullptr};
  Value *Rhs = Cmp.getOperand(1);
  IRB.SetInsertPoint(&Cmp);
  ICmpInst::Predicate Pred = Cmp.getPredicate();

  assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
         "Pointer comparison is only equal or unequal");
  auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
  auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
  Value *RsrcCmp =
      IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
  copyMetadata(RsrcCmp, &Cmp);
  Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
  copyMetadata(OffCmp, &Cmp);

  Value *Res = nullptr;
  if (Pred == ICmpInst::ICMP_EQ)
    Res = IRB.CreateAnd(RsrcCmp, OffCmp);
  else if (Pred == ICmpInst::ICMP_NE)
    Res = IRB.CreateOr(RsrcCmp, OffCmp);
  copyMetadata(Res, &Cmp);
  Res->takeName(&Cmp);
  SplitUsers.insert(&Cmp);
  Cmp.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}
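
// As a sketch (names invented), an equality comparison
//   %eq = icmp eq ptr addrspace(7) %a, %b
// becomes a comparison of both halves:
//   %eq.rsrc = icmp eq ptr addrspace(8) %a.rsrc, %b.rsrc
//   %eq.off = icmp eq i32 %a.off, %b.off
//   %eq = and i1 %eq.rsrc, %eq.off
// while `ne` combines the per-part results with `or` instead.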

PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  auto [Rsrc, Off] = getPtrParts(I.getOperand(0));

  Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getVectorOperand();
  Value *Idx = I.getIndexOperand();
  auto [Rsrc, Off] = getPtrParts(Vec);

  Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) {
  // The mutated instructions temporarily don't return vectors, and so
  // we need the generic getType() here to avoid crashes.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getOperand(0);
  Value *Elem = I.getOperand(1);
  Value *Idx = I.getOperand(2);
  auto [VecRsrc, VecOff] = getPtrParts(Vec);
  auto [ElemRsrc, ElemOff] = getPtrParts(Elem);

  Value *RsrcRes =
      IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) {
  // Cast is needed for the same reason as insertelement's.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);

  Value *V1 = I.getOperand(0);
  Value *V2 = I.getOperand(1);
  ArrayRef<int> Mask = I.getShuffleMask();
  auto [V1Rsrc, V1Off] = getPtrParts(V1);
  auto [V2Rsrc, V2Off] = getPtrParts(V2);

  Value *RsrcRes =
      IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) {
  if (!isSplitFatPtr(PHI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef());
  // Phi nodes will be handled in post-processing after we've visited every
  // instruction. However, instead of just returning {nullptr, nullptr},
  // we explicitly create temporary extractvalue operations to serve as our
  // results so that they end up at the beginning of the block with the PHIs.
  Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc");
  Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off");
  Conditionals.push_back(&PHI);
  SplitUsers.insert(&PHI);
  return {TmpRsrc, TmpOff};
}
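
// Illustratively (invented names), a remapped PHI such as
//   %p = phi { ptr addrspace(8), i32 } [ %a, %bb0 ], [ %b, %bb1 ]
// temporarily gets
//   %p.rsrc = extractvalue { ptr addrspace(8), i32 } %p, 0
//   %p.off = extractvalue { ptr addrspace(8), i32 } %p, 1
// as its split parts; processConditionals() later replaces these with PHIs
// over the resource and offset parts directly.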

PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
  if (!isSplitFatPtr(SI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&SI);

  Value *Cond = SI.getCondition();
  Value *True = SI.getTrueValue();
  Value *False = SI.getFalseValue();
  auto [TrueRsrc, TrueOff] = getPtrParts(True);
  auto [FalseRsrc, FalseOff] = getPtrParts(False);

  Value *RsrcRes = IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc,
                                    SI.getName() + ".rsrc", &SI);
  copyMetadata(RsrcRes, &SI);
  Conditionals.push_back(&SI);
  Value *OffRes = IRB.CreateSelect(Cond, TrueOff, FalseOff,
                                   SI.getName() + ".off", &SI);
  copyMetadata(OffRes, &SI);
  SplitUsers.insert(&SI);
  return {RsrcRes, OffRes};
}

/// Returns true if this intrinsic needs to be removed when it is
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
/// rewritten into calls to versions of that intrinsic on the resource
/// descriptor.
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  default:
    return false;
  case Intrinsic::ptrmask:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
    return true;
  }
}

PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
  Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ptrmask: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    Value *Mask = I.getArgOperand(1);
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    if (Mask->getType() != Off->getType())
      report_fatal_error("offset width is not equal to index width of fat "
                         "pointer (data layout not set up correctly?)");
    Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
    copyMetadata(OffRes, &I);
    SplitUsers.insert(&I);
    return {Rsrc, OffRes};
  }
  // Pointer annotation intrinsics that, given their object-wide nature,
  // operate on the resource part.
  case Intrinsic::invariant_start: {
    Value *Ptr = I.getArgOperand(1);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE);
    auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::invariant_end: {
    Value *RealPtr = I.getArgOperand(2);
    if (!isSplitFatPtr(RealPtr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    Value *RealRsrc = getPtrParts(RealPtr).first;
    Value *InvPtr = I.getArgOperand(0);
    Value *Size = I.getArgOperand(1);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()},
                                         {InvPtr, Size, RealRsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    return {NewRsrc, Off};
  }
  }
  return {nullptr, nullptr};
}
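
// For the ptrmask case above, as a sketch with invented names, a call along
// the lines of
//   %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(
//       ptr addrspace(7) %p, i32 %m)
// keeps the resource part %p.rsrc unchanged and only masks the offset:
//   %masked.off = and i32 %p.off, %m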

void SplitPtrStructs::processFunction(Function &F) {
  ST = &TM->getSubtarget<GCNSubtarget>(F);
  SmallVector<Instruction *, 0> Originals;
  LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName()
                    << "\n");
  for (Instruction &I : instructions(F))
    Originals.push_back(&I);
  for (Instruction *I : Originals) {
    auto [Rsrc, Off] = visit(I);
    assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
           "Can't have a resource but no offset");
    if (Rsrc)
      RsrcParts[I] = Rsrc;
    if (Off)
      OffParts[I] = Off;
  }
  processConditionals();
  killAndReplaceSplitInstructions(Originals);

  // Clean up after ourselves to save on memory.
  RsrcParts.clear();
  OffParts.clear();
  SplitUsers.clear();
  Conditionals.clear();
  ConditionalTemps.clear();
}

namespace {
class AMDGPULowerBufferFatPointers : public ModulePass {
public:
  static char ID;

  AMDGPULowerBufferFatPointers() : ModulePass(ID) {
    initializeAMDGPULowerBufferFatPointersPass(
        *PassRegistry::getPassRegistry());
  }

  bool run(Module &M, const TargetMachine &TM);
  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // namespace

/// Returns true if there are values that have a buffer fat pointer in them,
/// which means we'll need to perform rewrites on this function. As a side
/// effect, this will populate the type remapping cache.
static bool containsBufferFatPointers(const Function &F,
                                      BufferFatPtrToStructTypeMap *TypeMap) {
  bool HasFatPointers = false;
  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB)
      HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
  return HasFatPointers;
}

static bool hasFatPointerInterface(const Function &F,
                                   BufferFatPtrToStructTypeMap *TypeMap) {
  Type *Ty = F.getFunctionType();
  return Ty != TypeMap->remapType(Ty);
}

/// Move the body of `OldF` into a new function, returning it.
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
                                          ValueToValueMapTy &CloneMap) {
  bool IsIntrinsic = OldF->isIntrinsic();
  Function *NewF =
      Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
  NewF->copyAttributesFrom(OldF);
  NewF->copyMetadata(OldF, 0);
  NewF->takeName(OldF);
  NewF->updateAfterNameChange();
  NewF->setDLLStorageClass(OldF->getDLLStorageClass());
  OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF);

  while (!OldF->empty()) {
    BasicBlock *BB = &OldF->front();
    BB->removeFromParent();
    BB->insertInto(NewF);
    CloneMap[BB] = BB;
    for (Instruction &I : *BB) {
      CloneMap[&I] = &I;
    }
  }

  AttributeMask PtrOnlyAttrs;
  for (auto K :
       {Attribute::Dereferenceable, Attribute::DereferenceableOrNull,
        Attribute::NoAlias, Attribute::NoCapture, Attribute::NoFree,
        Attribute::NonNull, Attribute::NullPointerIsValid, Attribute::ReadNone,
        Attribute::ReadOnly, Attribute::WriteOnly}) {
    PtrOnlyAttrs.addAttribute(K);
  }
  SmallVector<AttributeSet> ArgAttrs;
  AttributeList OldAttrs = OldF->getAttributes();

  for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) {
    CloneMap[&NewArg] = &OldArg;
    NewArg.takeName(&OldArg);
    Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType();
    // Temporarily mutate type of `NewArg` to allow RAUW to work.
    NewArg.mutateType(OldArgTy);
    OldArg.replaceAllUsesWith(&NewArg);
    NewArg.mutateType(NewArgTy);

    AttributeSet ArgAttr = OldAttrs.getParamAttrs(I);
    // Intrinsics get their attributes fixed later.
    if (OldArgTy != NewArgTy && !IsIntrinsic)
      ArgAttr = ArgAttr.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
    ArgAttrs.push_back(ArgAttr);
  }
  AttributeSet RetAttrs = OldAttrs.getRetAttrs();
  if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic)
    RetAttrs = RetAttrs.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
  NewF->setAttributes(AttributeList::get(
      NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs));
  return NewF;
}
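
// As an example (hypothetical function, not from any test), a declaration
//   declare void @use(ptr addrspace(7) %p)
// would be moved into a new function whose remapped signature is
//   declare void @use({ ptr addrspace(8), i32 } %p)
// with pointer-only attributes dropped from the rewritten argument.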

static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) {
  for (Argument &A : F->args())
    CloneMap[&A] = &A;
  for (BasicBlock &BB : *F) {
    CloneMap[&BB] = &BB;
    for (Instruction &I : BB)
      CloneMap[&I] = &I;
  }
}

bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // Record the functions which need to be remapped.
  // The second element of the pair indicates whether the function has to have
  // its arguments or return types adjusted.
  SmallVector<std::pair<Function *, bool>> NeedsRemap;

  BufferFatPtrToStructTypeMap StructTM(DL);
  BufferFatPtrToIntTypeMap IntTM(DL);
  for (const GlobalVariable &GV : M.globals()) {
    if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
      report_fatal_error("Global variables with a buffer fat pointer address "
                         "space (7) are not supported");
    Type *VT = GV.getValueType();
    if (VT != StructTM.remapType(VT))
      report_fatal_error("Global variables that contain buffer fat pointers "
                         "(address space 7 pointers) are unsupported. Use "
                         "buffer resource pointers (address space 8) instead.");
  }

  {
    // Collect all constant exprs and aggregates referenced by any function.
    SmallVector<Constant *, 8> Worklist;
    for (Function &F : M.functions())
      for (Instruction &I : instructions(F))
        for (Value *Op : I.operands())
          if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
            Worklist.push_back(cast<Constant>(Op));

    // Recursively look for any referenced buffer pointer constants.
    SmallPtrSet<Constant *, 8> Visited;
    SetVector<Constant *> BufferFatPtrConsts;
    while (!Worklist.empty()) {
      Constant *C = Worklist.pop_back_val();
      if (!Visited.insert(C).second)
        continue;
      if (isBufferFatPtrOrVector(C->getType()))
        BufferFatPtrConsts.insert(C);
      for (Value *Op : C->operands())
        if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
          Worklist.push_back(cast<Constant>(Op));
    }

    // Expand all constant expressions using fat buffer pointers to
    // instructions.
    Changed |= convertUsersOfConstantsToInstructions(
        BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr,
        /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
  }

  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
  for (Function &F : M.functions()) {
    bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
    bool BodyChanges = containsBufferFatPointers(F, &StructTM);
    Changed |= MemOpsRewrite.processFunction(F);
    if (InterfaceChange || BodyChanges)
      NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
  }
  if (NeedsRemap.empty())
    return Changed;

  SmallVector<Function *> NeedsPostProcess;
  SmallVector<Function *> Intrinsics;
  // Keep one big map so as to memoize constants across functions.
  ValueToValueMapTy CloneMap;
  FatPtrConstMaterializer Materializer(&StructTM, CloneMap);

  ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
  for (auto [F, InterfaceChange] : NeedsRemap) {
    Function *NewF = F;
    if (InterfaceChange)
      NewF = moveFunctionAdaptingType(
          F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())),
          CloneMap);
    else
      makeCloneInPlaceMap(F, CloneMap);
    LowerInFuncs.remapFunction(*NewF);
    if (NewF->isIntrinsic())
      Intrinsics.push_back(NewF);
    else
      NeedsPostProcess.push_back(NewF);
    if (InterfaceChange) {
      F->replaceAllUsesWith(NewF);
      F->eraseFromParent();
    }
    Changed = true;
  }
  StructTM.clear();
  IntTM.clear();
  CloneMap.clear();

  SplitPtrStructs Splitter(M.getContext(), &TM);
  for (Function *F : NeedsPostProcess)
    Splitter.processFunction(*F);
  for (Function *F : Intrinsics) {
    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
      F->eraseFromParent();
    } else {
      std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
      if (NewF)
        F->replaceAllUsesWith(*NewF);
    }
  }
  return Changed;
}

bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return run(M, TM);
}

char AMDGPULowerBufferFatPointers::ID = 0;

char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID;

void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
}

#define PASS_DESC "Lower buffer fat pointer operations to buffer resources"
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
                    false)
#undef PASS_DESC

ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() {
  return new AMDGPULowerBufferFatPointers();
}

PreservedAnalyses
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) {
  return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none()
                                                   : PreservedAnalyses::all();
}