1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31#include "llvm/CodeGen/GlobalISel/Utils.h"
32#include "llvm/CodeGen/TargetOpcodes.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(Val: false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
59 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
66 return LLT::scalar(SizeInBits: Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
155 break;
156 }
157 return std::pair(TypeIdx,
158 LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: Ty.getElementType()));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(SizeInBits: 128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(SizeInBits: Size);
183 }
184
185 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
227 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
228 Size <= MaxRegisterSize;
229}
230
231static bool isRegisterVectorElementType(LLT EltTy) {
232 const int EltSize = EltTy.getSizeInBits();
233 return EltSize == 16 || EltSize % 32 == 0;
234}
235
236static bool isRegisterVectorType(LLT Ty) {
237 const int EltSize = Ty.getElementType().getSizeInBits();
238 return EltSize == 32 || EltSize == 64 ||
239 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
240 EltSize == 128 || EltSize == 256;
241}
242
243// TODO: replace all uses of isRegisterType with isRegisterClassType
244static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
245 if (!isRegisterSize(ST, Size: Ty.getSizeInBits()))
246 return false;
247
248 if (Ty.isVector())
249 return isRegisterVectorType(Ty);
250
251 return true;
252}
253
254// Any combination of 32 or 64-bit elements up the maximum register size, and
255// multiples of v2s16.
256static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
257 unsigned TypeIdx) {
258 return [=, &ST](const LegalityQuery &Query) {
259 return isRegisterType(ST, Ty: Query.Types[TypeIdx]);
260 };
261}
262
263// RegisterType that doesn't have a corresponding RegClass.
264// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
265// should be removed.
266static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
267 unsigned TypeIdx) {
268 return [=, &ST](const LegalityQuery &Query) {
269 LLT Ty = Query.Types[TypeIdx];
270 return isRegisterType(ST, Ty) &&
271 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
272 };
273}
274
275static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
276 return [=](const LegalityQuery &Query) {
277 const LLT QueryTy = Query.Types[TypeIdx];
278 if (!QueryTy.isVector())
279 return false;
280 const LLT EltTy = QueryTy.getElementType();
281 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
282 };
283}
284
285constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
286constexpr LLT S8 = LLT::scalar(SizeInBits: 8);
287constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
288constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
289constexpr LLT F32 = LLT::float32();
290constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
291constexpr LLT F64 = LLT::float64();
292constexpr LLT S96 = LLT::scalar(SizeInBits: 96);
293constexpr LLT S128 = LLT::scalar(SizeInBits: 128);
294constexpr LLT S160 = LLT::scalar(SizeInBits: 160);
295constexpr LLT S192 = LLT::scalar(SizeInBits: 192);
296constexpr LLT S224 = LLT::scalar(SizeInBits: 224);
297constexpr LLT S256 = LLT::scalar(SizeInBits: 256);
298constexpr LLT S512 = LLT::scalar(SizeInBits: 512);
299constexpr LLT S1024 = LLT::scalar(SizeInBits: 1024);
300constexpr LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
301
302constexpr LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
303constexpr LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
304constexpr LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
305constexpr LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
306constexpr LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
307constexpr LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
308constexpr LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
309constexpr LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
310
311constexpr LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::float16());
312constexpr LLT V2BF16 = V2F16; // FIXME
313
314constexpr LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
315constexpr LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
316constexpr LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
317constexpr LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
318constexpr LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
319constexpr LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
320constexpr LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
321constexpr LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
322constexpr LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
323constexpr LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
324constexpr LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
325constexpr LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
326constexpr LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
327
328constexpr LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
329constexpr LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
330constexpr LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
331constexpr LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
332constexpr LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
333constexpr LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
334constexpr LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
335constexpr LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
336
337constexpr LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
338constexpr LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
339
340constexpr std::initializer_list<LLT> AllScalarTypes = {
341 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
342
343constexpr std::initializer_list<LLT> AllS16Vectors{
344 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
345
346constexpr std::initializer_list<LLT> AllS32Vectors = {
347 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
348 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
349
350constexpr std::initializer_list<LLT> AllS64Vectors = {
351 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
352
353constexpr std::initializer_list<LLT> AllVectors{
354 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
355 V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
356 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
357 V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
358
359// Checks whether a type is in the list of legal register types.
360static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
361 if (Ty.isPointerOrPointerVector())
362 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
363
364 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
365 is_contained(Set: AllScalarTypes, Element: Ty) ||
366 (ST.useRealTrue16Insts() && Ty == S16) ||
367 is_contained(Set: AllS16Vectors, Element: Ty);
368}
369
370static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
371 unsigned TypeIdx) {
372 return [&ST, TypeIdx](const LegalityQuery &Query) {
373 return isRegisterClassType(ST, Ty: Query.Types[TypeIdx]);
374 };
375}
376
377// If we have a truncating store or an extending load with a data size larger
378// than 32-bits, we need to reduce to a 32-bit type.
379static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
380 return [=](const LegalityQuery &Query) {
381 const LLT Ty = Query.Types[TypeIdx];
382 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
383 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
384 };
385}
386
387// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
388// handle some operations by just promoting the register during
389// selection. There are also d16 loads on GFX9+ which preserve the high bits.
390static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
391 bool IsLoad, bool IsAtomic) {
392 switch (AS) {
393 case AMDGPUAS::PRIVATE_ADDRESS:
394 // FIXME: Private element size.
395 return ST.enableFlatScratch() ? 128 : 32;
396 case AMDGPUAS::LOCAL_ADDRESS:
397 return ST.useDS128() ? 128 : 64;
398 case AMDGPUAS::GLOBAL_ADDRESS:
399 case AMDGPUAS::CONSTANT_ADDRESS:
400 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
401 case AMDGPUAS::BUFFER_RESOURCE:
402 // Treat constant and global as identical. SMRD loads are sometimes usable for
403 // global loads (ideally constant address space should be eliminated)
404 // depending on the context. Legality cannot be context dependent, but
405 // RegBankSelect can split the load as necessary depending on the pointer
406 // register bank/uniformity and if the memory is invariant or not written in a
407 // kernel.
408 return IsLoad ? 512 : 128;
409 default:
410 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
411 // if they may alias scratch depending on the subtarget. This needs to be
412 // moved to custom handling to use addressMayBeAccessedAsPrivate
413 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
414 }
415}
416
417static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
418 const LegalityQuery &Query) {
419 const LLT Ty = Query.Types[0];
420
421 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
422 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
423
424 unsigned RegSize = Ty.getSizeInBits();
425 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
426 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
427 unsigned AS = Query.Types[1].getAddressSpace();
428
429 // All of these need to be custom lowered to cast the pointer operand.
430 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
431 return false;
432
433 // Do not handle extending vector loads.
434 if (Ty.isVector() && MemSize != RegSize)
435 return false;
436
437 // TODO: We should be able to widen loads if the alignment is high enough, but
438 // we also need to modify the memory access size.
439#if 0
440 // Accept widening loads based on alignment.
441 if (IsLoad && MemSize < Size)
442 MemSize = std::max(MemSize, Align);
443#endif
444
445 // Only 1-byte and 2-byte to 32-bit extloads are valid.
446 if (MemSize != RegSize && RegSize != 32)
447 return false;
448
449 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
450 IsAtomic: Query.MMODescrs[0].Ordering !=
451 AtomicOrdering::NotAtomic))
452 return false;
453
454 switch (MemSize) {
455 case 8:
456 case 16:
457 case 32:
458 case 64:
459 case 128:
460 break;
461 case 96:
462 if (!ST.hasDwordx3LoadStores())
463 return false;
464 break;
465 case 256:
466 case 512:
467 // These may contextually need to be broken down.
468 break;
469 default:
470 return false;
471 }
472
473 assert(RegSize >= MemSize);
474
475 if (AlignBits < MemSize) {
476 const SITargetLowering *TLI = ST.getTargetLowering();
477 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
478 Alignment: Align(AlignBits / 8)))
479 return false;
480 }
481
482 return true;
483}
484
485// The newer buffer intrinsic forms take their resource arguments as
486// pointers in address space 8, aka s128 values. However, in order to not break
487// SelectionDAG, the underlying operations have to continue to take v4i32
488// arguments. Therefore, we convert resource pointers - or vectors of them
489// to integer values here.
490static bool hasBufferRsrcWorkaround(const LLT Ty) {
491 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
492 return true;
493 if (Ty.isVector()) {
494 const LLT ElemTy = Ty.getElementType();
495 return hasBufferRsrcWorkaround(Ty: ElemTy);
496 }
497 return false;
498}
499
500// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
501// workaround this. Eventually it should ignore the type for loads and only care
502// about the size. Return true in cases where we will workaround this for now by
503// bitcasting.
504static bool loadStoreBitcastWorkaround(const LLT Ty) {
505 if (EnableNewLegality)
506 return false;
507
508 const unsigned Size = Ty.getSizeInBits();
509 if (Ty.isPointerVector())
510 return true;
511 if (Size <= 64)
512 return false;
513 // Address space 8 pointers get their own workaround.
514 if (hasBufferRsrcWorkaround(Ty))
515 return false;
516 if (!Ty.isVector())
517 return true;
518
519 unsigned EltSize = Ty.getScalarSizeInBits();
520 return EltSize != 32 && EltSize != 64;
521}
522
523static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
524 const LLT Ty = Query.Types[0];
525 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
526 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
527}
528
529/// Return true if a load or store of the type should be lowered with a bitcast
530/// to a different type.
531static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
532 const LLT MemTy) {
533 const unsigned MemSizeInBits = MemTy.getSizeInBits();
534 const unsigned Size = Ty.getSizeInBits();
535 if (Size != MemSizeInBits)
536 return Size <= 32 && Ty.isVector();
537
538 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
539 return true;
540
541 // Don't try to handle bitcasting vector ext loads for now.
542 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
543 (Size <= 32 || isRegisterSize(ST, Size)) &&
544 !isRegisterVectorElementType(EltTy: Ty.getElementType());
545}
546
547/// Return true if we should legalize a load by widening an odd sized memory
548/// access up to the alignment. Note this case when the memory access itself
549/// changes, not the size of the result register.
550static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
551 uint64_t AlignInBits, unsigned AddrSpace,
552 unsigned Opcode) {
553 unsigned SizeInBits = MemoryTy.getSizeInBits();
554 // We don't want to widen cases that are naturally legal.
555 if (isPowerOf2_32(Value: SizeInBits))
556 return false;
557
558 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
559 // end up widening these for a scalar load during RegBankSelect, if we don't
560 // have 96-bit scalar loads.
561 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
562 return false;
563
564 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
565 return false;
566
567 // A load is known dereferenceable up to the alignment, so it's legal to widen
568 // to it.
569 //
570 // TODO: Could check dereferenceable for less aligned cases.
571 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
572 if (AlignInBits < RoundedSize)
573 return false;
574
575 // Do not widen if it would introduce a slow unaligned load.
576 const SITargetLowering *TLI = ST.getTargetLowering();
577 unsigned Fast = 0;
578 return TLI->allowsMisalignedMemoryAccessesImpl(
579 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
580 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
581 Fast;
582}
583
584static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
585 unsigned Opcode) {
586 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
587 return false;
588
589 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
590 AlignInBits: Query.MMODescrs[0].AlignInBits,
591 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
592}
593
594/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
595/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
596/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
597static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
598 MachineRegisterInfo &MRI, unsigned Idx) {
599 MachineOperand &MO = MI.getOperand(i: Idx);
600
601 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
602
603 // Paranoidly prevent us from doing this multiple times.
604 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
605 return PointerTy;
606
607 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
608 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
609 if (!PointerTy.isVector()) {
610 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
611 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
612 const LLT S32 = LLT::scalar(SizeInBits: 32);
613
614 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
615 std::array<Register, 4> VectorElems;
616 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
617 for (unsigned I = 0; I < NumParts; ++I)
618 VectorElems[I] =
619 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
620 B.buildMergeValues(Res: MO, Ops: VectorElems);
621 MO.setReg(VectorReg);
622 return VectorTy;
623 }
624 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
625 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
626 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
627 B.buildIntToPtr(Dst: MO, Src: Scalar);
628 MO.setReg(BitcastReg);
629
630 return VectorTy;
631}
632
633/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
634/// the form in which the value must be in order to be passed to the low-level
635/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
636/// needed in order to account for the fact that we can't define a register
637/// class for s128 without breaking SelectionDAG.
638static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
639 MachineRegisterInfo &MRI = *B.getMRI();
640 const LLT PointerTy = MRI.getType(Reg: Pointer);
641 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
642 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
643
644 if (!PointerTy.isVector()) {
645 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
646 SmallVector<Register, 4> PointerParts;
647 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
648 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
649 for (unsigned I = 0; I < NumParts; ++I)
650 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
651 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
652 }
653 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
654 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
655}
656
657static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
658 unsigned Idx) {
659 MachineOperand &MO = MI.getOperand(i: Idx);
660
661 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
662 // Paranoidly prevent us from doing this multiple times.
663 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
664 return;
665 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
666}
667
668AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
669 const GCNTargetMachine &TM)
670 : ST(ST_) {
671 using namespace TargetOpcode;
672
673 auto GetAddrSpacePtr = [&TM](unsigned AS) {
674 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
675 };
676
677 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
678 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
679 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
680 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
681 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
682 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
683 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
684 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
685 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
686 const LLT BufferStridedPtr =
687 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
688
689 const LLT CodePtr = FlatPtr;
690
691 const std::initializer_list<LLT> AddrSpaces64 = {
692 GlobalPtr, ConstantPtr, FlatPtr
693 };
694
695 const std::initializer_list<LLT> AddrSpaces32 = {
696 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
697 };
698
699 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
700
701 const std::initializer_list<LLT> FPTypesBase = {
702 S32, S64
703 };
704
705 const std::initializer_list<LLT> FPTypes16 = {
706 S32, S64, S16
707 };
708
709 const std::initializer_list<LLT> FPTypesPK16 = {
710 S32, S64, S16, V2S16
711 };
712
713 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
714
715 // s1 for VCC branches, s32 for SCC branches.
716 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
717
718 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
719 // elements for v3s16
720 getActionDefinitionsBuilder(Opcode: G_PHI)
721 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
722 .legalFor(Types: AllS32Vectors)
723 .legalFor(Types: AllS64Vectors)
724 .legalFor(Types: AddrSpaces64)
725 .legalFor(Types: AddrSpaces32)
726 .legalFor(Types: AddrSpaces128)
727 .legalIf(Predicate: isPointer(TypeIdx: 0))
728 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
729 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
730 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
731 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
732 .scalarize(TypeIdx: 0);
733
734 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
735 // Full set of gfx9 features.
736 if (ST.hasScalarAddSub64()) {
737 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
738 .legalFor(Types: {S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
740 .scalarize(TypeIdx: 0)
741 .minScalar(TypeIdx: 0, Ty: S16)
742 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
743 .maxScalar(TypeIdx: 0, Ty: S32);
744 } else {
745 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
746 .legalFor(Types: {S32, S16, V2S16})
747 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
748 .scalarize(TypeIdx: 0)
749 .minScalar(TypeIdx: 0, Ty: S16)
750 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
751 .maxScalar(TypeIdx: 0, Ty: S32);
752 }
753
754 if (ST.hasScalarSMulU64()) {
755 getActionDefinitionsBuilder(Opcode: G_MUL)
756 .legalFor(Types: {S64, S32, S16, V2S16})
757 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
758 .scalarize(TypeIdx: 0)
759 .minScalar(TypeIdx: 0, Ty: S16)
760 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
761 .custom();
762 } else {
763 getActionDefinitionsBuilder(Opcode: G_MUL)
764 .legalFor(Types: {S32, S16, V2S16})
765 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
766 .scalarize(TypeIdx: 0)
767 .minScalar(TypeIdx: 0, Ty: S16)
768 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
769 .custom();
770 }
771 assert(ST.hasMad64_32());
772
773 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
774 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
775 .minScalarOrElt(TypeIdx: 0, Ty: S16)
776 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
777 .scalarize(TypeIdx: 0)
778 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
779 .lower();
780 } else if (ST.has16BitInsts()) {
781 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
782 .legalFor(Types: {S32, S16})
783 .minScalar(TypeIdx: 0, Ty: S16)
784 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
785 .maxScalar(TypeIdx: 0, Ty: S32)
786 .scalarize(TypeIdx: 0);
787
788 getActionDefinitionsBuilder(Opcode: G_MUL)
789 .legalFor(Types: {S32, S16})
790 .scalarize(TypeIdx: 0)
791 .minScalar(TypeIdx: 0, Ty: S16)
792 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
793 .custom();
794 assert(ST.hasMad64_32());
795
796 // Technically the saturating operations require clamp bit support, but this
797 // was introduced at the same time as 16-bit operations.
798 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
799 .legalFor(Types: {S32, S16}) // Clamp modifier
800 .minScalar(TypeIdx: 0, Ty: S16)
801 .scalarize(TypeIdx: 0)
802 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
803 .lower();
804
805 // We're just lowering this, but it helps get a better result to try to
806 // coerce to the desired type first.
807 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
808 .minScalar(TypeIdx: 0, Ty: S16)
809 .scalarize(TypeIdx: 0)
810 .lower();
811 } else {
812 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
813 .legalFor(Types: {S32})
814 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
815 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
816 .scalarize(TypeIdx: 0);
817
818 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
819 .legalFor(Types: {S32})
820 .scalarize(TypeIdx: 0)
821 .minScalar(TypeIdx: 0, Ty: S32)
822 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
823
824 if (ST.hasMad64_32())
825 Mul.custom();
826 else
827 Mul.maxScalar(TypeIdx: 0, Ty: S32);
828
829 if (ST.hasIntClamp()) {
830 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
831 .legalFor(Types: {S32}) // Clamp modifier.
832 .scalarize(TypeIdx: 0)
833 .minScalarOrElt(TypeIdx: 0, Ty: S32)
834 .lower();
835 } else {
836 // Clamp bit support was added in VI, along with 16-bit operations.
837 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
838 .minScalar(TypeIdx: 0, Ty: S32)
839 .scalarize(TypeIdx: 0)
840 .lower();
841 }
842
843 // FIXME: DAG expansion gets better results. The widening uses the smaller
844 // range values and goes for the min/max lowering directly.
845 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
846 .minScalar(TypeIdx: 0, Ty: S32)
847 .scalarize(TypeIdx: 0)
848 .lower();
849 }
850
851 getActionDefinitionsBuilder(
852 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
853 .customFor(Types: {S32, S64})
854 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
855 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
856 .scalarize(TypeIdx: 0);
857
858 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
859 .legalFor(Types: {S32})
860 .maxScalar(TypeIdx: 0, Ty: S32);
861
862 if (ST.hasVOP3PInsts()) {
863 Mulh
864 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
865 .lowerFor(Types: {V2S8});
866 }
867
868 Mulh
869 .scalarize(TypeIdx: 0)
870 .lower();
871
872 // Report legal for any types we can handle anywhere. For the cases only legal
873 // on the SALU, RegBankSelect will be able to re-legalize.
874 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
875 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
876 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
877 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
878 .fewerElementsIf(
879 Predicate: all(P0: vectorWiderThan(TypeIdx: 0, Size: 64), P1: scalarOrEltNarrowerThan(TypeIdx: 0, Size: 64)),
880 Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
881 .widenScalarToNextPow2(TypeIdx: 0)
882 .scalarize(TypeIdx: 0);
883
884 getActionDefinitionsBuilder(
885 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
886 .legalFor(Types: {{S32, S1}, {S32, S32}})
887 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
888 .scalarize(TypeIdx: 0);
889
890 getActionDefinitionsBuilder(Opcode: G_BITCAST)
891 // Don't worry about the size constraint.
892 .legalIf(Predicate: all(P0: isRegisterClassType(ST, TypeIdx: 0), P1: isRegisterClassType(ST, TypeIdx: 1)))
893 .lower();
894
895 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
896 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
897 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
898 .legalIf(Predicate: isPointer(TypeIdx: 0))
899 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
900 .widenScalarToNextPow2(TypeIdx: 0);
901
902 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
903 .legalFor(Types: {S32, S64, S16})
904 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
905
906 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
907 .legalIf(Predicate: isRegisterClassType(ST, TypeIdx: 0))
908 // s1 and s16 are special cases because they have legal operations on
909 // them, but don't really occupy registers in the normal way.
910 .legalFor(Types: {S1, S16})
911 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
912 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
913 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
914 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
915 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
916
917 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
918
919 // If the amount is divergent, we have to do a wave reduction to get the
920 // maximum value, so this is expanded during RegBankSelect.
921 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
922 .legalFor(Types: {{PrivatePtr, S32}});
923
924 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
925 .customFor(Types: {PrivatePtr});
926 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
927 .legalFor(Types: {PrivatePtr});
928
929 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
930
931 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
932 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
933
934 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
935
936 auto &FPOpActions = getActionDefinitionsBuilder(
937 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
938 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
939 .legalFor(Types: {S32, S64});
940 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
941 .customFor(Types: {S32, S64});
942 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
943 .customFor(Types: {S32, S64});
944
945 if (ST.has16BitInsts()) {
946 if (ST.hasVOP3PInsts())
947 FPOpActions.legalFor(Types: {S16, V2S16});
948 else
949 FPOpActions.legalFor(Types: {S16});
950
951 TrigActions.customFor(Types: {S16});
952 FDIVActions.customFor(Types: {S16});
953 }
954
955 if (ST.hasPackedFP32Ops()) {
956 FPOpActions.legalFor(Types: {V2S32});
957 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
958 }
959
960 auto &MinNumMaxNum = getActionDefinitionsBuilder(
961 Opcodes: {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
962 G_FMAXNUM_IEEE});
963
964 if (ST.hasVOP3PInsts()) {
965 MinNumMaxNum.customFor(Types: FPTypesPK16)
966 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
967 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
968 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
969 .scalarize(TypeIdx: 0);
970 } else if (ST.has16BitInsts()) {
971 MinNumMaxNum.customFor(Types: FPTypes16)
972 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
973 .scalarize(TypeIdx: 0);
974 } else {
975 MinNumMaxNum.customFor(Types: FPTypesBase)
976 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
977 .scalarize(TypeIdx: 0);
978 }
979
980 if (ST.hasVOP3PInsts())
981 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
982
983 FPOpActions
984 .scalarize(TypeIdx: 0)
985 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
986
987 TrigActions
988 .scalarize(TypeIdx: 0)
989 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
990
991 FDIVActions
992 .scalarize(TypeIdx: 0)
993 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
994
995 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
996 .legalFor(Types: FPTypesPK16)
997 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
998 .scalarize(TypeIdx: 0)
999 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1000
1001 if (ST.has16BitInsts()) {
1002 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1003 .legalFor(Types: {S16})
1004 .customFor(Types: {S32, S64})
1005 .scalarize(TypeIdx: 0)
1006 .unsupported();
1007 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1008 .legalFor(Types: {S32, S64, S16})
1009 .scalarize(TypeIdx: 0)
1010 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1011
1012 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1013 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
1014 .scalarize(TypeIdx: 0)
1015 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
1016 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1017 .lower();
1018
1019 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1020 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1021 .scalarize(TypeIdx: 0)
1022 .lower();
1023 } else {
1024 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1025 .customFor(Types: {S32, S64, S16})
1026 .scalarize(TypeIdx: 0)
1027 .unsupported();
1028
1029
1030 if (ST.hasFractBug()) {
1031 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1032 .customFor(Types: {S64})
1033 .legalFor(Types: {S32, S64})
1034 .scalarize(TypeIdx: 0)
1035 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1036 } else {
1037 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1038 .legalFor(Types: {S32, S64})
1039 .scalarize(TypeIdx: 0)
1040 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1041 }
1042
1043 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1044 .legalFor(Types: {{S32, S32}, {S64, S32}})
1045 .scalarize(TypeIdx: 0)
1046 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1047 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1048 .lower();
1049
1050 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1051 .customFor(Types: {{S32, S32}, {S64, S32}})
1052 .scalarize(TypeIdx: 0)
1053 .minScalar(TypeIdx: 0, Ty: S32)
1054 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1055 .lower();
1056 }
1057
1058 auto &FPTruncActions = getActionDefinitionsBuilder(Opcode: G_FPTRUNC);
1059 if (ST.hasCvtPkF16F32Inst()) {
1060 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1061 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1062 } else {
1063 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}});
1064 }
1065 FPTruncActions.scalarize(TypeIdx: 0).lower();
1066
1067 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1068 .legalFor(Types: {{S64, S32}, {S32, S16}})
1069 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1070 .scalarize(TypeIdx: 0);
1071
1072 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1073 if (ST.has16BitInsts()) {
1074 FSubActions
1075 // Use actual fsub instruction
1076 .legalFor(Types: {S32, S16})
1077 // Must use fadd + fneg
1078 .lowerFor(Types: {S64, V2S16});
1079 } else {
1080 FSubActions
1081 // Use actual fsub instruction
1082 .legalFor(Types: {S32})
1083 // Must use fadd + fneg
1084 .lowerFor(Types: {S64, S16, V2S16});
1085 }
1086
1087 FSubActions
1088 .scalarize(TypeIdx: 0)
1089 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1090
1091 // Whether this is legal depends on the floating point mode for the function.
1092 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1093 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1094 FMad.customFor(Types: {S32, S16});
1095 else if (ST.hasMadMacF32Insts())
1096 FMad.customFor(Types: {S32});
1097 else if (ST.hasMadF16())
1098 FMad.customFor(Types: {S16});
1099 FMad.scalarize(TypeIdx: 0)
1100 .lower();
1101
1102 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1103 if (ST.has16BitInsts()) {
1104 FRem.customFor(Types: {S16, S32, S64});
1105 } else {
1106 FRem.minScalar(TypeIdx: 0, Ty: S32)
1107 .customFor(Types: {S32, S64});
1108 }
1109 FRem.scalarize(TypeIdx: 0);
1110
1111 // TODO: Do we need to clamp maximum bitwidth?
1112 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1113 .legalIf(Predicate: isScalar(TypeIdx: 0))
1114 .legalFor(Types: {{V2S16, V2S32}})
1115 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1116 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1117 // situations (like an invalid implicit use), we don't want to infinite loop
1118 // in the legalizer.
1119 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1120 .alwaysLegal();
1121
1122 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1123 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1124 {S32, S1}, {S64, S1}, {S16, S1}})
1125 .scalarize(TypeIdx: 0)
1126 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1127 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1128
1129 // TODO: Split s1->s64 during regbankselect for VALU.
1130 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1131 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1132 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1133 .customFor(Types: {{S32, S64}, {S64, S64}});
1134 if (ST.has16BitInsts())
1135 IToFP.legalFor(Types: {{S16, S16}});
1136 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1137 .minScalar(TypeIdx: 0, Ty: S32)
1138 .scalarize(TypeIdx: 0)
1139 .widenScalarToNextPow2(TypeIdx: 1);
1140
1141 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1142 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1143 .customFor(Types: {{S64, S32}, {S64, S64}})
1144 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1145 if (ST.has16BitInsts())
1146 FPToI.legalFor(Types: {{S16, S16}});
1147 else
1148 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1149
1150 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1151 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1152 .scalarize(TypeIdx: 0)
1153 .lower();
1154
1155 getActionDefinitionsBuilder(Opcodes: {G_LROUND, G_LLROUND})
1156 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1157 .scalarize(TypeIdx: 0)
1158 .lower();
1159
1160 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1161 .legalFor(Types: {S16, S32})
1162 .scalarize(TypeIdx: 0)
1163 .lower();
1164
1165 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1166 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1167 .scalarize(TypeIdx: 0)
1168 .lower();
1169
1170 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1171 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1172 .scalarize(TypeIdx: 0)
1173 .lower();
1174
1175 if (ST.has16BitInsts()) {
1176 getActionDefinitionsBuilder(
1177 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1178 .legalFor(Types: {S16, S32, S64})
1179 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1180 .scalarize(TypeIdx: 0);
1181 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1182 getActionDefinitionsBuilder(
1183 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1184 .legalFor(Types: {S32, S64})
1185 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1186 .scalarize(TypeIdx: 0);
1187 } else {
1188 getActionDefinitionsBuilder(
1189 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1190 .legalFor(Types: {S32})
1191 .customFor(Types: {S64})
1192 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1193 .scalarize(TypeIdx: 0);
1194 }
1195
1196 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1197 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1198 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1199 .scalarize(TypeIdx: 0)
1200 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1201
1202 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1203 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1204 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1205 .scalarize(TypeIdx: 0);
1206
1207 auto &CmpBuilder =
1208 getActionDefinitionsBuilder(Opcode: G_ICMP)
1209 // The compare output type differs based on the register bank of the output,
1210 // so make both s1 and s32 legal.
1211 //
1212 // Scalar compares producing output in scc will be promoted to s32, as that
1213 // is the allocatable register type that will be needed for the copy from
1214 // scc. This will be promoted during RegBankSelect, and we assume something
1215 // before that won't try to use s32 result types.
1216 //
1217 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1218 // bank.
1219 .legalForCartesianProduct(
1220 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1221 .legalForCartesianProduct(
1222 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1223 if (ST.has16BitInsts()) {
1224 CmpBuilder.legalFor(Types: {{S1, S16}});
1225 }
1226
1227 CmpBuilder
1228 .widenScalarToNextPow2(TypeIdx: 1)
1229 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1230 .scalarize(TypeIdx: 0)
1231 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1232
1233 auto &FCmpBuilder =
1234 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1235 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1236
1237 if (ST.hasSALUFloatInsts())
1238 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1239
1240 FCmpBuilder
1241 .widenScalarToNextPow2(TypeIdx: 1)
1242 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1243 .scalarize(TypeIdx: 0);
1244
1245 // FIXME: fpow has a selection pattern that should move to custom lowering.
1246 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1247 if (ST.has16BitInsts())
1248 ExpOps.customFor(Types: {{S32}, {S16}});
1249 else
1250 ExpOps.customFor(Types: {S32});
1251 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1252 .scalarize(TypeIdx: 0);
1253
1254 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1255 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1256 .lower();
1257
1258 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1259 Log2Ops.customFor(Types: {S32});
1260 if (ST.has16BitInsts())
1261 Log2Ops.legalFor(Types: {S16});
1262 else
1263 Log2Ops.customFor(Types: {S16});
1264 Log2Ops.scalarize(TypeIdx: 0)
1265 .lower();
1266
1267 auto &LogOps =
1268 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1269 LogOps.customFor(Types: {S32, S16});
1270 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1271 .scalarize(TypeIdx: 0);
1272
1273 // The 64-bit versions produce 32-bit results, but only on the SALU.
1274 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1275 .legalFor(Types: {{S32, S32}, {S32, S64}})
1276 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1277 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1278 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1279 .scalarize(TypeIdx: 0)
1280 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1281
1282 // If no 16 bit instr is available, lower into different instructions.
1283 if (ST.has16BitInsts())
1284 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1285 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1286 .widenScalarToNextPow2(TypeIdx: 1)
1287 .scalarize(TypeIdx: 0)
1288 .lower();
1289 else
1290 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1291 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1292 .lowerFor(Types: {S1, S16})
1293 .widenScalarToNextPow2(TypeIdx: 1)
1294 .scalarize(TypeIdx: 0)
1295 .lower();
1296
1297 // The hardware instructions return a different result on 0 than the generic
1298 // instructions expect. The hardware produces -1, but these produce the
1299 // bitwidth.
1300 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1301 .scalarize(TypeIdx: 0)
1302 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1303 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1304 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1305 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1306 .custom();
1307
1308 // The 64-bit versions produce 32-bit results, but only on the SALU.
1309 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1310 .legalFor(Types: {{S32, S32}, {S32, S64}})
1311 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1312 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1313 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1314 .scalarize(TypeIdx: 0)
1315 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1316 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1317
1318 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1319 .legalFor(Types: {{S32, S32}, {S32, S64}})
1320 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1321 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1322 .scalarize(TypeIdx: 0)
1323 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1324 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1325
1326 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1327 // RegBankSelect.
1328 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1329 .legalFor(Types: {S32, S64})
1330 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1331 .scalarize(TypeIdx: 0)
1332 .widenScalarToNextPow2(TypeIdx: 0);
1333
1334 if (ST.has16BitInsts()) {
1335 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1336 .legalFor(Types: {S16, S32, V2S16})
1337 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1338 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1339 // narrowScalar limitation.
1340 .widenScalarToNextPow2(TypeIdx: 0)
1341 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1342 .scalarize(TypeIdx: 0);
1343
1344 if (ST.hasVOP3PInsts()) {
1345 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1346 .legalFor(Types: {S32, S16, V2S16})
1347 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1348 .minScalar(TypeIdx: 0, Ty: S16)
1349 .widenScalarToNextPow2(TypeIdx: 0)
1350 .scalarize(TypeIdx: 0)
1351 .lower();
1352 } else {
1353 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1354 .legalFor(Types: {S32, S16})
1355 .widenScalarToNextPow2(TypeIdx: 0)
1356 .minScalar(TypeIdx: 0, Ty: S16)
1357 .scalarize(TypeIdx: 0)
1358 .lower();
1359 }
1360 } else {
1361 // TODO: Should have same legality without v_perm_b32
1362 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1363 .legalFor(Types: {S32})
1364 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1365 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1366 // narrowScalar limitation.
1367 .widenScalarToNextPow2(TypeIdx: 0)
1368 .maxScalar(TypeIdx: 0, Ty: S32)
1369 .scalarize(TypeIdx: 0)
1370 .lower();
1371
1372 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1373 .legalFor(Types: {S32})
1374 .minScalar(TypeIdx: 0, Ty: S32)
1375 .widenScalarToNextPow2(TypeIdx: 0)
1376 .scalarize(TypeIdx: 0)
1377 .lower();
1378 }
1379
1380 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1381 // List the common cases
1382 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1383 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1384 .scalarize(TypeIdx: 0)
1385 // Accept any address space as long as the size matches
1386 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1387 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1388 Mutation: [](const LegalityQuery &Query) {
1389 return std::pair(
1390 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1391 })
1392 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1393 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1394 });
1395
1396 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1397 // List the common cases
1398 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1399 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1400 .scalarize(TypeIdx: 0)
1401 // Accept any address space as long as the size matches
1402 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1403 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1404 Mutation: [](const LegalityQuery &Query) {
1405 return std::pair(
1406 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1407 })
1408 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1409 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1410 });
1411
1412 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1413 .scalarize(TypeIdx: 0)
1414 .custom();
1415
1416 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1417 bool IsLoad) -> bool {
1418 const LLT DstTy = Query.Types[0];
1419
1420 // Split vector extloads.
1421 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1422
1423 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1424 return true;
1425
1426 const LLT PtrTy = Query.Types[1];
1427 unsigned AS = PtrTy.getAddressSpace();
1428 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1429 IsAtomic: Query.MMODescrs[0].Ordering !=
1430 AtomicOrdering::NotAtomic))
1431 return true;
1432
1433 // Catch weird sized loads that don't evenly divide into the access sizes
1434 // TODO: May be able to widen depending on alignment etc.
1435 unsigned NumRegs = (MemSize + 31) / 32;
1436 if (NumRegs == 3) {
1437 if (!ST.hasDwordx3LoadStores())
1438 return true;
1439 } else {
1440 // If the alignment allows, these should have been widened.
1441 if (!isPowerOf2_32(Value: NumRegs))
1442 return true;
1443 }
1444
1445 return false;
1446 };
1447
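  // Minimum alignment, in bits, required by the explicit global/constant load
  // and store rules below; 0 (no requirement) when unaligned buffer accesses
  // are enabled.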
1448 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1449 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1450 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1451
1452 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1453 // LDS
1454 // TODO: Unsupported flat for SI.
1455
1456 for (unsigned Op : {G_LOAD, G_STORE}) {
1457 const bool IsStore = Op == G_STORE;
1458
1459 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1460 // Explicitly list some common cases.
1461 // TODO: Does this help compile time at all?
1462 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1463 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1464 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1465 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1466 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1467 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1468 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1469 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1470
1471 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1472 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1473 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1474 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1475 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1476 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1477
1478 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1479 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1480 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1481 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1482
1483 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1484 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1485 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1486 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1487 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1488 Actions.legalIf(
1489 Predicate: [=](const LegalityQuery &Query) -> bool {
1490 return isLoadStoreLegal(ST, Query);
1491 });
1492
1493 // The custom pointers (fat pointers, buffer resources) don't work with load
1494 // and store at this level. Fat pointers should have been lowered to
1495 // intrinsics before the translation to MIR.
1496 Actions.unsupportedIf(
1497 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1498
1499 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1500 // ptrtoint. This is needed to account for the fact that we can't have i128
1501 // as a register class for SelectionDAG reasons.
1502 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1503 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1504 });
1505
1506 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1507 // 64-bits.
1508 //
1509 // TODO: Should generalize bitcast action into coerce, which will also cover
1510 // inserting addrspacecasts.
1511 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1512
1513 // Turn any illegal element vectors into something easier to deal
1514 // with. These will ultimately produce 32-bit scalar shifts to extract the
1515 // parts anyway.
1516 //
1517 // For odd 16-bit element vectors, prefer to split those into pieces with
1518 // 16-bit vector parts.
1519 Actions.bitcastIf(
1520 Predicate: [=](const LegalityQuery &Query) -> bool {
1521 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1522 MemTy: Query.MMODescrs[0].MemoryTy);
1523 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1524
1525 if (!IsStore) {
1526 // Widen suitably aligned loads by loading extra bytes. The standard
1527 // legalization actions can't properly express widening memory operands.
1528 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1529 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1530 });
1531 }
1532
1533 // FIXME: load/store narrowing should be moved to lower action
1534 Actions
1535 .narrowScalarIf(
1536 Predicate: [=](const LegalityQuery &Query) -> bool {
1537 return !Query.Types[0].isVector() &&
1538 needToSplitMemOp(Query, Op == G_LOAD);
1539 },
1540 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1541 const LLT DstTy = Query.Types[0];
1542 const LLT PtrTy = Query.Types[1];
1543
1544 const unsigned DstSize = DstTy.getSizeInBits();
1545 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1546
1547 // Split extloads.
1548 if (DstSize > MemSize)
1549 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1550
1551 unsigned MaxSize = maxSizeForAddrSpace(
1552 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1553 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1554 if (MemSize > MaxSize)
1555 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1556
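            // Otherwise, narrow to a scalar no wider than the known alignment
            // (in bits).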
1557 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1558 return std::pair(0, LLT::scalar(SizeInBits: Align));
1559 })
1560 .fewerElementsIf(
1561 Predicate: [=](const LegalityQuery &Query) -> bool {
1562 return Query.Types[0].isVector() &&
1563 needToSplitMemOp(Query, Op == G_LOAD);
1564 },
1565 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1566 const LLT DstTy = Query.Types[0];
1567 const LLT PtrTy = Query.Types[1];
1568
1569 LLT EltTy = DstTy.getElementType();
1570 unsigned MaxSize = maxSizeForAddrSpace(
1571 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1572 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1573
1574           // FIXME: Handle results widened to a power of 2 better. This ends
1575           // up scalarizing.
1576 // FIXME: 3 element stores scalarized on SI
1577
1578 // Split if it's too large for the address space.
1579 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1580 if (MemSize > MaxSize) {
1581 unsigned NumElts = DstTy.getNumElements();
1582 unsigned EltSize = EltTy.getSizeInBits();
1583
1584 if (MaxSize % EltSize == 0) {
1585 return std::pair(
1586 0, LLT::scalarOrVector(
1587 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1588 }
1589
1590 unsigned NumPieces = MemSize / MaxSize;
1591
1592 // FIXME: Refine when odd breakdowns handled
1593 // The scalars will need to be re-legalized.
1594 if (NumPieces == 1 || NumPieces >= NumElts ||
1595 NumElts % NumPieces != 0)
1596 return std::pair(0, EltTy);
1597
1598 return std::pair(0,
1599 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1600 }
1601
1602 // FIXME: We could probably handle weird extending loads better.
1603 if (DstTy.getSizeInBits() > MemSize)
1604 return std::pair(0, EltTy);
1605
1606 unsigned EltSize = EltTy.getSizeInBits();
1607 unsigned DstSize = DstTy.getSizeInBits();
1608 if (!isPowerOf2_32(Value: DstSize)) {
1609 // We're probably decomposing an odd sized store. Try to split
1610 // to the widest type. TODO: Account for alignment. As-is it
1611 // should be OK, since the new parts will be further legalized.
1612 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1613 return std::pair(
1614 0, LLT::scalarOrVector(
1615 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1616 }
1617
1618 // May need relegalization for the scalars.
1619 return std::pair(0, EltTy);
1620 })
1621 .minScalar(TypeIdx: 0, Ty: S32)
1622 .narrowScalarIf(Predicate: isWideScalarExtLoadTruncStore(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: S32))
1623 .widenScalarToNextPow2(TypeIdx: 0)
1624 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1625 .lower();
1626 }
1627
1628 // FIXME: Unaligned accesses not lowered.
1629 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1630 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1631 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1632 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1633 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1634 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1635 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1636 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1637 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1638 .legalIf(
1639 Predicate: [=](const LegalityQuery &Query) -> bool {
1640 return isLoadStoreLegal(ST, Query);
1641 });
1642
1643 if (ST.hasFlatAddressSpace()) {
1644 ExtLoads.legalForTypesWithMemDesc(
1645 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1646 }
1647
1648 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1649 // 64-bits.
1650 //
1651 // TODO: Should generalize bitcast action into coerce, which will also cover
1652 // inserting addrspacecasts.
1653 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1654
1655 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1656 .widenScalarToNextPow2(TypeIdx: 0)
1657 .lower();
1658
1659 auto &Atomics = getActionDefinitionsBuilder(
1660 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1661 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1662 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1663 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1664 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1665 {S64, GlobalPtr}, {S64, LocalPtr},
1666 {S32, RegionPtr}, {S64, RegionPtr}});
1667 if (ST.hasFlatAddressSpace()) {
1668 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1669 }
1670
1671 // TODO: v2bf16 operations, and fat buffer pointer support.
1672 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1673 if (ST.hasLDSFPAtomicAddF32()) {
1674 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1675 if (ST.hasLdsAtomicAddF64())
1676 Atomic.legalFor(Types: {{S64, LocalPtr}});
1677 if (ST.hasAtomicDsPkAdd16Insts())
1678 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1679 }
1680 if (ST.hasAtomicFaddInsts())
1681 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1682 if (ST.hasFlatAtomicFaddF32Inst())
1683 Atomic.legalFor(Types: {{S32, FlatPtr}});
1684
1685 if (ST.hasGFX90AInsts()) {
1686 // These are legal with some caveats, and should have undergone expansion in
1687     // the IR in most situations.
1688 // TODO: Move atomic expansion into legalizer
1689 Atomic.legalFor(Types: {
1690 {S32, GlobalPtr},
1691 {S64, GlobalPtr},
1692 {S64, FlatPtr}
1693 });
1694 }
1695
1696 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1697 ST.hasAtomicBufferGlobalPkAddF16Insts())
1698 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1699 if (ST.hasAtomicGlobalPkAddBF16Inst())
1700 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1701 if (ST.hasAtomicFlatPkAdd16Insts())
1702 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1703
1705 // Most of the legalization work here is done by AtomicExpand. We could
1706 // probably use a simpler legality rule that just assumes anything is OK.
1707 auto &AtomicFMinFMax =
1708 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1709 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1710
1711 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1712 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1713 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1714 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1715 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1716 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1717 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1718 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1719
1720 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1721 // demarshalling
1722 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1723 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1724 {S32, FlatPtr}, {S64, FlatPtr}})
1725 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1726 {S32, RegionPtr}, {S64, RegionPtr}});
1727 // TODO: Pointer types, any 32-bit or 64-bit vector
1728
1729 // Condition should be s32 for scalar, s1 for vector.
1730 getActionDefinitionsBuilder(Opcode: G_SELECT)
1731 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1732 LocalPtr, FlatPtr, PrivatePtr,
1733 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1734 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1735 Types1: {S1, S32})
1736 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1737 .scalarize(TypeIdx: 1)
1738 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1739 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1740 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1741 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1742 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1743 .scalarize(TypeIdx: 0)
1744 .widenScalarToNextPow2(TypeIdx: 0)
1745 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1746
1747 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1748 // be more flexible with the shift amount type.
1749 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1750 .legalFor(Types: {{S32, S32}, {S64, S32}});
1751 if (ST.has16BitInsts()) {
1752 if (ST.hasVOP3PInsts()) {
1753 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1754 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1755 } else
1756 Shifts.legalFor(Types: {{S16, S16}});
1757
1758 // TODO: Support 16-bit shift amounts for all types
1759 Shifts.widenScalarIf(
1760 Predicate: [=](const LegalityQuery &Query) {
1761 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1762 // 32-bit amount.
1763 const LLT ValTy = Query.Types[0];
1764 const LLT AmountTy = Query.Types[1];
1765 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1766 AmountTy.getSizeInBits() < 16;
1767 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1768 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1769 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1770 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1771 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1772
1773 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1774 .minScalar(TypeIdx: 0, Ty: S16)
1775 .scalarize(TypeIdx: 0)
1776 .lower();
1777 } else {
1778 // Make sure we legalize the shift amount type first, as the general
1779 // expansion for the shifted type will produce much worse code if it hasn't
1780 // been truncated already.
1781 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1782 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1783 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1784
1785 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1786 .minScalar(TypeIdx: 0, Ty: S32)
1787 .scalarize(TypeIdx: 0)
1788 .lower();
1789 }
1790 Shifts.scalarize(TypeIdx: 0);
1791
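  // Dynamic vector element insert/extract: keep 32-bit and 64-bit elements in
  // register-sized vectors, bitcast other element sizes to a 32-bit or 64-bit
  // element form, and fall back to stack lowering as a last resort.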
1792 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1793 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1794 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1795 unsigned IdxTypeIdx = 2;
1796
1797 getActionDefinitionsBuilder(Opcode: Op)
1798 .customIf(Predicate: [=](const LegalityQuery &Query) {
1799 const LLT EltTy = Query.Types[EltTypeIdx];
1800 const LLT VecTy = Query.Types[VecTypeIdx];
1801 const LLT IdxTy = Query.Types[IdxTypeIdx];
1802 const unsigned EltSize = EltTy.getSizeInBits();
1803 const bool isLegalVecType =
1804 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1805 // Address space 8 pointers are 128-bit wide values, but the logic
1806 // below will try to bitcast them to 2N x s64, which will fail.
1807           // Therefore, as an intermediate step, wrap the extract/insert by
1808           // ptrtoint-ing the vector and scalar arguments (and inttoptr-ing the
1809           // extraction result) in order to produce a vector operation that can
1810           // be handled by the logic below.
1811 if (EltTy.isPointer() && EltSize > 64)
1812 return true;
1813 return (EltSize == 32 || EltSize == 64) &&
1814 VecTy.getSizeInBits() % 32 == 0 &&
1815 VecTy.getSizeInBits() <= MaxRegisterSize &&
1816 IdxTy.getSizeInBits() == 32 &&
1817 isLegalVecType;
1818 })
1819 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1820 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1821 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1822 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1823 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1824 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1825 Mutation: [=](const LegalityQuery &Query) {
1826 // For > 64-bit element types, try to turn this into a
1827 // 64-bit element vector since we may be able to do better
1828 // indexing if this is scalar. If not, fall back to 32.
1829 const LLT EltTy = Query.Types[EltTypeIdx];
1830 const LLT VecTy = Query.Types[VecTypeIdx];
1831 const unsigned DstEltSize = EltTy.getSizeInBits();
1832 const unsigned VecSize = VecTy.getSizeInBits();
1833
1834 const unsigned TargetEltSize =
1835 DstEltSize % 64 == 0 ? 64 : 32;
1836 return std::pair(VecTypeIdx,
1837 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
1838 ScalarSizeInBits: TargetEltSize));
1839 })
1840 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1841 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1842 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1843 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1844 // TODO: Clamp elements for 64-bit vectors?
1845 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
1846 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1847 // It should only be necessary with variable indexes.
1848 // As a last resort, lower to the stack
1849 .lower();
1850 }
1851
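  // An extract must produce exactly the vector's element type; implicitly
  // extending or truncating extracts are not supported.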
1852 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1853 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1854 const LLT &EltTy = Query.Types[1].getElementType();
1855 return Query.Types[0] != EltTy;
1856 });
1857
1858 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1859 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1860 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1861
1862 // FIXME: Doesn't handle extract of illegal sizes.
1863 getActionDefinitionsBuilder(Opcode: Op)
1864 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1865 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1866           // Sub-vector (or single-element) insert and extract.
1867 // TODO: verify immediate offset here since lower only works with
1868 // whole elements.
1869 const LLT BigTy = Query.Types[BigTyIdx];
1870 return BigTy.isVector();
1871 })
1872 // FIXME: Multiples of 16 should not be legal.
1873 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1874 const LLT BigTy = Query.Types[BigTyIdx];
1875 const LLT LitTy = Query.Types[LitTyIdx];
1876 return (BigTy.getSizeInBits() % 32 == 0) &&
1877 (LitTy.getSizeInBits() % 16 == 0);
1878 })
1879 .widenScalarIf(
1880 Predicate: [=](const LegalityQuery &Query) {
1881 const LLT BigTy = Query.Types[BigTyIdx];
1882 return (BigTy.getScalarSizeInBits() < 16);
1883 },
1884 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1885 .widenScalarIf(
1886 Predicate: [=](const LegalityQuery &Query) {
1887 const LLT LitTy = Query.Types[LitTyIdx];
1888 return (LitTy.getScalarSizeInBits() < 16);
1889 },
1890 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1891 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1892 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1893
1894 }
1895
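  // Build vectors are legal for any register-sized type. Without scalar pack
  // instructions, v2s16 G_BUILD_VECTOR (and G_BUILD_VECTOR_TRUNC) is handled
  // by custom lowering.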
1896 auto &BuildVector =
1897 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1898 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1899 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1900 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1901 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1902 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1903 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
1904 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1905
1906 if (ST.hasScalarPackInsts()) {
1907 BuildVector
1908 // FIXME: Should probably widen s1 vectors straight to s32
1909 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1910 .minScalar(TypeIdx: 1, Ty: S16);
1911
1912 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1913 .legalFor(Types: {V2S16, S32})
1914 .lower();
1915 } else {
1916 BuildVector.customFor(Types: {V2S16, S16});
1917 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1918
1919 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1920 .customFor(Types: {V2S16, S32})
1921 .lower();
1922 }
1923
1924 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
1925
1926 // FIXME: Clamp maximum size
1927 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1928 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
1929 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
1930 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
1931 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
1932
1933 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
1934
1935 // Merge/Unmerge
1936 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1937 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1938 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1939
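    // An element type is only handled directly if it is a power-of-2 size
    // between 8 and 512 bits; anything else gets scalarized below.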
1940 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1941 const LLT Ty = Query.Types[TypeIdx];
1942 if (Ty.isVector()) {
1943 const LLT &EltTy = Ty.getElementType();
1944 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1945 return true;
1946 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
1947 return true;
1948 }
1949 return false;
1950 };
1951
1952 auto &Builder =
1953 getActionDefinitionsBuilder(Opcode: Op)
1954 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
1955 .lowerFor(Types: {{S16, V2S16}})
1956 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1957 const LLT BigTy = Query.Types[BigTyIdx];
1958 return BigTy.getSizeInBits() == 32;
1959 })
1960 // Try to widen to s16 first for small types.
1961 // TODO: Only do this on targets with legal s16 shifts
1962 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
1963 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
1964 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
1965 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1966 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
1967 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
1968 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
1969               // Clamp the little scalar to s32-s512 and make it a power of 2. It's
1970               // not worth considering the multiples of 64 since 2*192 and 2*384
1971               // are not valid.
1972 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
1973 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
1974 // Break up vectors with weird elements into scalars
1975 .fewerElementsIf(
1976 Predicate: [=](const LegalityQuery &Query) {
1977 return notValidElt(Query, LitTyIdx);
1978 },
1979 Mutation: scalarize(TypeIdx: 0))
1980 .fewerElementsIf(
1981 Predicate: [=](const LegalityQuery &Query) {
1982 return notValidElt(Query, BigTyIdx);
1983 },
1984 Mutation: scalarize(TypeIdx: 1))
1985 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
1986
1987 if (Op == G_MERGE_VALUES) {
1988 Builder.widenScalarIf(
1989 // TODO: Use 16-bit shifts if legal for 8-bit values?
1990 Predicate: [=](const LegalityQuery &Query) {
1991 const LLT Ty = Query.Types[LitTyIdx];
1992 return Ty.getSizeInBits() < 32;
1993 },
1994 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
1995 }
1996
1997 Builder.widenScalarIf(
1998 Predicate: [=](const LegalityQuery &Query) {
1999 const LLT Ty = Query.Types[BigTyIdx];
2000 return Ty.getSizeInBits() % 16 != 0;
2001 },
2002 Mutation: [=](const LegalityQuery &Query) {
2003           // Pick the next power of 2 or, for sizes above 128, a multiple of 64,
2004           // whichever is smaller.
2005 const LLT &Ty = Query.Types[BigTyIdx];
2006 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2007 if (NewSizeInBits >= 256) {
2008 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2009 if (RoundedTo < NewSizeInBits)
2010 NewSizeInBits = RoundedTo;
2011 }
2012 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2013 })
2014 // Any vectors left are the wrong size. Scalarize them.
2015 .scalarize(TypeIdx: 0)
2016 .scalarize(TypeIdx: 1);
2017 }
2018
2019 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2020 // RegBankSelect.
2021 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2022 .legalFor(Types: {{S32}, {S64}})
2023 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2024
2025 if (ST.hasVOP3PInsts()) {
2026 SextInReg.lowerFor(Types: {{V2S16}})
2027 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2028 // get more vector shift opportunities, since we'll get those when
2029 // expanded.
2030 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2031 } else if (ST.has16BitInsts()) {
2032 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2033 } else {
2034 // Prefer to promote to s32 before lowering if we don't have 16-bit
2035     // shifts. This avoids a lot of intermediate truncate and extend operations.
2036 SextInReg.lowerFor(Types: {{S32}, {S64}});
2037 }
2038
2039 SextInReg
2040 .scalarize(TypeIdx: 0)
2041 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2042 .lower();
2043
2044 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2045 .scalarize(TypeIdx: 0)
2046 .lower();
2047
2048   // TODO: Only try to form v2s16 with legal packed instructions.
2049 getActionDefinitionsBuilder(Opcode: G_FSHR)
2050 .legalFor(Types: {{S32, S32}})
2051 .lowerFor(Types: {{V2S16, V2S16}})
2052 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2053 .scalarize(TypeIdx: 0)
2054 .lower();
2055
2056 if (ST.hasVOP3PInsts()) {
2057 getActionDefinitionsBuilder(Opcode: G_FSHL)
2058 .lowerFor(Types: {{V2S16, V2S16}})
2059 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2060 .scalarize(TypeIdx: 0)
2061 .lower();
2062 } else {
2063 getActionDefinitionsBuilder(Opcode: G_FSHL)
2064 .scalarize(TypeIdx: 0)
2065 .lower();
2066 }
2067
2068 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2069 .legalFor(Types: {S64});
2070
2071 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2072
2073 getActionDefinitionsBuilder(Opcode: G_FENCE)
2074 .alwaysLegal();
2075
2076 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2077 .scalarize(TypeIdx: 0)
2078 .minScalar(TypeIdx: 0, Ty: S32)
2079 .lower();
2080
2081 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2082 .legalFor(Types: {{S32, S32}, {S64, S32}})
2083 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2084 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2085 .widenScalarToNextPow2(TypeIdx: 0)
2086 .scalarize(TypeIdx: 0);
2087
2088 getActionDefinitionsBuilder(
2089 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2090 G_FCOPYSIGN,
2091
2092 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2093 G_READ_REGISTER, G_WRITE_REGISTER,
2094
2095 G_SADDO, G_SSUBO})
2096 .lower();
2097
2098 if (ST.hasIEEEMinMax()) {
2099 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2100 .legalFor(Types: FPTypesPK16)
2101 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2102 .scalarize(TypeIdx: 0);
2103 } else {
2104 // TODO: Implement
2105 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM}).lower();
2106 }
2107
2108 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2109 .lower();
2110
2111 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2112
2113 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2114 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2115 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2116 .unsupported();
2117
2118 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2119
2120 getActionDefinitionsBuilder(
2121 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2122 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2123 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2124 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2125 .legalFor(Types: AllVectors)
2126 .scalarize(TypeIdx: 1)
2127 .lower();
2128
2129 getLegacyLegalizerInfo().computeTables();
2130 verify(MII: *ST.getInstrInfo());
2131}
2132
2133bool AMDGPULegalizerInfo::legalizeCustom(
2134 LegalizerHelper &Helper, MachineInstr &MI,
2135 LostDebugLocObserver &LocObserver) const {
2136 MachineIRBuilder &B = Helper.MIRBuilder;
2137 MachineRegisterInfo &MRI = *B.getMRI();
2138
2139 switch (MI.getOpcode()) {
2140 case TargetOpcode::G_ADDRSPACE_CAST:
2141 return legalizeAddrSpaceCast(MI, MRI, B);
2142 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2143 return legalizeFroundeven(MI, MRI, B);
2144 case TargetOpcode::G_FCEIL:
2145 return legalizeFceil(MI, MRI, B);
2146 case TargetOpcode::G_FREM:
2147 return legalizeFrem(MI, MRI, B);
2148 case TargetOpcode::G_INTRINSIC_TRUNC:
2149 return legalizeIntrinsicTrunc(MI, MRI, B);
2150 case TargetOpcode::G_SITOFP:
2151 return legalizeITOFP(MI, MRI, B, Signed: true);
2152 case TargetOpcode::G_UITOFP:
2153 return legalizeITOFP(MI, MRI, B, Signed: false);
2154 case TargetOpcode::G_FPTOSI:
2155 return legalizeFPTOI(MI, MRI, B, Signed: true);
2156 case TargetOpcode::G_FPTOUI:
2157 return legalizeFPTOI(MI, MRI, B, Signed: false);
2158 case TargetOpcode::G_FMINNUM:
2159 case TargetOpcode::G_FMAXNUM:
2160 case TargetOpcode::G_FMINIMUMNUM:
2161 case TargetOpcode::G_FMAXIMUMNUM:
2162 case TargetOpcode::G_FMINNUM_IEEE:
2163 case TargetOpcode::G_FMAXNUM_IEEE:
2164 return legalizeMinNumMaxNum(Helper, MI);
2165 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2166 return legalizeExtractVectorElt(MI, MRI, B);
2167 case TargetOpcode::G_INSERT_VECTOR_ELT:
2168 return legalizeInsertVectorElt(MI, MRI, B);
2169 case TargetOpcode::G_FSIN:
2170 case TargetOpcode::G_FCOS:
2171 return legalizeSinCos(MI, MRI, B);
2172 case TargetOpcode::G_GLOBAL_VALUE:
2173 return legalizeGlobalValue(MI, MRI, B);
2174 case TargetOpcode::G_LOAD:
2175 case TargetOpcode::G_SEXTLOAD:
2176 case TargetOpcode::G_ZEXTLOAD:
2177 return legalizeLoad(Helper, MI);
2178 case TargetOpcode::G_STORE:
2179 return legalizeStore(Helper, MI);
2180 case TargetOpcode::G_FMAD:
2181 return legalizeFMad(MI, MRI, B);
2182 case TargetOpcode::G_FDIV:
2183 return legalizeFDIV(MI, MRI, B);
2184 case TargetOpcode::G_FFREXP:
2185 return legalizeFFREXP(MI, MRI, B);
2186 case TargetOpcode::G_FSQRT:
2187 return legalizeFSQRT(MI, MRI, B);
2188 case TargetOpcode::G_UDIV:
2189 case TargetOpcode::G_UREM:
2190 case TargetOpcode::G_UDIVREM:
2191 return legalizeUnsignedDIV_REM(MI, MRI, B);
2192 case TargetOpcode::G_SDIV:
2193 case TargetOpcode::G_SREM:
2194 case TargetOpcode::G_SDIVREM:
2195 return legalizeSignedDIV_REM(MI, MRI, B);
2196 case TargetOpcode::G_ATOMIC_CMPXCHG:
2197 return legalizeAtomicCmpXChg(MI, MRI, B);
2198 case TargetOpcode::G_FLOG2:
2199 return legalizeFlog2(MI, B);
2200 case TargetOpcode::G_FLOG:
2201 case TargetOpcode::G_FLOG10:
2202 return legalizeFlogCommon(MI, B);
2203 case TargetOpcode::G_FEXP2:
2204 return legalizeFExp2(MI, B);
2205 case TargetOpcode::G_FEXP:
2206 case TargetOpcode::G_FEXP10:
2207 return legalizeFExp(MI, B);
2208 case TargetOpcode::G_FPOW:
2209 return legalizeFPow(MI, B);
2210 case TargetOpcode::G_FFLOOR:
2211 return legalizeFFloor(MI, MRI, B);
2212 case TargetOpcode::G_BUILD_VECTOR:
2213 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2214 return legalizeBuildVector(MI, MRI, B);
2215 case TargetOpcode::G_MUL:
2216 return legalizeMul(Helper, MI);
2217 case TargetOpcode::G_CTLZ:
2218 case TargetOpcode::G_CTTZ:
2219 return legalizeCTLZ_CTTZ(MI, MRI, B);
2220 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2221 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2222 case TargetOpcode::G_STACKSAVE:
2223 return legalizeStackSave(MI, B);
2224 case TargetOpcode::G_GET_FPENV:
2225 return legalizeGetFPEnv(MI, MRI, B);
2226 case TargetOpcode::G_SET_FPENV:
2227 return legalizeSetFPEnv(MI, MRI, B);
2228 case TargetOpcode::G_TRAP:
2229 return legalizeTrap(MI, MRI, B);
2230 case TargetOpcode::G_DEBUGTRAP:
2231 return legalizeDebugTrap(MI, MRI, B);
2232 default:
2233 return false;
2234 }
2235
2236 llvm_unreachable("expected switch to return");
2237}
2238
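// Return a 32-bit value holding the high half of the flat aperture for the
// given segment (LOCAL or PRIVATE address space). Depending on the subtarget
// and code object version, this comes from the aperture registers, from the
// implicit kernel arguments, or from the amd_queue_t reached through the queue
// pointer. Returns an invalid register on failure.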
2239Register AMDGPULegalizerInfo::getSegmentAperture(
2240 unsigned AS,
2241 MachineRegisterInfo &MRI,
2242 MachineIRBuilder &B) const {
2243 MachineFunction &MF = B.getMF();
2244 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2245 const LLT S32 = LLT::scalar(SizeInBits: 32);
2246 const LLT S64 = LLT::scalar(SizeInBits: 64);
2247
2248 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2249
2250 if (ST.hasApertureRegs()) {
2251 // Note: this register is somewhat broken. When used as a 32-bit operand,
2252 // it only returns zeroes. The real value is in the upper 32 bits.
2253     // Thus, we must emit an extract of the high 32 bits.
2254 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2255 ? AMDGPU::SRC_SHARED_BASE
2256 : AMDGPU::SRC_PRIVATE_BASE;
2257 // FIXME: It would be more natural to emit a COPY here, but then copy
2258 // coalescing would kick in and it would think it's okay to use the "HI"
2259 // subregister (instead of extracting the HI 32 bits) which is an artificial
2260 // (unusable) register.
2261 // Register TableGen definitions would need an overhaul to get rid of the
2262 // artificial "HI" aperture registers and prevent this kind of issue from
2263 // happening.
2264 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2265 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2266 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {Dst}, SrcOps: {Register(ApertureRegNo)});
2267 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2268 }
2269
2270 // TODO: can we be smarter about machine pointer info?
2271 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2272 Register LoadAddr = MRI.createGenericVirtualRegister(
2273 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2274 // For code object version 5, private_base and shared_base are passed through
2275 // implicit kernargs.
2276 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2277 AMDGPU::AMDHSA_COV5) {
2278 AMDGPUTargetLowering::ImplicitParameter Param =
2279 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2280 : AMDGPUTargetLowering::PRIVATE_BASE;
2281 uint64_t Offset =
2282 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2283
2284 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2285 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2286
2287 if (!loadInputValue(DstReg: KernargPtrReg, B,
2288 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2289 return Register();
2290
2291 MachineMemOperand *MMO = MF.getMachineMemOperand(
2292 PtrInfo,
2293 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2294 MachineMemOperand::MOInvariant,
2295 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2296
2297 // Pointer address
2298 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
2299 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2300 // Load address
2301 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2302 }
2303
2304 Register QueuePtr = MRI.createGenericVirtualRegister(
2305 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2306
2307 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2308 return Register();
2309
2310 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2311 // private_segment_aperture_base_hi.
2312 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2313
2314 MachineMemOperand *MMO = MF.getMachineMemOperand(
2315 PtrInfo,
2316 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2317 MachineMemOperand::MOInvariant,
2318 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2319
2320 B.buildPtrAdd(Res: LoadAddr, Op0: QueuePtr,
2321 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2322 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2323}
2324
2325/// Return true if the value is a known valid address, such that a null check is
2326/// not necessary.
2327static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2328 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2329 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2330 switch (Def->getOpcode()) {
2331 case AMDGPU::G_FRAME_INDEX:
2332 case AMDGPU::G_GLOBAL_VALUE:
2333 case AMDGPU::G_BLOCK_ADDR:
2334 return true;
2335 case AMDGPU::G_CONSTANT: {
2336 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2337 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2338 }
2339 default:
2340 return false;
2341 }
2342
2343 return false;
2344}
2345
2346bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2347 MachineInstr &MI, MachineRegisterInfo &MRI,
2348 MachineIRBuilder &B) const {
2349 MachineFunction &MF = B.getMF();
2350
2351 // MI can either be a G_ADDRSPACE_CAST or a
2352 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2353 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2354 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2355 Intrinsic::amdgcn_addrspacecast_nonnull));
2356
2357 const LLT S32 = LLT::scalar(SizeInBits: 32);
2358 Register Dst = MI.getOperand(i: 0).getReg();
2359 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2360 : MI.getOperand(i: 1).getReg();
2361 LLT DstTy = MRI.getType(Reg: Dst);
2362 LLT SrcTy = MRI.getType(Reg: Src);
2363 unsigned DestAS = DstTy.getAddressSpace();
2364 unsigned SrcAS = SrcTy.getAddressSpace();
2365
2366 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2367 // vector element.
2368 assert(!DstTy.isVector());
2369
2370 const AMDGPUTargetMachine &TM
2371 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2372
2373 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2374 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2375 return true;
2376 }
2377
2378 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2379 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2380 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2381 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2382 // G_ADDRSPACE_CAST we need to guess.
2383 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2384 // Extract low 32-bits of the pointer.
2385 B.buildExtract(Res: Dst, Src, Index: 0);
2386 MI.eraseFromParent();
2387 return true;
2388 }
2389
2390 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2391
2392 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2393 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2394
2395 // Extract low 32-bits of the pointer.
2396 auto PtrLo32 = B.buildExtract(Res: DstTy, Src, Index: 0);
2397
2398 auto CmpRes =
2399 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2400 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2401
2402 MI.eraseFromParent();
2403 return true;
2404 }
2405
2406 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2407 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2408 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2409 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2410 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2411 if (!ApertureReg.isValid())
2412         return Register();
2413
2414 // Coerce the type of the low half of the result so we can use
2415 // merge_values.
2416 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2417
2418 // TODO: Should we allow mismatched types but matching sizes in merges to
2419 // avoid the ptrtoint?
2420 return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
2421 };
2422
2423 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2424 // G_ADDRSPACE_CAST we need to guess.
2425 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2426 castLocalOrPrivateToFlat(Dst);
2427 MI.eraseFromParent();
2428 return true;
2429 }
2430
2431 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2432
2433 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2434 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2435
2436 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2437 Op1: SegmentNull.getReg(Idx: 0));
2438
2439 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2440
2441 MI.eraseFromParent();
2442 return true;
2443 }
2444
2445 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2446 SrcTy.getSizeInBits() == 64) {
2447 // Truncate.
2448 B.buildExtract(Res: Dst, Src, Index: 0);
2449 MI.eraseFromParent();
2450 return true;
2451 }
2452
2453 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2454 DstTy.getSizeInBits() == 64) {
2455 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2456 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2457 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2458 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2459 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2460 MI.eraseFromParent();
2461 return true;
2462 }
2463
2464 // Invalid casts are poison.
2465 // TODO: Should return poison
2466 B.buildUndef(Res: Dst);
2467 MI.eraseFromParent();
2468 return true;
2469}
2470
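// Lower G_INTRINSIC_ROUNDEVEN for f64 using the add/subtract magic-constant
// trick: adding and then subtracting 2^52 (with the sign of the input) rounds
// to the nearest integer, assuming the default round-to-nearest-even mode.
// Inputs with magnitude of at least 2^52 are already integers and are passed
// through unchanged.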
2471bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2472 MachineRegisterInfo &MRI,
2473 MachineIRBuilder &B) const {
2474 Register Src = MI.getOperand(i: 1).getReg();
2475 LLT Ty = MRI.getType(Reg: Src);
2476 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2477
2478 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2479 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2480
2481 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2482 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2483
2484 // TODO: Should this propagate fast-math-flags?
2485 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2486 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2487
2488 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2489 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2490
2491 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2492 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2493 MI.eraseFromParent();
2494 return true;
2495}
2496
2497bool AMDGPULegalizerInfo::legalizeFceil(
2498 MachineInstr &MI, MachineRegisterInfo &MRI,
2499 MachineIRBuilder &B) const {
2500
2501 const LLT S1 = LLT::scalar(SizeInBits: 1);
2502 const LLT S64 = LLT::scalar(SizeInBits: 64);
2503
2504 Register Src = MI.getOperand(i: 1).getReg();
2505 assert(MRI.getType(Src) == S64);
2506
2507 // result = trunc(src)
2508 // if (src > 0.0 && src != result)
2509 // result += 1.0
2510
2511 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2512
2513 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2514 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2515 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2516 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2517 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2518 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2519
2520 // TODO: Should this propagate fast-math-flags?
2521 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2522 MI.eraseFromParent();
2523 return true;
2524}
2525
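// Expand G_FREM as fma(-trunc(x / y), y, x), i.e. x - trunc(x / y) * y.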
2526bool AMDGPULegalizerInfo::legalizeFrem(
2527 MachineInstr &MI, MachineRegisterInfo &MRI,
2528 MachineIRBuilder &B) const {
2529 Register DstReg = MI.getOperand(i: 0).getReg();
2530 Register Src0Reg = MI.getOperand(i: 1).getReg();
2531 Register Src1Reg = MI.getOperand(i: 2).getReg();
2532 auto Flags = MI.getFlags();
2533 LLT Ty = MRI.getType(Reg: DstReg);
2534
2535 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2536 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2537 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2538 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2539 MI.eraseFromParent();
2540 return true;
2541}
2542
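// Extract the unbiased exponent from the high 32 bits of an f64 value: pull
// out the 11-bit exponent field with ubfe and subtract the bias (1023).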
2543static MachineInstrBuilder extractF64Exponent(Register Hi,
2544 MachineIRBuilder &B) {
2545 const unsigned FractBits = 52;
2546 const unsigned ExpBits = 11;
2547 LLT S32 = LLT::scalar(SizeInBits: 32);
2548
2549 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2550 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2551
2552 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2553 .addUse(RegNo: Hi)
2554 .addUse(RegNo: Const0.getReg(Idx: 0))
2555 .addUse(RegNo: Const1.getReg(Idx: 0));
2556
2557 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2558}
2559
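// Truncate an f64 toward zero by masking off the fraction bits that lie below
// the binary point. Exponents below zero produce a signed zero; exponents
// above 51 mean the value is already an integer and is returned unchanged.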
2560bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2561 MachineInstr &MI, MachineRegisterInfo &MRI,
2562 MachineIRBuilder &B) const {
2563 const LLT S1 = LLT::scalar(SizeInBits: 1);
2564 const LLT S32 = LLT::scalar(SizeInBits: 32);
2565 const LLT S64 = LLT::scalar(SizeInBits: 64);
2566
2567 Register Src = MI.getOperand(i: 1).getReg();
2568 assert(MRI.getType(Src) == S64);
2569
2570 // TODO: Should this use extract since the low half is unused?
2571 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2572 Register Hi = Unmerge.getReg(Idx: 1);
2573
2574 // Extract the upper half, since this is where we will find the sign and
2575 // exponent.
2576 auto Exp = extractF64Exponent(Hi, B);
2577
2578 const unsigned FractBits = 52;
2579
2580 // Extract the sign bit.
2581 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2582 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2583
2584 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2585
2586 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2587
2588 // Extend back to 64-bits.
2589 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2590
2591 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2592 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2593 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2594 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2595
2596 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2597 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2598
2599 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2600 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2601 MI.eraseFromParent();
2602 return true;
2603}
2604
2605bool AMDGPULegalizerInfo::legalizeITOFP(
2606 MachineInstr &MI, MachineRegisterInfo &MRI,
2607 MachineIRBuilder &B, bool Signed) const {
2608
2609 Register Dst = MI.getOperand(i: 0).getReg();
2610 Register Src = MI.getOperand(i: 1).getReg();
2611
2612 const LLT S64 = LLT::scalar(SizeInBits: 64);
2613 const LLT S32 = LLT::scalar(SizeInBits: 32);
2614
2615 assert(MRI.getType(Src) == S64);
2616
2617 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2618 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2619
2620 if (MRI.getType(Reg: Dst) == S64) {
2621 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2622 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2623
2624 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2625 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2626
2627 // TODO: Should this propagate fast-math-flags?
2628 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2629 MI.eraseFromParent();
2630 return true;
2631 }
2632
2633 assert(MRI.getType(Dst) == S32);
2634
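  // Converting a 64-bit integer to f32: normalize by shifting out leading
  // zero (or redundant sign) bits, OR any remaining low bits into a sticky
  // bit so rounding stays correct, convert the high 32 bits of the shifted
  // value, and scale the result back with ldexp by (32 - shift).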
2635 auto One = B.buildConstant(Res: S32, Val: 1);
2636
2637 MachineInstrBuilder ShAmt;
2638 if (Signed) {
2639 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2640 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2641 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2642 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2643 auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
2644 .addUse(RegNo: Unmerge.getReg(Idx: 1));
2645 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2646 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2647 } else
2648 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2649 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2650 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2651 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2652 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2653 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2654 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2655 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2656 MI.eraseFromParent();
2657 return true;
2658}
2659
2660// TODO: Copied from DAG implementation. Verify logic and document how this
2661// actually works.
2662bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2663 MachineRegisterInfo &MRI,
2664 MachineIRBuilder &B,
2665 bool Signed) const {
2666
2667 Register Dst = MI.getOperand(i: 0).getReg();
2668 Register Src = MI.getOperand(i: 1).getReg();
2669
2670 const LLT S64 = LLT::scalar(SizeInBits: 64);
2671 const LLT S32 = LLT::scalar(SizeInBits: 32);
2672
2673 const LLT SrcLT = MRI.getType(Reg: Src);
2674 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2675
2676 unsigned Flags = MI.getFlags();
2677
2678 // The basic idea of converting a floating point number into a pair of 32-bit
2679 // integers is illustrated as follows:
2680 //
2681 // tf := trunc(val);
2682 // hif := floor(tf * 2^-32);
2683 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2684 // hi := fptoi(hif);
2685 // lo := fptoi(lof);
2686 //
2687 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2688 MachineInstrBuilder Sign;
2689 if (Signed && SrcLT == S32) {
2690     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2691     // is not enough to hold all the significant bits of `lof` if val is
2692     // negative. To avoid the loss of precision, we need to take the absolute
2693     // value after truncating and flip the result back based on the original
2694     // signedness.
2695 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2696 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2697 }
2698 MachineInstrBuilder K0, K1;
2699 if (SrcLT == S64) {
2700 K0 = B.buildFConstant(
2701 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2702 K1 = B.buildFConstant(
2703 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2704 } else {
2705 K0 = B.buildFConstant(
2706 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2707 K1 = B.buildFConstant(
2708 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2709 }
2710
2711 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2712 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2713 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2714
2715 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2716 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2717 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2718
2719 if (Signed && SrcLT == S32) {
2720 // Flip the result based on the signedness, which is either all 0s or 1s.
2721 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2722 // r := xor({lo, hi}, sign) - sign;
2723 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2724 Src1: Sign);
2725 } else
2726 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2727 MI.eraseFromParent();
2728
2729 return true;
2730}
2731
2732bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2733 MachineInstr &MI) const {
2734 MachineFunction &MF = Helper.MIRBuilder.getMF();
2735 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2736
2737 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2738 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2739
2740 // With ieee_mode disabled, the instructions have the correct behavior
2741 // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
2742 //
2743 // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
2744 // enabled.
2745 if (!MFI->getMode().IEEE) {
2746 if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
2747 MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
2748 return true;
2749
2750 return !IsIEEEOp;
2751 }
2752
2753 if (IsIEEEOp)
2754 return true;
2755
2756 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2757}
2758
2759bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2760 MachineInstr &MI, MachineRegisterInfo &MRI,
2761 MachineIRBuilder &B) const {
2762 // TODO: Should move some of this into LegalizerHelper.
2763
2764 // TODO: Promote dynamic indexing of s16 to s32
2765
2766 Register Dst = MI.getOperand(i: 0).getReg();
2767 Register Vec = MI.getOperand(i: 1).getReg();
2768
2769 LLT VecTy = MRI.getType(Reg: Vec);
2770 LLT EltTy = VecTy.getElementType();
2771 assert(EltTy == MRI.getType(Dst));
2772
2773 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2774   // but we can't go directly to that logic because you can't bitcast a vector
2775 // of pointers to a vector of integers. Therefore, introduce an intermediate
2776 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2777 // drive the legalization forward.
2778 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2779 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2780 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2781
2782 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2783 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2784 B.buildIntToPtr(Dst, Src: IntElt);
2785
2786 MI.eraseFromParent();
2787 return true;
2788 }
2789
2790 // FIXME: Artifact combiner probably should have replaced the truncated
2791 // constant before this, so we shouldn't need
2792 // getIConstantVRegValWithLookThrough.
2793 std::optional<ValueAndVReg> MaybeIdxVal =
2794 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2795 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2796 return true;
2797 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2798
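 // For a constant index that is in range, unmerge the vector and copy out the
 // selected element; an out-of-range constant index folds to undef.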
2799 if (IdxVal < VecTy.getNumElements()) {
2800 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2801 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2802 } else {
2803 B.buildUndef(Res: Dst);
2804 }
2805
2806 MI.eraseFromParent();
2807 return true;
2808}
2809
2810bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2811 MachineInstr &MI, MachineRegisterInfo &MRI,
2812 MachineIRBuilder &B) const {
2813 // TODO: Should move some of this into LegalizerHelper.
2814
2815 // TODO: Promote dynamic indexing of s16 to s32
2816
2817 Register Dst = MI.getOperand(i: 0).getReg();
2818 Register Vec = MI.getOperand(i: 1).getReg();
2819 Register Ins = MI.getOperand(i: 2).getReg();
2820
2821 LLT VecTy = MRI.getType(Reg: Vec);
2822 LLT EltTy = VecTy.getElementType();
2823 assert(EltTy == MRI.getType(Ins));
2824
2825 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2826 // but we can't go directly to that logic because you can't bitcast a vector
2827 // of pointers to a vector of integers. Therefore, make the pointer vector
2828 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2829 // new value, and then inttoptr the result vector back. This will then allow
2830 // the rest of legalization to take over.
2831 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2832 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2833 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2834
2835 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2836 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2837 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2838 Idx: MI.getOperand(i: 3));
2839 B.buildIntToPtr(Dst, Src: IntVecDest);
2840 MI.eraseFromParent();
2841 return true;
2842 }
2843
2844 // FIXME: Artifact combiner probably should have replaced the truncated
2845 // constant before this, so we shouldn't need
2846 // getIConstantVRegValWithLookThrough.
2847 std::optional<ValueAndVReg> MaybeIdxVal =
2848 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2849 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2850 return true;
2851
2852 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2853
2854 unsigned NumElts = VecTy.getNumElements();
2855 if (IdxVal < NumElts) {
2856 SmallVector<Register, 8> SrcRegs;
2857 for (unsigned i = 0; i < NumElts; ++i)
2858 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2859 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2860
2861 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2862 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2863 } else {
2864 B.buildUndef(Res: Dst);
2865 }
2866
2867 MI.eraseFromParent();
2868 return true;
2869}
2870
2871bool AMDGPULegalizerInfo::legalizeSinCos(
2872 MachineInstr &MI, MachineRegisterInfo &MRI,
2873 MachineIRBuilder &B) const {
2874
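 // The amdgcn.sin/cos intrinsics take an input that has already been divided
 // by 2*pi, so pre-scale the source by 1/(2*pi). On subtargets where the
 // hardware instruction only accepts a reduced input range, additionally take
 // the fractional part with amdgcn.fract first.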
2875 Register DstReg = MI.getOperand(i: 0).getReg();
2876 Register SrcReg = MI.getOperand(i: 1).getReg();
2877 LLT Ty = MRI.getType(Reg: DstReg);
2878 unsigned Flags = MI.getFlags();
2879
2880 Register TrigVal;
2881 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
2882 if (ST.hasTrigReducedRange()) {
2883 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
2884 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
2885 .addUse(RegNo: MulVal.getReg(Idx: 0))
2886 .setMIFlags(Flags)
2887 .getReg(Idx: 0);
2888 } else
2889 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
2890
2891 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2892 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2893 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
2894 .addUse(RegNo: TrigVal)
2895 .setMIFlags(Flags);
2896 MI.eraseFromParent();
2897 return true;
2898}
2899
2900bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2901 MachineIRBuilder &B,
2902 const GlobalValue *GV,
2903 int64_t Offset,
2904 unsigned GAFlags) const {
2905 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2906 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2907 // to the following code sequence:
2908 //
2909 // For constant address space:
2910 // s_getpc_b64 s[0:1]
2911 // s_add_u32 s0, s0, $symbol
2912 // s_addc_u32 s1, s1, 0
2913 //
2914 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2915 // a fixup or relocation is emitted to replace $symbol with a literal
2916 // constant, which is a pc-relative offset from the encoding of the $symbol
2917 // operand to the global variable.
2918 //
2919 // For global address space:
2920 // s_getpc_b64 s[0:1]
2921 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2922 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2923 //
2924 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2925 // fixups or relocations are emitted to replace $symbol@*@lo and
2926 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2927 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2928 // operand to the global variable.
2929
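 // Note that even for a 32-bit destination pointer, the pc-relative address is
 // computed in a 64-bit register and the low half is extracted at the end.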
2930 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2931
2932 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2933 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
2934
2935 MachineInstrBuilder MIB = B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET)
2936 .addDef(RegNo: PCReg);
2937
2938 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
2939 if (GAFlags == SIInstrInfo::MO_NONE)
2940 MIB.addImm(Val: 0);
2941 else
2942 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
2943
2944 if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
2945 B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);
2946
2947 if (PtrTy.getSizeInBits() == 32)
2948 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
2949 return true;
2950}
2951
2952// Emit an ABS32_LO / ABS32_HI relocation stub.
2953void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2954 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2955 MachineRegisterInfo &MRI) const {
2956 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2957
2958 LLT S32 = LLT::scalar(SizeInBits: 32);
2959
2960 // Use the destination directly if and only if we only write the lower
2961 // address half and no register class has been set on it.
2962 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
2963 ? DstReg
2964 : MRI.createGenericVirtualRegister(Ty: S32);
2965
2966 if (!MRI.getRegClassOrNull(Reg: AddrLo))
2967 MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);
2968
2969 // Write the lower half.
2970 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2971 .addDef(RegNo: AddrLo)
2972 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
2973
2974 // If required, write the upper half as well.
2975 if (RequiresHighHalf) {
2976 assert(PtrTy.getSizeInBits() == 64 &&
2977 "Must provide a 64-bit pointer type!");
2978
2979 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
2980 MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);
2981
2982 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2983 .addDef(RegNo: AddrHi)
2984 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);
2985
2986 // Use the destination directly if and only if no register class has been
2987 // set on it.
2988 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
2989 ? DstReg
2990 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2991
2992 if (!MRI.getRegClassOrNull(Reg: AddrDst))
2993 MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);
2994
2995 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
2996
2997 // If we created a new register for the destination, cast the result into
2998 // the final output.
2999 if (AddrDst != DstReg)
3000 B.buildCast(Dst: DstReg, Src: AddrDst);
3001 } else if (AddrLo != DstReg) {
3002 // If we created a new register for the destination, cast the result into
3003 // the final output.
3004 B.buildCast(Dst: DstReg, Src: AddrLo);
3005 }
3006}
3007
3008bool AMDGPULegalizerInfo::legalizeGlobalValue(
3009 MachineInstr &MI, MachineRegisterInfo &MRI,
3010 MachineIRBuilder &B) const {
3011 Register DstReg = MI.getOperand(i: 0).getReg();
3012 LLT Ty = MRI.getType(Reg: DstReg);
3013 unsigned AS = Ty.getAddressSpace();
3014
3015 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
3016 MachineFunction &MF = B.getMF();
3017 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3018
3019 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
3020 if (!MFI->isModuleEntryFunction() &&
3021 GV->getName() != "llvm.amdgcn.module.lds" &&
3022 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
3023 const Function &Fn = MF.getFunction();
3024 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
3025 Fn, "local memory global used by non-kernel function",
3026 MI.getDebugLoc(), DS_Warning));
3027
3028 // We currently don't have a way to correctly allocate LDS objects that
3029 // aren't directly associated with a kernel. We do force inlining of
3030 // functions that use local objects. However, if these dead functions are
3031 // not eliminated, we don't want a compile time error. Just emit a warning
3032 // and a trap, since there should be no callable path here.
3033 B.buildTrap();
3034 B.buildUndef(Res: DstReg);
3035 MI.eraseFromParent();
3036 return true;
3037 }
3038
3039 // TODO: We could emit code to handle the initialization somewhere.
3040 // We ignore the initializer for now and legalize it to allow selection.
3041 // The initializer will be rejected during assembly emission anyway.
3042 const SITargetLowering *TLI = ST.getTargetLowering();
3043 if (!TLI->shouldUseLDSConstAddress(GV)) {
3044 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3045 return true; // Leave in place;
3046 }
3047
3048 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3049 Type *Ty = GV->getValueType();
3050 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
3051 // zero-sized type in other languages) to declare dynamic shared memory
3052 // whose size is not known at compile time. Such variables are allocated
3053 // by the runtime and placed directly after the statically allocated
3054 // ones, so they all share the same offset.
3055 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3056 // Adjust alignment for that dynamic shared memory array.
3057 MFI->setDynLDSAlign(F: MF.getFunction(), GV: *cast<GlobalVariable>(Val: GV));
3058 LLT S32 = LLT::scalar(SizeInBits: 32);
3059 auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
3060 B.buildIntToPtr(Dst: DstReg, Src: Sz);
3061 MI.eraseFromParent();
3062 return true;
3063 }
3064 }
3065
3066 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(),
3067 GV: *cast<GlobalVariable>(Val: GV)));
3068 MI.eraseFromParent();
3069 return true;
3070 }
3071
3072 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3073 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
3074 MI.eraseFromParent();
3075 return true;
3076 }
3077
3078 const SITargetLowering *TLI = ST.getTargetLowering();
3079
3080 if (TLI->shouldEmitFixup(GV)) {
3081 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
3082 MI.eraseFromParent();
3083 return true;
3084 }
3085
3086 if (TLI->shouldEmitPCReloc(GV)) {
3087 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
3088 MI.eraseFromParent();
3089 return true;
3090 }
3091
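 // Otherwise the address has to be loaded from the GOT: build a pc-relative
 // pointer to the GOT entry and load the 64-bit address from it as an
 // invariant, dereferenceable load.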
3092 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3093 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
3094
3095 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3096 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3097 PtrInfo: MachinePointerInfo::getGOT(MF),
3098 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3099 MachineMemOperand::MOInvariant,
3100 MemTy: LoadTy, base_alignment: Align(8));
3101
3102 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3103
3104 if (Ty.getSizeInBits() == 32) {
3105 // Truncate if this is a 32-bit constant address.
3106 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3107 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3108 } else
3109 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3110
3111 MI.eraseFromParent();
3112 return true;
3113}
3114
3115static LLT widenToNextPowerOf2(LLT Ty) {
3116 if (Ty.isVector())
3117 return Ty.changeElementCount(
3118 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3119 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3120}
3121
3122bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3123 MachineInstr &MI) const {
3124 MachineIRBuilder &B = Helper.MIRBuilder;
3125 MachineRegisterInfo &MRI = *B.getMRI();
3126 GISelChangeObserver &Observer = Helper.Observer;
3127
3128 Register PtrReg = MI.getOperand(i: 1).getReg();
3129 LLT PtrTy = MRI.getType(Reg: PtrReg);
3130 unsigned AddrSpace = PtrTy.getAddressSpace();
3131
3132 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3133 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3134 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3135 Observer.changingInstr(MI);
3136 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3137 Observer.changedInstr(MI);
3138 return true;
3139 }
3140
3141 if (MI.getOpcode() != AMDGPU::G_LOAD)
3142 return false;
3143
3144 Register ValReg = MI.getOperand(i: 0).getReg();
3145 LLT ValTy = MRI.getType(Reg: ValReg);
3146
3147 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3148 Observer.changingInstr(MI);
3149 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3150 Observer.changedInstr(MI);
3151 return true;
3152 }
3153
3154 MachineMemOperand *MMO = *MI.memoperands_begin();
3155 const unsigned ValSize = ValTy.getSizeInBits();
3156 const LLT MemTy = MMO->getMemoryType();
3157 const Align MemAlign = MMO->getAlign();
3158 const unsigned MemSize = MemTy.getSizeInBits();
3159 const uint64_t AlignInBits = 8 * MemAlign.value();
3160
3161 // Widen non-power-of-2 loads to the alignment if needed
3162 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3163 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3164
3165 // This was already the correct extending load result type, so just adjust
3166 // the memory type.
3167 if (WideMemSize == ValSize) {
3168 MachineFunction &MF = B.getMF();
3169
3170 MachineMemOperand *WideMMO =
3171 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3172 Observer.changingInstr(MI);
3173 MI.setMemRefs(MF, MemRefs: {WideMMO});
3174 Observer.changedInstr(MI);
3175 return true;
3176 }
3177
3178 // Don't bother handling an edge case that should probably never be produced.
3179 if (ValSize > WideMemSize)
3180 return false;
3181
3182 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3183
3184 Register WideLoad;
3185 if (!WideTy.isVector()) {
3186 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3187 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3188 } else {
3189 // Extract the subvector.
3190
3191 if (isRegisterType(ST, Ty: ValTy)) {
3192 // If this a case where G_EXTRACT is legal, use it.
3193 // (e.g. <3 x s32> -> <4 x s32>)
3194 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3195 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3196 } else {
3197 // For cases where the widened type isn't a nice register value, unmerge
3198 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3199 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3200 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3201 }
3202 }
3203
3204 MI.eraseFromParent();
3205 return true;
3206 }
3207
3208 return false;
3209}
3210
3211bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3212 MachineInstr &MI) const {
3213 MachineIRBuilder &B = Helper.MIRBuilder;
3214 MachineRegisterInfo &MRI = *B.getMRI();
3215 GISelChangeObserver &Observer = Helper.Observer;
3216
3217 Register DataReg = MI.getOperand(i: 0).getReg();
3218 LLT DataTy = MRI.getType(Reg: DataReg);
3219
3220 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3221 Observer.changingInstr(MI);
3222 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3223 Observer.changedInstr(MI);
3224 return true;
3225 }
3226 return false;
3227}
3228
3229bool AMDGPULegalizerInfo::legalizeFMad(
3230 MachineInstr &MI, MachineRegisterInfo &MRI,
3231 MachineIRBuilder &B) const {
3232 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3233 assert(Ty.isScalar());
3234
3235 MachineFunction &MF = B.getMF();
3236 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3237
3238 // TODO: Always legal with future ftz flag.
3239 // FIXME: Do we need just output?
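 // G_FMAD is only kept legal when denormals for the result type are flushed
 // (preserve-sign); otherwise fall back to the generic lowering, which
 // expands it to fmul + fadd.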
3240 if (Ty == LLT::float32() &&
3241 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3242 return true;
3243 if (Ty == LLT::float16() &&
3244 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3245 return true;
3246
3247 MachineIRBuilder HelperBuilder(MI);
3248 GISelObserverWrapper DummyObserver;
3249 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3250 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3251}
3252
3253bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3254 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3255 Register DstReg = MI.getOperand(i: 0).getReg();
3256 Register PtrReg = MI.getOperand(i: 1).getReg();
3257 Register CmpVal = MI.getOperand(i: 2).getReg();
3258 Register NewVal = MI.getOperand(i: 3).getReg();
3259
3260 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3261 "this should not have been custom lowered");
3262
3263 LLT ValTy = MRI.getType(Reg: CmpVal);
3264 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3265
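 // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
 // into a single vector data operand, in {new, cmp} order.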
3266 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3267
3268 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3269 .addDef(RegNo: DstReg)
3270 .addUse(RegNo: PtrReg)
3271 .addUse(RegNo: PackedVal)
3272 .setMemRefs(MI.memoperands());
3273
3274 MI.eraseFromParent();
3275 return true;
3276}
3277
3278/// Return true if it's known that \p Src can never be an f32 denormal value.
3279static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3280 Register Src) {
3281 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3282 switch (DefMI->getOpcode()) {
3283 case TargetOpcode::G_INTRINSIC: {
3284 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3285 case Intrinsic::amdgcn_frexp_mant:
3286 return true;
3287 default:
3288 break;
3289 }
3290
3291 break;
3292 }
3293 case TargetOpcode::G_FFREXP: {
3294 if (DefMI->getOperand(i: 0).getReg() == Src)
3295 return true;
3296 break;
3297 }
3298 case TargetOpcode::G_FPEXT: {
3299 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3300 }
3301 default:
3302 return false;
3303 }
3304
3305 return false;
3306}
3307
3308static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3309 if (Flags & MachineInstr::FmAfn)
3310 return true;
3311 const auto &Options = MF.getTarget().Options;
3312 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3313}
3314
3315static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3316 unsigned Flags) {
3317 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3318 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3319 DenormalMode::PreserveSign;
3320}
3321
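/// If the f32 input may be a denormal, multiply it by 2^32 so that the
/// amdgcn.log expansion sees a normal value. Returns the scaled input and an
/// i1 that is true when scaling was applied; returns empty registers when no
/// denormal handling is needed.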
3322std::pair<Register, Register>
3323AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3324 unsigned Flags) const {
3325 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3326 return {};
3327
3328 const LLT F32 = LLT::scalar(SizeInBits: 32);
3329 auto SmallestNormal = B.buildFConstant(
3330 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3331 auto IsLtSmallestNormal =
3332 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3333
3334 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3335 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3336 auto ScaleFactor =
3337 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3338 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3339
3340 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3341}
3342
3343bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3344 MachineIRBuilder &B) const {
3345 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3346 // If we have to handle denormals, scale up the input and adjust the result.
3347
3348 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3349 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
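 // Since log2(x * 2^32) == log2(x) + 32, subtracting 32 when the input was
 // scaled recovers log2(x).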
3350
3351 Register Dst = MI.getOperand(i: 0).getReg();
3352 Register Src = MI.getOperand(i: 1).getReg();
3353 LLT Ty = B.getMRI()->getType(Reg: Dst);
3354 unsigned Flags = MI.getFlags();
3355
3356 if (Ty == LLT::scalar(SizeInBits: 16)) {
3357 const LLT F32 = LLT::scalar(SizeInBits: 32);
3358 // Nothing in half is a denormal when promoted to f32.
3359 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3360 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
3361 .addUse(RegNo: Ext.getReg(Idx: 0))
3362 .setMIFlags(Flags);
3363 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3364 MI.eraseFromParent();
3365 return true;
3366 }
3367
3368 assert(Ty == LLT::scalar(32));
3369
3370 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3371 if (!ScaledInput) {
3372 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
3373 .addUse(RegNo: Src)
3374 .setMIFlags(Flags);
3375 MI.eraseFromParent();
3376 return true;
3377 }
3378
3379 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3380 .addUse(RegNo: ScaledInput)
3381 .setMIFlags(Flags);
3382
3383 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3384 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3385 auto ResultOffset =
3386 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3387 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3388
3389 MI.eraseFromParent();
3390 return true;
3391}
3392
3393static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3394 Register Z, unsigned Flags) {
3395 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3396 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3397}
3398
3399bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3400 MachineIRBuilder &B) const {
3401 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3402 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3403
3404 MachineRegisterInfo &MRI = *B.getMRI();
3405 Register Dst = MI.getOperand(i: 0).getReg();
3406 Register X = MI.getOperand(i: 1).getReg();
3407 unsigned Flags = MI.getFlags();
3408 const LLT Ty = MRI.getType(Reg: X);
3409 MachineFunction &MF = B.getMF();
3410
3411 const LLT F32 = LLT::scalar(SizeInBits: 32);
3412 const LLT F16 = LLT::scalar(SizeInBits: 16);
3413
3414 const AMDGPUTargetMachine &TM =
3415 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3416
3417 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn) ||
3418 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3419 if (Ty == F16 && !ST.has16BitInsts()) {
3420 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3421 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3422 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3423 B.buildFPTrunc(Res: Dst, Op: LogVal);
3424 } else {
3425 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3426 }
3427
3428 MI.eraseFromParent();
3429 return true;
3430 }
3431
3432 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3433 if (ScaledInput)
3434 X = ScaledInput;
3435
3436 auto Y =
3437 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);
3438
3439 Register R;
3440 if (ST.hasFastFMAF32()) {
3441 // c+cc are ln(2)/ln(10) to more than 49 bits
3442 const float c_log10 = 0x1.344134p-2f;
3443 const float cc_log10 = 0x1.09f79ep-26f;
3444
3445 // c + cc is ln(2) to more than 49 bits
3446 const float c_log = 0x1.62e42ep-1f;
3447 const float cc_log = 0x1.efa39ep-25f;
3448
3449 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3450 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3451
3452 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags).getReg(Idx: 0);
3453 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags);
3454 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags);
3455 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags);
3456 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags).getReg(Idx: 0);
3457 } else {
3458 // ch+ct is ln(2)/ln(10) to more than 36 bits
3459 const float ch_log10 = 0x1.344000p-2f;
3460 const float ct_log10 = 0x1.3509f6p-18f;
3461
3462 // ch + ct is ln(2) to more than 36 bits
3463 const float ch_log = 0x1.62e000p-1f;
3464 const float ct_log = 0x1.0bfbe8p-15f;
3465
3466 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3467 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3468
3469 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3470 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3471 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3472 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags);
3473
3474 Register Mad0 =
3475 getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags);
3476 Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags);
3477 R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags);
3478 }
3479
3480 const bool IsFiniteOnly =
3481 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3482 (MI.getFlag(Flag: MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3483
3484 if (!IsFiniteOnly) {
3485 // Expand isfinite(x) => fabs(x) < inf
3486 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3487 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3488 auto IsFinite =
3489 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3490 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
3491 }
3492
3493 if (ScaledInput) {
3494 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3495 auto ShiftK =
3496 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3497 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3498 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3499 } else {
3500 B.buildCopy(Res: Dst, Op: R);
3501 }
3502
3503 MI.eraseFromParent();
3504 return true;
3505}
3506
3507bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register Src, bool IsLog10,
3509 unsigned Flags) const {
3510 const double Log2BaseInverted =
3511 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3512
3513 LLT Ty = B.getMRI()->getType(Reg: Dst);
3514
3515 if (Ty == LLT::scalar(SizeInBits: 32)) {
3516 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3517 if (ScaledInput) {
3518 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3519 .addUse(RegNo: Src)
3520 .setMIFlags(Flags);
3521 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3522 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3523 auto ResultOffset =
3524 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3525 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3526
3527 if (ST.hasFastFMAF32())
3528 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3529 else {
3530 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3531 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3532 }
3533
3534 return true;
3535 }
3536 }
3537
3538 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3539 ? B.buildFLog2(Dst: Ty, Src, Flags)
3540 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3541 .addUse(RegNo: Src)
3542 .setMIFlags(Flags);
3543 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3544 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3545 return true;
3546}
3547
3548bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3549 MachineIRBuilder &B) const {
3550 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3551 // If we have to handle denormals, scale up the input and adjust the result.
3552
3553 Register Dst = MI.getOperand(i: 0).getReg();
3554 Register Src = MI.getOperand(i: 1).getReg();
3555 unsigned Flags = MI.getFlags();
3556 LLT Ty = B.getMRI()->getType(Reg: Dst);
3557 const LLT F16 = LLT::scalar(SizeInBits: 16);
3558 const LLT F32 = LLT::scalar(SizeInBits: 32);
3559
3560 if (Ty == F16) {
3561 // Nothing in half is a denormal when promoted to f32.
3562 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3563 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
3564 .addUse(RegNo: Ext.getReg(Idx: 0))
3565 .setMIFlags(Flags);
3566 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3567 MI.eraseFromParent();
3568 return true;
3569 }
3570
3571 assert(Ty == F32);
3572
3573 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3574 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3575 .addUse(RegNo: Src)
3576 .setMIFlags(Flags);
3577 MI.eraseFromParent();
3578 return true;
3579 }
3580
3581 // bool needs_scaling = x < -0x1.f80000p+6f;
3582 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
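 // This works because exp2(x + 64) == exp2(x) * 2^64, so the trailing multiply
 // by 2^-64 undoes the bias that keeps the intermediate result in range.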
3583
3584 // -nextafter(128.0, -1)
3585 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3586 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3587 Op1: RangeCheckConst, Flags);
3588
3589 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3590 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3591 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3592 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3593
3594 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3595 .addUse(RegNo: AddInput.getReg(Idx: 0))
3596 .setMIFlags(Flags);
3597
3598 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3599 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3600 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3601 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3602 MI.eraseFromParent();
3603 return true;
3604}
3605
3606bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3607 Register X, unsigned Flags) const {
3608 LLT Ty = B.getMRI()->getType(Reg: Dst);
3609 LLT F32 = LLT::scalar(SizeInBits: 32);
3610
3611 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3612 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3613 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Log2E, Flags);
3614
3615 if (Ty == F32) {
3616 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3617 .addUse(RegNo: Mul.getReg(Idx: 0))
3618 .setMIFlags(Flags);
3619 } else {
3620 B.buildFExp2(Dst, Src: Mul.getReg(Idx: 0), Flags);
3621 }
3622
3623 return true;
3624 }
3625
3626 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3627 auto NeedsScaling =
3628 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3629 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3630 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3631 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3632
3633 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3634 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3635
3636 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3637 .addUse(RegNo: ExpInput.getReg(Idx: 0))
3638 .setMIFlags(Flags);
3639
3640 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3641 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3642 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3643 return true;
3644}
3645
3646bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3647 MachineIRBuilder &B) const {
3648 Register Dst = MI.getOperand(i: 0).getReg();
3649 Register X = MI.getOperand(i: 1).getReg();
3650 const unsigned Flags = MI.getFlags();
3651 MachineFunction &MF = B.getMF();
3652 MachineRegisterInfo &MRI = *B.getMRI();
3653 LLT Ty = MRI.getType(Reg: Dst);
3654 const LLT F16 = LLT::scalar(SizeInBits: 16);
3655 const LLT F32 = LLT::scalar(SizeInBits: 32);
3656 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3657
3658 if (Ty == F16) {
3659 // v_exp_f16 (fmul x, log2e)
3660 if (allowApproxFunc(MF, Flags)) {
3661 // TODO: Does this really require fast?
3662 legalizeFExpUnsafe(B, Dst, X, Flags);
3663 MI.eraseFromParent();
3664 return true;
3665 }
3666
3667 // exp(f16 x) ->
3668 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3669
3670 // Nothing in half is a denormal when promoted to f32.
3671 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3672 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3673 legalizeFExpUnsafe(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags);
3674 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3675 MI.eraseFromParent();
3676 return true;
3677 }
3678
3679 assert(Ty == F32);
3680
3681 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3682 // library behavior. Also, is known-not-daz source sufficient?
3683 if (allowApproxFunc(MF, Flags)) {
3684 legalizeFExpUnsafe(B, Dst, X, Flags);
3685 MI.eraseFromParent();
3686 return true;
3687 }
3688
3689 // Algorithm:
3690 //
3691 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3692 //
3693 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3694 // n = 64*m + j, 0 <= j < 64
3695 //
3696 // e^x = 2^((64*m + j + f)/64)
3697 // = (2^m) * (2^(j/64)) * 2^(f/64)
3698 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3699 //
3700 // f = x*(64/ln(2)) - n
3701 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3702 //
3703 // e^x = (2^m) * (2^(j/64)) * e^r
3704 //
3705 // (2^(j/64)) is precomputed
3706 //
3707 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3708 // e^r = 1 + q
3709 //
3710 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3711 //
3712 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
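 // In the expansion below, PH + PL is a double-single approximation of
 // x * log2(e) (or x * log2(10) for G_FEXP10), E = roundeven(PH) is its
 // integer part, exp2 is evaluated on the remaining fraction, and the final
 // ldexp applies the 2^E scale.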
3713 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3714 Register PH, PL;
3715
3716 if (ST.hasFastFMAF32()) {
3717 const float c_exp = numbers::log2ef;
3718 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3719 const float c_exp10 = 0x1.a934f0p+1f;
3720 const float cc_exp10 = 0x1.2f346ep-24f;
3721
3722 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3723 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3724 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3725 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3726
3727 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3728 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3729 } else {
3730 const float ch_exp = 0x1.714000p+0f;
3731 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3732
3733 const float ch_exp10 = 0x1.a92000p+1f;
3734 const float cl_exp10 = 0x1.4f0978p-11f;
3735
3736 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3737 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3738 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3739
3740 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3741 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3742
3743 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3744 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3745
3746 Register Mad0 =
3747 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3748 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3749 }
3750
3751 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3752
3753 // It is unsafe to contract this fsub into the PH multiply.
3754 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3755 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3756 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3757
3758 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3759 .addUse(RegNo: A.getReg(Idx: 0))
3760 .setMIFlags(Flags);
3761 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3762
3763 auto UnderflowCheckConst =
3764 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3765 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3766 auto Underflow =
3767 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3768
3769 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3770
3771 const auto &Options = MF.getTarget().Options;
3772
3773 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3774 auto OverflowCheckConst =
3775 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3776
3777 auto Overflow =
3778 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3779 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3780 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3781 }
3782
3783 B.buildCopy(Res: Dst, Op: R);
3784 MI.eraseFromParent();
3785 return true;
3786}
3787
3788bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3789 MachineIRBuilder &B) const {
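 // Expand pow(x, y) as exp2(y * log2(x)). The multiply uses
 // amdgcn.fmul.legacy, which treats 0 * anything (including inf and nan) as 0.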
3790 Register Dst = MI.getOperand(i: 0).getReg();
3791 Register Src0 = MI.getOperand(i: 1).getReg();
3792 Register Src1 = MI.getOperand(i: 2).getReg();
3793 unsigned Flags = MI.getFlags();
3794 LLT Ty = B.getMRI()->getType(Reg: Dst);
3795 const LLT F16 = LLT::float16();
3796 const LLT F32 = LLT::float32();
3797
3798 if (Ty == F32) {
3799 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
3800 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3801 .addUse(RegNo: Log.getReg(Idx: 0))
3802 .addUse(RegNo: Src1)
3803 .setMIFlags(Flags);
3804 B.buildFExp2(Dst, Src: Mul, Flags);
3805 } else if (Ty == F16) {
3806 // There's no f16 fmul_legacy, so we need to convert for it.
3807 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
3808 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
3809 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
3810 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3811 .addUse(RegNo: Ext0.getReg(Idx: 0))
3812 .addUse(RegNo: Ext1.getReg(Idx: 0))
3813 .setMIFlags(Flags);
3814 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
3815 } else
3816 return false;
3817
3818 MI.eraseFromParent();
3819 return true;
3820}
3821
3822// Find a source register, ignoring any possible source modifiers.
3823static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3824 Register ModSrc = OrigSrc;
3825 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
3826 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
3827 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3828 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3829 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3830 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3831 return ModSrc;
3832}
3833
3834bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3835 MachineRegisterInfo &MRI,
3836 MachineIRBuilder &B) const {
3837
3838 const LLT S1 = LLT::scalar(SizeInBits: 1);
3839 const LLT F64 = LLT::float64();
3840 Register Dst = MI.getOperand(i: 0).getReg();
3841 Register OrigSrc = MI.getOperand(i: 1).getReg();
3842 unsigned Flags = MI.getFlags();
3843 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3844 "this should not have been custom lowered");
3845
3846 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3847 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3848 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3849 // V_FRACT bug is:
3850 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3851 //
3852 // Convert floor(x) to (x - fract(x))
3853
3854 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
3855 .addUse(RegNo: OrigSrc)
3856 .setMIFlags(Flags);
3857
3858 // Give source modifier matching some assistance before obscuring a foldable
3859 // pattern.
3860
3861 // TODO: We can avoid the neg on the fract? The input sign to fract
3862 // shouldn't matter?
3863 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3864
3865 auto Const =
3866 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
3867
3868 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
3869
3870 // We don't need to concern ourselves with the snan handling difference, so
3871 // use the one which will directly select.
3872 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3873 if (MFI->getMode().IEEE)
3874 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
3875 else
3876 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
3877
3878 Register CorrectedFract = Min;
3879 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
3880 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
3881 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
3882 }
3883
3884 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
3885 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
3886
3887 MI.eraseFromParent();
3888 return true;
3889}
3890
3891// Turn an illegal packed v2s16 build vector into bit operations.
3892// TODO: This should probably be a bitcast action in LegalizerHelper.
3893bool AMDGPULegalizerInfo::legalizeBuildVector(
3894 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3895 Register Dst = MI.getOperand(i: 0).getReg();
3896 const LLT S32 = LLT::scalar(SizeInBits: 32);
3897 const LLT S16 = LLT::scalar(SizeInBits: 16);
3898 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3899
3900 Register Src0 = MI.getOperand(i: 1).getReg();
3901 Register Src1 = MI.getOperand(i: 2).getReg();
3902
3903 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3904 assert(MRI.getType(Src0) == S32);
3905 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
3906 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
3907 }
3908
3909 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
3910 B.buildBitcast(Dst, Src: Merge);
3911
3912 MI.eraseFromParent();
3913 return true;
3914}
3915
3916// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3917//
3918// Source and accumulation registers must all be 32-bits.
3919//
3920// TODO: When the multiply is uniform, we should produce a code sequence
3921// that is better suited to instruction selection on the SALU. Instead of
3922// the outer loop going over parts of the result, the outer loop should go
3923// over parts of one of the factors. This should result in instruction
3924// selection that makes full use of S_ADDC_U32 instructions.
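// For example, a 64 x 64 -> 64 bit multiply decomposes into 32-bit parts as
//   Accum[0] = lo32(Src0[0] * Src1[0])
//   Accum[1] = hi32(Src0[0] * Src1[0]) + lo32(Src0[0] * Src1[1])
//                                      + lo32(Src0[1] * Src1[0])
// where the partial products are computed with MAD_64_32 (or plain 32-bit
// multiplies when the high half is discarded) and the carries propagate into
// the higher parts.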
3925void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3926 MutableArrayRef<Register> Accum,
3927 ArrayRef<Register> Src0,
3928 ArrayRef<Register> Src1,
3929 bool UsePartialMad64_32,
3930 bool SeparateOddAlignedProducts) const {
3931 // Use (possibly empty) vectors of S1 registers to represent the set of
3932 // carries from one pair of positions to the next.
3933 using Carry = SmallVector<Register, 2>;
3934
3935 MachineIRBuilder &B = Helper.MIRBuilder;
3936 GISelValueTracking &VT = *Helper.getValueTracking();
3937
3938 const LLT S1 = LLT::scalar(SizeInBits: 1);
3939 const LLT S32 = LLT::scalar(SizeInBits: 32);
3940 const LLT S64 = LLT::scalar(SizeInBits: 64);
3941
3942 Register Zero32;
3943 Register Zero64;
3944
3945 auto getZero32 = [&]() -> Register {
3946 if (!Zero32)
3947 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
3948 return Zero32;
3949 };
3950 auto getZero64 = [&]() -> Register {
3951 if (!Zero64)
3952 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
3953 return Zero64;
3954 };
3955
3956 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3957 for (unsigned i = 0; i < Src0.size(); ++i) {
3958 Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
3959 Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
3960 }
3961
3962 // Merge the given carries into the 32-bit LocalAccum, which is modified
3963 // in-place.
3964 //
3965 // Returns the carry-out, which is a single S1 register or null.
3966 auto mergeCarry =
3967 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3968 if (CarryIn.empty())
3969 return Register();
3970
3971 bool HaveCarryOut = true;
3972 Register CarryAccum;
3973 if (CarryIn.size() == 1) {
3974 if (!LocalAccum) {
3975 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3976 return Register();
3977 }
3978
3979 CarryAccum = getZero32();
3980 } else {
3981 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3982 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3983 CarryAccum =
3984 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
3985 .getReg(Idx: 0);
3986 }
3987
3988 if (!LocalAccum) {
3989 LocalAccum = getZero32();
3990 HaveCarryOut = false;
3991 }
3992 }
3993
3994 auto Add =
3995 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
3996 LocalAccum = Add.getReg(Idx: 0);
3997 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
3998 };
3999
4000 // Build a multiply-add chain to compute
4001 //
4002 // LocalAccum + (partial products at DstIndex)
4003 // + (opportunistic subset of CarryIn)
4004 //
4005 // LocalAccum is an array of one or two 32-bit registers that are updated
4006 // in-place. The incoming registers may be null.
4007 //
4008 // In some edge cases, carry-ins can be consumed "for free". In that case,
4009 // the consumed carry bits are removed from CarryIn in-place.
4010 auto buildMadChain =
4011 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4012 -> Carry {
4013 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4014 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4015
4016 Carry CarryOut;
4017 unsigned j0 = 0;
4018
4019 // Use plain 32-bit multiplication for the most significant part of the
4020 // result by default.
4021 if (LocalAccum.size() == 1 &&
4022 (!UsePartialMad64_32 || !CarryIn.empty())) {
4023 do {
4024 // Skip multiplication if one of the operands is 0
4025 unsigned j1 = DstIndex - j0;
4026 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4027 ++j0;
4028 continue;
4029 }
4030 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
4031 if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
4032 LocalAccum[0] = Mul.getReg(Idx: 0);
4033 } else {
4034 if (CarryIn.empty()) {
4035 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
4036 } else {
4037 LocalAccum[0] =
4038 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
4039 .getReg(Idx: 0);
4040 CarryIn.pop_back();
4041 }
4042 }
4043 ++j0;
4044 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4045 }
4046
4047 // Build full 64-bit multiplies.
4048 if (j0 <= DstIndex) {
4049 bool HaveSmallAccum = false;
4050 Register Tmp;
4051
4052 if (LocalAccum[0]) {
4053 if (LocalAccum.size() == 1) {
4054 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4055 HaveSmallAccum = true;
4056 } else if (LocalAccum[1]) {
4057 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
4058 HaveSmallAccum = false;
4059 } else {
4060 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4061 HaveSmallAccum = true;
4062 }
4063 } else {
4064 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4065 Tmp = getZero64();
4066 HaveSmallAccum = true;
4067 }
4068
4069 do {
4070 unsigned j1 = DstIndex - j0;
4071 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4072 ++j0;
4073 continue;
4074 }
4075 auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
4076 SrcOps: {Src0[j0], Src1[j1], Tmp});
4077 Tmp = Mad.getReg(Idx: 0);
4078 if (!HaveSmallAccum)
4079 CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
4080 HaveSmallAccum = false;
4081
4082 ++j0;
4083 } while (j0 <= DstIndex);
4084
4085 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
4086 LocalAccum[0] = Unmerge.getReg(Idx: 0);
4087 if (LocalAccum.size() > 1)
4088 LocalAccum[1] = Unmerge.getReg(Idx: 1);
4089 }
4090
4091 return CarryOut;
4092 };
4093
4094 // Outer multiply loop, iterating over destination parts from least
4095 // significant to most significant parts.
4096 //
4097 // The columns of the following diagram correspond to the destination parts
4098 // affected by one iteration of the outer loop (ignoring boundary
4099 // conditions).
4100 //
4101 // Dest index relative to 2 * i: 1 0 -1
4102 // ------
4103 // Carries from previous iteration: e o
4104 // Even-aligned partial product sum: E E .
4105 // Odd-aligned partial product sum: O O
4106 //
4107 // 'o' is OddCarry, 'e' is EvenCarry.
4108 // EE and OO are computed from partial products via buildMadChain and use
4109 // accumulation where possible and appropriate.
4110 //
4111 Register SeparateOddCarry;
4112 Carry EvenCarry;
4113 Carry OddCarry;
4114
4115 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4116 Carry OddCarryIn = std::move(OddCarry);
4117 Carry EvenCarryIn = std::move(EvenCarry);
4118 OddCarry.clear();
4119 EvenCarry.clear();
4120
4121 // Partial products at offset 2 * i.
4122 if (2 * i < Accum.size()) {
4123 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4124 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4125 }
4126
4127 // Partial products at offset 2 * i - 1.
4128 if (i > 0) {
4129 if (!SeparateOddAlignedProducts) {
4130 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4131 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4132 } else {
4133 bool IsHighest = 2 * i >= Accum.size();
4134 Register SeparateOddOut[2];
4135 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4136 .take_front(N: IsHighest ? 1 : 2);
4137 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4138
4139 MachineInstr *Lo;
4140
4141 if (i == 1) {
4142 if (!IsHighest)
4143 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4144 else
4145 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4146 } else {
4147 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4148 CarryIn: SeparateOddCarry);
4149 }
4150 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4151
4152 if (!IsHighest) {
4153 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4154 CarryIn: Lo->getOperand(i: 1).getReg());
4155 Accum[2 * i] = Hi.getReg(Idx: 0);
4156 SeparateOddCarry = Hi.getReg(Idx: 1);
4157 }
4158 }
4159 }
4160
4161 // Add in the carries from the previous iteration
4162 if (i > 0) {
4163 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4164 EvenCarryIn.push_back(Elt: CarryOut);
4165
4166 if (2 * i < Accum.size()) {
4167 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4168 OddCarry.push_back(Elt: CarryOut);
4169 }
4170 }
4171 }
4172}
4173
4174// Custom narrowing of wide multiplies using wide multiply-add instructions.
4175//
4176// TODO: If the multiply is followed by an addition, we should attempt to
4177// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4178bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4179 MachineInstr &MI) const {
4180 assert(ST.hasMad64_32());
4181 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4182
4183 MachineIRBuilder &B = Helper.MIRBuilder;
4184 MachineRegisterInfo &MRI = *B.getMRI();
4185
4186 Register DstReg = MI.getOperand(i: 0).getReg();
4187 Register Src0 = MI.getOperand(i: 1).getReg();
4188 Register Src1 = MI.getOperand(i: 2).getReg();
4189
4190 LLT Ty = MRI.getType(Reg: DstReg);
4191 assert(Ty.isScalar());
4192
4193 unsigned Size = Ty.getSizeInBits();
4194 unsigned NumParts = Size / 32;
4195 assert((Size % 32) == 0);
4196 assert(NumParts >= 2);
4197
4198 // Whether to use MAD_64_32 for partial products whose high half is
4199 // discarded. This avoids some ADD instructions but risks false dependency
4200 // stalls on some subtargets in some cases.
4201 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4202
4203 // Whether to compute odd-aligned partial products separately. This is
4204 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4205 // in an even-aligned VGPR.
4206 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4207
4208 LLT S32 = LLT::scalar(SizeInBits: 32);
4209 SmallVector<Register, 2> Src0Parts, Src1Parts;
4210 for (unsigned i = 0; i < NumParts; ++i) {
4211 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4212 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4213 }
4214 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4215 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4216
4217 SmallVector<Register, 2> AccumRegs(NumParts);
4218 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4219 SeparateOddAlignedProducts);
4220
4221 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4222 MI.eraseFromParent();
4223 return true;
4224}
4225
4226// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4227// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4228// case with a single min instruction instead of a compare+select.
4229bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4230 MachineRegisterInfo &MRI,
4231 MachineIRBuilder &B) const {
4232 Register Dst = MI.getOperand(i: 0).getReg();
4233 Register Src = MI.getOperand(i: 1).getReg();
4234 LLT DstTy = MRI.getType(Reg: Dst);
4235 LLT SrcTy = MRI.getType(Reg: Src);
4236
4237 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4238 ? AMDGPU::G_AMDGPU_FFBH_U32
4239 : AMDGPU::G_AMDGPU_FFBL_B32;
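 // FFBH/FFBL return -1 (all ones) when no bit is set, so clamping with umin
 // against the source bit width produces the expected result for a zero input.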
4240 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4241 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4242
4243 MI.eraseFromParent();
4244 return true;
4245}
4246
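// Lower ctlz_zero_undef of a sub-32-bit value by any-extending it to 32 bits
// and shifting it into the high bits, so that G_AMDGPU_FFBH_U32 counts the
// same number of leading zeros as the original narrow type.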
4247bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4248 MachineRegisterInfo &MRI,
4249 MachineIRBuilder &B) const {
4250 Register Dst = MI.getOperand(i: 0).getReg();
4251 Register Src = MI.getOperand(i: 1).getReg();
4252 LLT SrcTy = MRI.getType(Reg: Src);
4253 TypeSize NumBits = SrcTy.getSizeInBits();
4254
4255 assert(NumBits < 32u);
4256
4257 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4258 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4259 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4260 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4261 B.buildTrunc(Res: Dst, Op: Ctlz);
4262 MI.eraseFromParent();
4263 return true;
4264}
4265
4266// Check that this is a G_XOR x, -1
4267static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4268 if (MI.getOpcode() != TargetOpcode::G_XOR)
4269 return false;
4270 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4271 return ConstVal == -1;
4272}
4273
4274// Return the use branch instruction, or null if the usage is invalid.
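// Roughly, the pattern being matched is (with an optional G_XOR %c, -1 negating
// the condition in between):
//   %c = G_INTRINSIC ...      ; sole non-debug use of %c is the branch
//   G_BRCOND %c, %bb.target
//   G_BR %bb.other            ; or the G_BRCOND ends the block, in which case
//                             ; the next block becomes the unconditional target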
4275static MachineInstr *
4276verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4277 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4278 Register CondDef = MI.getOperand(i: 0).getReg();
4279 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4280 return nullptr;
4281
4282 MachineBasicBlock *Parent = MI.getParent();
4283 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4284
4285 if (isNot(MRI, MI: *UseMI)) {
4286 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4287 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4288 return nullptr;
4289
4290    // The condition being negated is about to be deleted, so erase this xor too.
4291 eraseInstr(MI&: *UseMI, MRI);
4292
4293 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4294 Negated = true;
4295 }
4296
4297 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4298 return nullptr;
4299
4300 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4301 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4302 if (Next == Parent->end()) {
4303 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4304 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4305 return nullptr;
4306 UncondBrTarget = &*NextMBB;
4307 } else {
4308 if (Next->getOpcode() != AMDGPU::G_BR)
4309 return nullptr;
4310 Br = &*Next;
4311 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4312 }
4313
4314 return UseMI;
4315}
4316
4317void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4318 MachineIRBuilder &B,
4319 const ArgDescriptor *Arg,
4320 const TargetRegisterClass *ArgRC,
4321 LLT ArgTy) const {
4322 MCRegister SrcReg = Arg->getRegister();
4323 assert(SrcReg.isPhysical() && "Physical register expected");
4324 assert(DstReg.isVirtual() && "Virtual register expected");
4325
4326 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4327 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4328 if (Arg->isMasked()) {
4329 // TODO: Should we try to emit this once in the entry block?
4330 const LLT S32 = LLT::scalar(SizeInBits: 32);
4331 const unsigned Mask = Arg->getMask();
4332 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
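    // For example, a field described by Mask = 0x000ffc00 gives Shift = 10, so
    // the value is extracted below as (LiveIn >> 10) & 0x3ff.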
4333
4334 Register AndMaskSrc = LiveIn;
4335
4336 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4337 // 0.
4338 if (Shift != 0) {
4339 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4340 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4341 }
4342
4343 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4344 } else {
4345 B.buildCopy(Res: DstReg, Op: LiveIn);
4346 }
4347}
4348
4349bool AMDGPULegalizerInfo::loadInputValue(
4350 Register DstReg, MachineIRBuilder &B,
4351 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4352 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4353 const ArgDescriptor *Arg = nullptr;
4354 const TargetRegisterClass *ArgRC;
4355 LLT ArgTy;
4356
4357 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4358 const ArgDescriptor WorkGroupIDX =
4359 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
4360 // If GridZ is not programmed in an entry function then the hardware will set
4361 // it to all zeros, so there is no need to mask the GridY value in the low
4362 // order bits.
4363 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4364 Reg: AMDGPU::TTMP7,
4365 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4366 const ArgDescriptor WorkGroupIDZ =
4367 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
4368 if (ST.hasArchitectedSGPRs() &&
4369 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4370 switch (ArgType) {
4371 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4372 Arg = &WorkGroupIDX;
4373 ArgRC = &AMDGPU::SReg_32RegClass;
4374 ArgTy = LLT::scalar(SizeInBits: 32);
4375 break;
4376 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4377 Arg = &WorkGroupIDY;
4378 ArgRC = &AMDGPU::SReg_32RegClass;
4379 ArgTy = LLT::scalar(SizeInBits: 32);
4380 break;
4381 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4382 Arg = &WorkGroupIDZ;
4383 ArgRC = &AMDGPU::SReg_32RegClass;
4384 ArgTy = LLT::scalar(SizeInBits: 32);
4385 break;
4386 default:
4387 break;
4388 }
4389 }
4390
4391 if (!Arg)
4392 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4393
4394 if (!Arg) {
4395 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4396 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4397 // case the pointer argument may be missing and we use null.
4398 B.buildConstant(Res: DstReg, Val: 0);
4399 return true;
4400 }
4401
4402 // It's undefined behavior if a function marked with the amdgpu-no-*
4403 // attributes uses the corresponding intrinsic.
4404 B.buildUndef(Res: DstReg);
4405 return true;
4406 }
4407
4408 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4409 return false; // TODO: Handle these
4410 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4411 return true;
4412}
4413
4414bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4415 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4416 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4417 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4418 return false;
4419
4420 MI.eraseFromParent();
4421 return true;
4422}
4423
4424static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4425 int64_t C) {
4426 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4427 MI.eraseFromParent();
4428 return true;
4429}
4430
4431bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4432 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4433 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4434 unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
4435 if (MaxID == 0)
4436 return replaceWithConstant(B, MI, C: 0);
4437
4438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4439 const ArgDescriptor *Arg;
4440 const TargetRegisterClass *ArgRC;
4441 LLT ArgTy;
4442 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4443
4444 Register DstReg = MI.getOperand(i: 0).getReg();
4445 if (!Arg) {
4446 // It's undefined behavior if a function marked with the amdgpu-no-*
4447 // attributes uses the corresponding intrinsic.
4448 B.buildUndef(Res: DstReg);
4449 MI.eraseFromParent();
4450 return true;
4451 }
4452
4453 if (Arg->isMasked()) {
4454 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4455 // masking operations anyway.
4456 //
4457 // TODO: We could assert the top bit is 0 for the source copy.
4458 if (!loadInputValue(DstReg, B, ArgType))
4459 return false;
4460 } else {
4461 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4462 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4463 return false;
4464 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4465 }
4466
4467 MI.eraseFromParent();
4468 return true;
4469}
4470
4471Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4472 int64_t Offset) const {
4473 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4474 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4475
4476 // TODO: If we passed in the base kernel offset we could have a better
4477 // alignment than 4, but we don't really need it.
4478 if (!loadInputValue(DstReg: KernArgReg, B,
4479 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4480 llvm_unreachable("failed to find kernarg segment ptr");
4481
4482 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4483 // TODO: Should get nuw
4484 return B.buildPtrAdd(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4485}
4486
4487/// Legalize a value that's loaded from kernel arguments. This is only used by
4488/// legacy intrinsics.
4489bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4490 MachineIRBuilder &B,
4491 uint64_t Offset,
4492 Align Alignment) const {
4493 Register DstReg = MI.getOperand(i: 0).getReg();
4494
4495 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4496 "unexpected kernarg parameter type");
4497
4498 Register Ptr = getKernargParameterPtr(B, Offset);
4499 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4500 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo, Alignment: Align(4),
4501 MMOFlags: MachineMemOperand::MODereferenceable |
4502 MachineMemOperand::MOInvariant);
4503 MI.eraseFromParent();
4504 return true;
4505}
4506
4507bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4508 MachineRegisterInfo &MRI,
4509 MachineIRBuilder &B) const {
4510 Register Dst = MI.getOperand(i: 0).getReg();
4511 LLT DstTy = MRI.getType(Reg: Dst);
4512 LLT S16 = LLT::scalar(SizeInBits: 16);
4513 LLT S32 = LLT::scalar(SizeInBits: 32);
4514 LLT S64 = LLT::scalar(SizeInBits: 64);
4515
4516 if (DstTy == S16)
4517 return legalizeFDIV16(MI, MRI, B);
4518 if (DstTy == S32)
4519 return legalizeFDIV32(MI, MRI, B);
4520 if (DstTy == S64)
4521 return legalizeFDIV64(MI, MRI, B);
4522
4523 return false;
4524}
4525
4526void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4527 Register DstDivReg,
4528 Register DstRemReg,
4529 Register X,
4530 Register Y) const {
4531 const LLT S1 = LLT::scalar(SizeInBits: 1);
4532 const LLT S32 = LLT::scalar(SizeInBits: 32);
4533
4534 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4535 // algorithm used here.
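  // As a rough sketch: Z below is a fixed-point estimate of 2^32 / Y derived
  // from the hardware reciprocal and tightened by one Newton-Raphson step;
  // Q = umulh(X, Z) then underestimates X / Y by at most a couple of units,
  // which is why two conditional "R >= Y" fix-up steps are enough to reach the
  // exact quotient and remainder.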
4536
4537 // Initial estimate of inv(y).
4538 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4539 auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
4540 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4541 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4542 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4543
4544  // One round of UNR (Newton-Raphson refinement of the reciprocal).
4545 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4546 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4547 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4548
4549 // Quotient/remainder estimate.
4550 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4551 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4552
4553 // First quotient/remainder refinement.
4554 auto One = B.buildConstant(Res: S32, Val: 1);
4555 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4556 if (DstDivReg)
4557 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4558 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4559
4560 // Second quotient/remainder refinement.
4561 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4562 if (DstDivReg)
4563 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4564
4565 if (DstRemReg)
4566 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4567}
4568
4569// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4570//
4571// Return lo, hi of result
4572//
4573// %cvt.lo = G_UITOFP Val.lo
4574// %cvt.hi = G_UITOFP Val.hi
4575// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4576// %rcp = G_AMDGPU_RCP_IFLAG %mad
4577// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4578// %mul2 = G_FMUL %mul1, 2**(-32)
4579// %trunc = G_INTRINSIC_TRUNC %mul2
4580// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4581// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4582static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4583 Register Val) {
4584 const LLT S32 = LLT::scalar(SizeInBits: 32);
4585 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4586
4587 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4588 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4589
4590 auto Mad = B.buildFMAD(
4591 Dst: S32, Src0: CvtHi, // 2**32
4592 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4593
4594 auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
4595 auto Mul1 = B.buildFMul(
4596 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4597
4598 // 2**(-32)
4599 auto Mul2 = B.buildFMul(
4600 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4601 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4602
4603 // -(2**32)
4604 auto Mad2 = B.buildFMAD(
4605 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4606 Src2: Mul1);
4607
4608 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4609 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4610
4611 return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
4612}
4613
4614void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4615 Register DstDivReg,
4616 Register DstRemReg,
4617 Register Numer,
4618 Register Denom) const {
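  // Sketch of the structure below: emitReciprocalU64 yields a fixed-point
  // estimate of 2^64 / Denom, which is sharpened by two rounds of
  // multiply-high Newton-Raphson refinement (Add1, Add2). The quotient
  // estimate MulHi3 = umulh(Numer, Add2) can still be slightly low, so the
  // remainder is compared against Denom twice and the quotient/remainder are
  // conditionally bumped (the C3/C6 selects) to land on the exact result.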
4619 const LLT S32 = LLT::scalar(SizeInBits: 32);
4620 const LLT S64 = LLT::scalar(SizeInBits: 64);
4621 const LLT S1 = LLT::scalar(SizeInBits: 1);
4622 Register RcpLo, RcpHi;
4623
4624 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4625
4626 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4627
4628 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4629 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4630
4631 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4632 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4633
4634 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4635 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4636 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4637
4638 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4639 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4640 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4641
4642 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4643 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
4644 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
4645 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
4646 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
4647
4648 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
4649 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
4650 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
4651 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
4652
4653 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
4654 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
4655 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
4656
4657 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
4658 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
4659 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
4660 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
4661 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
4662 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
4663 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4664 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
4665 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
4666
4667 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
4668 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
4669 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
4670
4671 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4672 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
4673
4674 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
4675 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
4676
4677 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4678 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
4679
4680  // TODO: Here and below, portions of the code could be enclosed in if/endif.
4681  // Currently the control flow is unconditional and we have 4 selects after
4682  // the potential endif to substitute for PHIs.
4683
4684 // if C3 != 0 ...
4685 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
4686 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4687 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
4688 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
4689
4690 auto One64 = B.buildConstant(Res: S64, Val: 1);
4691 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
4692
4693 auto C4 =
4694 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
4695 auto C5 =
4696 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
4697 auto C6 = B.buildSelect(
4698 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
4699
4700 // if (C6 != 0)
4701 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
4702 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
4703
4704 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
4705 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
4706 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
4707
4708 // endif C6
4709 // endif C3
4710
4711 if (DstDivReg) {
4712 auto Sel1 = B.buildSelect(
4713 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
4714 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4715 Op0: Sel1, Op1: MulHi3);
4716 }
4717
4718 if (DstRemReg) {
4719 auto Sel2 = B.buildSelect(
4720 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
4721 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4722 Op0: Sel2, Op1: Sub1);
4723 }
4724}
4725
4726bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4727 MachineRegisterInfo &MRI,
4728 MachineIRBuilder &B) const {
4729 Register DstDivReg, DstRemReg;
4730 switch (MI.getOpcode()) {
4731 default:
4732 llvm_unreachable("Unexpected opcode!");
4733 case AMDGPU::G_UDIV: {
4734 DstDivReg = MI.getOperand(i: 0).getReg();
4735 break;
4736 }
4737 case AMDGPU::G_UREM: {
4738 DstRemReg = MI.getOperand(i: 0).getReg();
4739 break;
4740 }
4741 case AMDGPU::G_UDIVREM: {
4742 DstDivReg = MI.getOperand(i: 0).getReg();
4743 DstRemReg = MI.getOperand(i: 1).getReg();
4744 break;
4745 }
4746 }
4747
4748 const LLT S64 = LLT::scalar(SizeInBits: 64);
4749 const LLT S32 = LLT::scalar(SizeInBits: 32);
4750 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4751 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
4752 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4753 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4754
4755 if (Ty == S32)
4756 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
4757 else if (Ty == S64)
4758 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
4759 else
4760 return false;
4761
4762 MI.eraseFromParent();
4763 return true;
4764}
4765
4766bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4767 MachineRegisterInfo &MRI,
4768 MachineIRBuilder &B) const {
4769 const LLT S64 = LLT::scalar(SizeInBits: 64);
4770 const LLT S32 = LLT::scalar(SizeInBits: 32);
4771
4772 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4773 if (Ty != S32 && Ty != S64)
4774 return false;
4775
4776 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4777 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
4778 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4779
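  // Take absolute values branchlessly: with sign = v >> (bits - 1), which is
  // all ones for negative v and zero otherwise, (v + sign) ^ sign == |v|.
  // For example v = -5: sign = -1, (-5 + -1) ^ -1 == -6 ^ -1 == 5. The same
  // trick in reverse, (tmp ^ sign) - sign, reapplies the sign to the unsigned
  // division results at the end.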
4780 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
4781 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
4782 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
4783
4784 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4785 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4786
4787 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4788 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4789
4790 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4791 switch (MI.getOpcode()) {
4792 default:
4793 llvm_unreachable("Unexpected opcode!");
4794 case AMDGPU::G_SDIV: {
4795 DstDivReg = MI.getOperand(i: 0).getReg();
4796 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4797 break;
4798 }
4799 case AMDGPU::G_SREM: {
4800 DstRemReg = MI.getOperand(i: 0).getReg();
4801 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4802 break;
4803 }
4804 case AMDGPU::G_SDIVREM: {
4805 DstDivReg = MI.getOperand(i: 0).getReg();
4806 DstRemReg = MI.getOperand(i: 1).getReg();
4807 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4808 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4809 break;
4810 }
4811 }
4812
4813 if (Ty == S32)
4814 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
4815 else
4816 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
4817
4818 if (DstDivReg) {
4819 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
4820 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
4821 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
4822 }
4823
4824 if (DstRemReg) {
4825 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
4826 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
4827 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
4828 }
4829
4830 MI.eraseFromParent();
4831 return true;
4832}
4833
4834bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4835 MachineRegisterInfo &MRI,
4836 MachineIRBuilder &B) const {
4837 Register Res = MI.getOperand(i: 0).getReg();
4838 Register LHS = MI.getOperand(i: 1).getReg();
4839 Register RHS = MI.getOperand(i: 2).getReg();
4840 uint16_t Flags = MI.getFlags();
4841 LLT ResTy = MRI.getType(Reg: Res);
4842
4843 const MachineFunction &MF = B.getMF();
4844 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn) ||
4845 MF.getTarget().Options.UnsafeFPMath;
4846
4847 if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
4848 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
4849 return false;
4850
4851    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to the
4852    // CI documentation they have a worst-case error of 1 ulp.
4853 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4854 // use it as long as we aren't trying to use denormals.
4855 //
4856    // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4857
4858 // 1 / x -> RCP(x)
4859 if (CLHS->isExactlyValue(V: 1.0)) {
4860 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4861 .addUse(RegNo: RHS)
4862 .setMIFlags(Flags);
4863
4864 MI.eraseFromParent();
4865 return true;
4866 }
4867
4868 // -1 / x -> RCP( FNEG(x) )
4869 if (CLHS->isExactlyValue(V: -1.0)) {
4870 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
4871 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4872 .addUse(RegNo: FNeg.getReg(Idx: 0))
4873 .setMIFlags(Flags);
4874
4875 MI.eraseFromParent();
4876 return true;
4877 }
4878 }
4879
4880 // For f16 require afn or arcp.
4881 // For f32 require afn.
4882 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
4883 !MI.getFlag(Flag: MachineInstr::FmArcp)))
4884 return false;
4885
4886 // x / y -> x * (1.0 / y)
4887 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4888 .addUse(RegNo: RHS)
4889 .setMIFlags(Flags);
4890 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
4891
4892 MI.eraseFromParent();
4893 return true;
4894}
4895
4896bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4897 MachineRegisterInfo &MRI,
4898 MachineIRBuilder &B) const {
4899 Register Res = MI.getOperand(i: 0).getReg();
4900 Register X = MI.getOperand(i: 1).getReg();
4901 Register Y = MI.getOperand(i: 2).getReg();
4902 uint16_t Flags = MI.getFlags();
4903 LLT ResTy = MRI.getType(Reg: Res);
4904
4905 const MachineFunction &MF = B.getMF();
4906 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4907 MI.getFlag(Flag: MachineInstr::FmAfn);
4908
4909 if (!AllowInaccurateRcp)
4910 return false;
4911
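  // One pass of the standard reciprocal refinement: with r ~= 1/y,
  //   r' = r + r * (1 - y * r)
  // roughly doubles the number of accurate bits, so two passes are used here,
  // followed by a single residual correction on the quotient itself:
  //   ret' = ret + r * (x - y * ret).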
4912 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
4913 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
4914
4915 auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4916 .addUse(RegNo: Y)
4917 .setMIFlags(Flags);
4918
4919 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4920 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
4921
4922 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4923 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
4924
4925 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
4926 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
4927
4928 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
4929 MI.eraseFromParent();
4930 return true;
4931}
4932
4933bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4934 MachineRegisterInfo &MRI,
4935 MachineIRBuilder &B) const {
4936 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4937 return true;
4938
4939 Register Res = MI.getOperand(i: 0).getReg();
4940 Register LHS = MI.getOperand(i: 1).getReg();
4941 Register RHS = MI.getOperand(i: 2).getReg();
4942
4943 uint16_t Flags = MI.getFlags();
4944
4945 LLT S16 = LLT::scalar(SizeInBits: 16);
4946 LLT S32 = LLT::scalar(SizeInBits: 32);
4947
4948 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4949 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4950 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4951 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4952 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4953  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q += err * rcp
4954 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4955 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4956 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4957 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4958 // q16.u = opx(V_CVT_F16_F32, q32.u);
4959 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4960
4961 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
4962 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
4963 auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
4964 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
4965 .addUse(RegNo: RHSExt.getReg(Idx: 0))
4966 .setMIFlags(Flags);
4967 auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
4968 MachineInstrBuilder Err;
4969 if (ST.hasMadMacF32Insts()) {
4970 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4971 Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
4972 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4973 } else {
4974 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4975 Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
4976 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4977 }
4978 auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
4979 Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
4980 Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
4981 auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
4982 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
4983 .addUse(RegNo: RDst.getReg(Idx: 0))
4984 .addUse(RegNo: RHS)
4985 .addUse(RegNo: LHS)
4986 .setMIFlags(Flags);
4987
4988 MI.eraseFromParent();
4989 return true;
4990}
4991
4992static constexpr unsigned SPDenormModeBitField =
4993 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
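// This selects the 2-bit FP32 denorm field at offset 4 of the MODE register;
// the FP64/FP16 denorm field occupies the next two bits, which is why
// toggleSPDenormMode below shifts the DP default left by 2 when it rewrites
// the combined 4-bit field via S_DENORM_MODE.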
4994
4995// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4996// that enable denorms; otherwise restore the function's default FP32 mode.
4997static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4998 const GCNSubtarget &ST,
4999 SIModeRegisterDefaults Mode) {
5000 // Set SP denorm mode to this value.
5001 unsigned SPDenormMode =
5002 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5003
5004 if (ST.hasDenormModeInst()) {
5005    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
5006 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5007
5008 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5009 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
5010 .addImm(Val: NewDenormModeValue);
5011
5012 } else {
5013 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
5014 .addImm(Val: SPDenormMode)
5015 .addImm(Val: SPDenormModeBitField);
5016 }
5017}
5018
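// f32 division via the div_scale / div_fmas / div_fixup sequence: both
// operands are pre-scaled to keep the reciprocal refinement in range, the
// refinement itself is a chain of FMAs (which is why FP32 denorm support is
// temporarily enabled below when the function's mode would otherwise flush),
// and div_fixup at the end restores the special-case behavior (infinities,
// NaNs, zero denominators) expected of a full-precision divide.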
5019bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5020 MachineRegisterInfo &MRI,
5021 MachineIRBuilder &B) const {
5022 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5023 return true;
5024
5025 Register Res = MI.getOperand(i: 0).getReg();
5026 Register LHS = MI.getOperand(i: 1).getReg();
5027 Register RHS = MI.getOperand(i: 2).getReg();
5028 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5029 SIModeRegisterDefaults Mode = MFI->getMode();
5030
5031 uint16_t Flags = MI.getFlags();
5032
5033 LLT S32 = LLT::scalar(SizeInBits: 32);
5034 LLT S1 = LLT::scalar(SizeInBits: 1);
5035
5036 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
5037
5038 auto DenominatorScaled =
5039 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5040 .addUse(RegNo: LHS)
5041 .addUse(RegNo: RHS)
5042 .addImm(Val: 0)
5043 .setMIFlags(Flags);
5044 auto NumeratorScaled =
5045 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5046 .addUse(RegNo: LHS)
5047 .addUse(RegNo: RHS)
5048 .addImm(Val: 1)
5049 .setMIFlags(Flags);
5050
5051 auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5052 .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
5053 .setMIFlags(Flags);
5054 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
5055
5056 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5057 const bool HasDynamicDenormals =
5058 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5059 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5060
5061 Register SavedSPDenormMode;
5062 if (!PreservesDenormals) {
5063 if (HasDynamicDenormals) {
5064 SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5065 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
5066 .addDef(RegNo: SavedSPDenormMode)
5067 .addImm(Val: SPDenormModeBitField);
5068 }
5069 toggleSPDenormMode(Enable: true, B, ST, Mode);
5070 }
5071
5072 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
5073 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
5074 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
5075 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
5076 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
5077 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
5078
5079 if (!PreservesDenormals) {
5080 if (HasDynamicDenormals) {
5081 assert(SavedSPDenormMode);
5082 B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
5083 .addReg(RegNo: SavedSPDenormMode)
5084 .addImm(Val: SPDenormModeBitField);
5085 } else
5086 toggleSPDenormMode(Enable: false, B, ST, Mode);
5087 }
5088
5089 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
5090 .addUse(RegNo: Fma4.getReg(Idx: 0))
5091 .addUse(RegNo: Fma1.getReg(Idx: 0))
5092 .addUse(RegNo: Fma3.getReg(Idx: 0))
5093 .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
5094 .setMIFlags(Flags);
5095
5096 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5097 .addUse(RegNo: Fmas.getReg(Idx: 0))
5098 .addUse(RegNo: RHS)
5099 .addUse(RegNo: LHS)
5100 .setMIFlags(Flags);
5101
5102 MI.eraseFromParent();
5103 return true;
5104}
5105
5106bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5107 MachineRegisterInfo &MRI,
5108 MachineIRBuilder &B) const {
5109 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5110 return true;
5111
5112 Register Res = MI.getOperand(i: 0).getReg();
5113 Register LHS = MI.getOperand(i: 1).getReg();
5114 Register RHS = MI.getOperand(i: 2).getReg();
5115
5116 uint16_t Flags = MI.getFlags();
5117
5118 LLT S64 = LLT::scalar(SizeInBits: 64);
5119 LLT S1 = LLT::scalar(SizeInBits: 1);
5120
5121 auto One = B.buildFConstant(Res: S64, Val: 1.0);
5122
5123 auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5124 .addUse(RegNo: LHS)
5125 .addUse(RegNo: RHS)
5126 .addImm(Val: 0)
5127 .setMIFlags(Flags);
5128
5129 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);
5130
5131 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
5132 .addUse(RegNo: DivScale0.getReg(Idx: 0))
5133 .setMIFlags(Flags);
5134
5135 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
5136 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
5137 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
5138
5139 auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5140 .addUse(RegNo: LHS)
5141 .addUse(RegNo: RHS)
5142 .addImm(Val: 1)
5143 .setMIFlags(Flags);
5144
5145 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5146 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
5147 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);
5148
5149 Register Scale;
5150 if (!ST.hasUsableDivScaleConditionOutput()) {
5151    // Work around a hardware bug on SI where the condition output from
5152    // div_scale is not usable.
5153
5154 LLT S32 = LLT::scalar(SizeInBits: 32);
5155
5156 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5157 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5158 auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
5159 auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);
5160
5161 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5162 Op1: Scale1Unmerge.getReg(Idx: 1));
5163 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5164 Op1: Scale0Unmerge.getReg(Idx: 1));
5165 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
5166 } else {
5167 Scale = DivScale1.getReg(Idx: 1);
5168 }
5169
5170 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
5171 .addUse(RegNo: Fma4.getReg(Idx: 0))
5172 .addUse(RegNo: Fma3.getReg(Idx: 0))
5173 .addUse(RegNo: Mul.getReg(Idx: 0))
5174 .addUse(RegNo: Scale)
5175 .setMIFlags(Flags);
5176
5177 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
5178 .addUse(RegNo: Fmas.getReg(Idx: 0))
5179 .addUse(RegNo: RHS)
5180 .addUse(RegNo: LHS)
5181 .setMIFlags(Flags);
5182
5183 MI.eraseFromParent();
5184 return true;
5185}
5186
5187bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5188 MachineRegisterInfo &MRI,
5189 MachineIRBuilder &B) const {
5190 Register Res0 = MI.getOperand(i: 0).getReg();
5191 Register Res1 = MI.getOperand(i: 1).getReg();
5192 Register Val = MI.getOperand(i: 2).getReg();
5193 uint16_t Flags = MI.getFlags();
5194
5195 LLT Ty = MRI.getType(Reg: Res0);
5196 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5197
5198 auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
5199 .addUse(RegNo: Val)
5200 .setMIFlags(Flags);
5201 auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
5202 .addUse(RegNo: Val)
5203 .setMIFlags(Flags);
5204
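  // On subtargets with the fract bug, frexp_mant/frexp_exp are not reliable
  // for +/-inf and NaN inputs, so for non-finite values fall back to returning
  // the input itself as the mantissa and 0 as the exponent.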
5205 if (ST.hasFractBug()) {
5206 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5207 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5208 auto IsFinite =
5209 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5210 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5211 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5212 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5213 }
5214
5215 B.buildCopy(Res: Res0, Op: Mant);
5216 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5217
5218 MI.eraseFromParent();
5219 return true;
5220}
5221
5222bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5223 MachineRegisterInfo &MRI,
5224 MachineIRBuilder &B) const {
5225 Register Res = MI.getOperand(i: 0).getReg();
5226 Register LHS = MI.getOperand(i: 2).getReg();
5227 Register RHS = MI.getOperand(i: 3).getReg();
5228 uint16_t Flags = MI.getFlags();
5229
5230 LLT S32 = LLT::scalar(SizeInBits: 32);
5231 LLT S1 = LLT::scalar(SizeInBits: 1);
5232
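  // Sketch of the scaling below: when |rhs| is very large (> 2^96) the plain
  // rcp-and-multiply sequence can lose range/precision, so the denominator is
  // pre-scaled by 2^-32 and the same factor is reapplied to the final product:
  //   sel * (lhs * rcp(rhs * sel)) ~= lhs / rhs,  with sel in {1.0, 2^-32}.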
5233 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5234 const APFloat C0Val(1.0f);
5235
5236 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5237 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5238 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5239
5240 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5241 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5242
5243 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5244
5245 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5246 .addUse(RegNo: Mul0.getReg(Idx: 0))
5247 .setMIFlags(Flags);
5248
5249 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5250
5251 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5252
5253 MI.eraseFromParent();
5254 return true;
5255}
5256
5257bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5258 MachineRegisterInfo &MRI,
5259 MachineIRBuilder &B) const {
5260  // Bypass the correct expansion that a standard promotion through G_FSQRT
5261  // would get. The f32 op is accurate enough for the f16 case.
5262 unsigned Flags = MI.getFlags();
5263 assert(!ST.has16BitInsts());
5264 const LLT F32 = LLT::scalar(SizeInBits: 32);
5265 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5266  auto Sqrt = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5267                  .addUse(RegNo: Ext.getReg(Idx: 0))
5268                  .setMIFlags(Flags);
5269  B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Sqrt, Flags);
5270 MI.eraseFromParent();
5271 return true;
5272}
5273
5274bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5275 MachineRegisterInfo &MRI,
5276 MachineIRBuilder &B) const {
5277 MachineFunction &MF = B.getMF();
5278 Register Dst = MI.getOperand(i: 0).getReg();
5279 Register X = MI.getOperand(i: 1).getReg();
5280 const unsigned Flags = MI.getFlags();
5281 const LLT S1 = LLT::scalar(SizeInBits: 1);
5282 const LLT F32 = LLT::scalar(SizeInBits: 32);
5283 const LLT I32 = LLT::scalar(SizeInBits: 32);
5284
5285 if (allowApproxFunc(MF, Flags)) {
5286 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
5287 .addUse(RegNo: X)
5288 .setMIFlags(Flags);
5289 MI.eraseFromParent();
5290 return true;
5291 }
5292
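  // Inputs below 2^-96 are scaled up by 2^32 before the square root so the
  // core sequence works on a value comfortably inside the normal range; since
  // sqrt(x * 2^32) == sqrt(x) * 2^16, the result is scaled back by 2^-16 below.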
5293 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5294 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5295 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5296 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5297 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5298
5299 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
5300 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5301 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
5302 .addUse(RegNo: SqrtX.getReg(Idx: 0))
5303 .setMIFlags(Flags);
5304
5305 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5306 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5307
5308 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5309 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5310
5311 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5312 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5313
5314 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5315 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5316
5317 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5318 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5319
5320 SqrtS =
5321 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5322
5323 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5324 SqrtS =
5325 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5326 } else {
5327 auto SqrtR =
5328 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5329 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5330
5331 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5332 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5333 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5334 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5335 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5336 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
5337 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5338 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5339 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
5340 }
5341
5342 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5343
5344 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5345
5346 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5347
5348 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5349 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5350
5351 MI.eraseFromParent();
5352 return true;
5353}
5354
5355bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5356 MachineRegisterInfo &MRI,
5357 MachineIRBuilder &B) const {
5358  // For the f64 type, the SQRT and RSQ instructions don't have the required
5359  // precision, so we apply Goldschmidt's algorithm to improve the result:
5360 //
5361 // y0 = rsq(x)
5362 // g0 = x * y0
5363 // h0 = 0.5 * y0
5364 //
5365 // r0 = 0.5 - h0 * g0
5366 // g1 = g0 * r0 + g0
5367 // h1 = h0 * r0 + h0
5368 //
5369 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5370 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5371 // h2 = h1 * r1 + h1
5372 //
5373 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5374 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5375 //
5376 // sqrt(x) = g3
5377
5378 const LLT S1 = LLT::scalar(SizeInBits: 1);
5379 const LLT S32 = LLT::scalar(SizeInBits: 32);
5380 const LLT F64 = LLT::scalar(SizeInBits: 64);
5381
5382 Register Dst = MI.getOperand(i: 0).getReg();
5383 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5384
5385 Register X = MI.getOperand(i: 1).getReg();
5386 unsigned Flags = MI.getFlags();
5387
5388 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5389
5390 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5391 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5392
5393 // Scale up input if it is too small.
5394 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5395 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5396 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
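  // sqrt(x * 2^256) == sqrt(x) * 2^128, so the matching correction at the end
  // is ldexp(result, -128) whenever the input was scaled.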
5397
5398 auto SqrtY =
5399 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5400
5401 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5402 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5403 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5404
5405 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5406 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5407
5408 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5409 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5410
5411 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5412 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5413
5414 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5415
5416 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5417 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5418
5419 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5420
5421 // Scale down the result.
5422 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5423 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5424 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5425
5426 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5427 // with finite only or nsz because rsq(+/-0) = +/-inf
5428
5429 // TODO: Check for DAZ and expand to subnormals
5430 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5431
5432 // If x is +INF, +0, or -0, use its original value
5433 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5434
5435 MI.eraseFromParent();
5436 return true;
5437}
5438
5439bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5440 MachineRegisterInfo &MRI,
5441 MachineIRBuilder &B) const {
5442 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5443 if (Ty == LLT::scalar(SizeInBits: 32))
5444 return legalizeFSQRTF32(MI, MRI, B);
5445 if (Ty == LLT::scalar(SizeInBits: 64))
5446 return legalizeFSQRTF64(MI, MRI, B);
5447 if (Ty == LLT::scalar(SizeInBits: 16))
5448 return legalizeFSQRTF16(MI, MRI, B);
5449 return false;
5450}
5451
5452// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5453// FIXME: Why do we handle this one but not other removed instructions?
5454//
5455// Reciprocal square root. The clamp prevents infinite results, clamping
5456// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5457// +-max_float.
5458bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5459 MachineRegisterInfo &MRI,
5460 MachineIRBuilder &B) const {
5461 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5462 return true;
5463
5464 Register Dst = MI.getOperand(i: 0).getReg();
5465 Register Src = MI.getOperand(i: 2).getReg();
5466 auto Flags = MI.getFlags();
5467
5468 LLT Ty = MRI.getType(Reg: Dst);
5469
5470 const fltSemantics *FltSemantics;
5471 if (Ty == LLT::scalar(SizeInBits: 32))
5472 FltSemantics = &APFloat::IEEEsingle();
5473 else if (Ty == LLT::scalar(SizeInBits: 64))
5474 FltSemantics = &APFloat::IEEEdouble();
5475 else
5476 return false;
5477
5478 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
5479 .addUse(RegNo: Src)
5480 .setMIFlags(Flags);
5481
5482  // We don't need to worry about the snan handling difference, since the rsq
5483  // result is already quieted (or not); use whichever form selects directly.
5484 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5485 const bool UseIEEE = MFI->getMode().IEEE;
5486
5487 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5488 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5489 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5490
5491 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5492
5493 if (UseIEEE)
5494 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5495 else
5496 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5497 MI.eraseFromParent();
5498 return true;
5499}
5500
5501// TODO: Fix pointer type handling
5502bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5503 MachineInstr &MI,
5504 Intrinsic::ID IID) const {
5505
5506 MachineIRBuilder &B = Helper.MIRBuilder;
5507 MachineRegisterInfo &MRI = *B.getMRI();
5508
5509 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5510 IID == Intrinsic::amdgcn_permlanex16;
5511 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5512 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5513
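  // Strategy for types the lane intrinsics cannot take directly: sub-32-bit
  // values are any-extended to 32 bits and truncated afterwards, while larger
  // types are unmerged into 32-bit pieces (or 64-bit pieces for DPP ops the
  // DP ALU can handle), the lane op is applied per piece, and the pieces are
  // merged back (with a bitcast when the piece type had to be changed).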
5514 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5515 Register Src2, LLT VT) -> Register {
5516 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
5517 switch (IID) {
5518 case Intrinsic::amdgcn_readfirstlane:
5519 case Intrinsic::amdgcn_permlane64:
5520 return LaneOp.getReg(Idx: 0);
5521 case Intrinsic::amdgcn_readlane:
5522 case Intrinsic::amdgcn_set_inactive:
5523 case Intrinsic::amdgcn_set_inactive_chain_arg:
5524 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
5525 case Intrinsic::amdgcn_writelane:
5526 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
5527 case Intrinsic::amdgcn_permlane16:
5528 case Intrinsic::amdgcn_permlanex16: {
5529 Register Src3 = MI.getOperand(i: 5).getReg();
5530 int64_t Src4 = MI.getOperand(i: 6).getImm();
5531 int64_t Src5 = MI.getOperand(i: 7).getImm();
5532 return LaneOp.addUse(RegNo: Src1)
5533 .addUse(RegNo: Src2)
5534 .addUse(RegNo: Src3)
5535 .addImm(Val: Src4)
5536 .addImm(Val: Src5)
5537 .getReg(Idx: 0);
5538 }
5539 case Intrinsic::amdgcn_mov_dpp8:
5540 return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
5541 case Intrinsic::amdgcn_update_dpp:
5542 return LaneOp.addUse(RegNo: Src1)
5543 .addImm(Val: MI.getOperand(i: 4).getImm())
5544 .addImm(Val: MI.getOperand(i: 5).getImm())
5545 .addImm(Val: MI.getOperand(i: 6).getImm())
5546 .addImm(Val: MI.getOperand(i: 7).getImm())
5547 .getReg(Idx: 0);
5548 default:
5549 llvm_unreachable("unhandled lane op");
5550 }
5551 };
5552
5553 Register DstReg = MI.getOperand(i: 0).getReg();
5554 Register Src0 = MI.getOperand(i: 2).getReg();
5555 Register Src1, Src2;
5556 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5557 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5558 Src1 = MI.getOperand(i: 3).getReg();
5559 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5560 Src2 = MI.getOperand(i: 4).getReg();
5561 }
5562 }
5563
5564 LLT Ty = MRI.getType(Reg: DstReg);
5565 unsigned Size = Ty.getSizeInBits();
5566
5567 unsigned SplitSize = 32;
5568 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5569 ST.hasDPALU_DPP() &&
5570 AMDGPU::isLegalDPALU_DPPControl(DC: MI.getOperand(i: 4).getImm()))
5571 SplitSize = 64;
5572
5573 if (Size == SplitSize) {
5574 // Already legal
5575 return true;
5576 }
5577
5578 if (Size < 32) {
5579 Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);
5580
5581 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5582 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
5583
5584 if (IID == Intrinsic::amdgcn_writelane)
5585 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
5586
5587 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5588 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
5589 MI.eraseFromParent();
5590 return true;
5591 }
5592
5593 if (Size % SplitSize != 0)
5594 return false;
5595
5596 LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
5597 bool NeedsBitcast = false;
5598 if (Ty.isVector()) {
5599 LLT EltTy = Ty.getElementType();
5600 unsigned EltSize = EltTy.getSizeInBits();
5601 if (EltSize == SplitSize) {
5602 PartialResTy = EltTy;
5603 } else if (EltSize == 16 || EltSize == 32) {
5604 unsigned NElem = SplitSize / EltSize;
5605 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
5606 } else {
5607 // Handle all other cases via S32/S64 pieces
5608 NeedsBitcast = true;
5609 }
5610 }
5611
5612 SmallVector<Register, 4> PartialRes;
5613 unsigned NumParts = Size / SplitSize;
5614 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
5615 MachineInstrBuilder Src1Parts, Src2Parts;
5616
5617 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5618 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
5619
5620 if (IID == Intrinsic::amdgcn_writelane)
5621 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
5622
5623 for (unsigned i = 0; i < NumParts; ++i) {
5624 Src0 = Src0Parts.getReg(Idx: i);
5625
5626 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5627 Src1 = Src1Parts.getReg(Idx: i);
5628
5629 if (IID == Intrinsic::amdgcn_writelane)
5630 Src2 = Src2Parts.getReg(Idx: i);
5631
5632 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
5633 }
5634
5635 if (NeedsBitcast)
5636 B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
5637 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
5638 else
5639 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
5640
5641 MI.eraseFromParent();
5642 return true;
5643}
5644
5645bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5646 MachineRegisterInfo &MRI,
5647 MachineIRBuilder &B) const {
5648 uint64_t Offset =
5649 ST.getTargetLowering()->getImplicitParameterOffset(
5650 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
5651 LLT DstTy = MRI.getType(Reg: DstReg);
5652 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
5653
5654 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
5655 if (!loadInputValue(DstReg: KernargPtrReg, B,
5656 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5657 return false;
5658
5659 // FIXME: This should be nuw
5660 B.buildPtrAdd(Res: DstReg, Op0: KernargPtrReg, Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
5661 return true;
5662}
5663
5664/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
5665/// bits of the pointer and replace them with the stride argument (shifted into
5666/// place), then merge_values everything together. In the common case of a raw
5667/// buffer (the stride component is 0), we can just AND off those upper bits.
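/// For illustration only (a hypothetical example, not taken from the source):
/// given a pointer {Lo, Hi} and Stride = 16, the resulting descriptor is
/// {Lo, (Hi & 0xffff) | (16 << 16), NumRecords, Flags}; with a known-zero
/// stride the second dword is simply Hi & 0xffff.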
5668bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5669 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5670 Register Result = MI.getOperand(i: 0).getReg();
5671 Register Pointer = MI.getOperand(i: 2).getReg();
5672 Register Stride = MI.getOperand(i: 3).getReg();
5673 Register NumRecords = MI.getOperand(i: 4).getReg();
5674 Register Flags = MI.getOperand(i: 5).getReg();
5675
5676 LLT S32 = LLT::scalar(SizeInBits: 32);
5677
5678 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5679 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
5680 Register LowHalf = Unmerge.getReg(Idx: 0);
5681 Register HighHalf = Unmerge.getReg(Idx: 1);
5682
5683 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
5684 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
5685
5686 MachineInstrBuilder NewHighHalf = Masked;
5687 std::optional<ValueAndVReg> StrideConst =
5688 getIConstantVRegValWithLookThrough(VReg: Stride, MRI);
5689 if (!StrideConst || !StrideConst->Value.isZero()) {
5690 MachineInstrBuilder ShiftedStride;
5691 if (StrideConst) {
5692 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5693 uint32_t ShiftedStrideVal = StrideVal << 16;
5694 ShiftedStride = B.buildConstant(Res: S32, Val: ShiftedStrideVal);
5695 } else {
5696 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
5697 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
5698 ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
5699 }
5700 NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
5701 }
5702 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
5703 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
5704 MI.eraseFromParent();
5705 return true;
5706}
5707
5708bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5709 MachineRegisterInfo &MRI,
5710 MachineIRBuilder &B) const {
5711 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5712 if (!MFI->isEntryFunction()) {
5713 return legalizePreloadedArgIntrin(MI, MRI, B,
5714 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5715 }
5716
5717 Register DstReg = MI.getOperand(i: 0).getReg();
5718 if (!getImplicitArgPtr(DstReg, MRI, B))
5719 return false;
5720
5721 MI.eraseFromParent();
5722 return true;
5723}
5724
5725bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5726 MachineRegisterInfo &MRI,
5727 MachineIRBuilder &B) const {
5728 Function &F = B.getMF().getFunction();
5729 std::optional<uint32_t> KnownSize =
5730 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5731 if (KnownSize.has_value())
5732 B.buildConstant(Res: DstReg, Val: *KnownSize);
5733 return false;
5734}
5735
5736bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5737 MachineRegisterInfo &MRI,
5738 MachineIRBuilder &B) const {
5739
5740 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5741 if (!MFI->isEntryFunction()) {
5742 return legalizePreloadedArgIntrin(MI, MRI, B,
5743 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5744 }
5745
5746 Register DstReg = MI.getOperand(i: 0).getReg();
5747 if (!getLDSKernelId(DstReg, MRI, B))
5748 return false;
5749
5750 MI.eraseFromParent();
5751 return true;
5752}
5753
5754bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5755 MachineRegisterInfo &MRI,
5756 MachineIRBuilder &B,
5757 unsigned AddrSpace) const {
5758 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
5759 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: MI.getOperand(i: 2).getReg());
5760 Register Hi32 = Unmerge.getReg(Idx: 1);
5761
5762 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
5763 MI.eraseFromParent();
5764 return true;
5765}
5766
5767// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5768// offset (the offset that is included in bounds checking and swizzling, to be
5769// split between the instruction's voffset and immoffset fields) and soffset
5770// (the offset that is excluded from bounds checking and swizzling, to go in
5771// the instruction's soffset field). This function takes the first kind of
5772// offset and figures out how to split it between voffset and immoffset.
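// Illustrative example (assuming the subtarget's immoffset field holds values
// up to 4095): an incoming offset of `%base + 5000` is split into
// voffset = %base + 4096 and immoffset = 904, so the large power-of-two part
// stays in a form that can be CSEd with neighbouring accesses.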
5773std::pair<Register, unsigned>
5774AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5775 Register OrigOffset) const {
5776 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5777 Register BaseReg;
5778 unsigned ImmOffset;
5779 const LLT S32 = LLT::scalar(SizeInBits: 32);
5780 MachineRegisterInfo &MRI = *B.getMRI();
5781
5782 std::tie(args&: BaseReg, args&: ImmOffset) =
5783 AMDGPU::getBaseWithConstantOffset(MRI, Reg: OrigOffset);
5784
5785 // If BaseReg is a pointer, convert it to int.
5786 if (MRI.getType(Reg: BaseReg).isPointer())
5787 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
5788
5789 // If the immediate value is too big for the immoffset field, put only bits
5790 // that would normally fit in the immoffset field. The remaining value that
5791 // is copied/added for the voffset field is a large power of 2, and it
5792 // stands more chance of being CSEd with the copy/add for another similar
5793 // load/store.
5794  // However, do not do that rounding down if the constant offset is a
5795  // negative number, as it appears to be illegal to have a negative offset
5796  // in the vgpr, even if adding the immediate offset makes it positive.
5797 unsigned Overflow = ImmOffset & ~MaxImm;
5798 ImmOffset -= Overflow;
5799 if ((int32_t)Overflow < 0) {
5800 Overflow += ImmOffset;
5801 ImmOffset = 0;
5802 }
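  // Illustrative example (again assuming MaxImm == 4095): a constant offset of
  // -100 first yields Overflow = -4096 and ImmOffset = 3996; because Overflow
  // is negative, the whole -100 is folded back into the VGPR add and ImmOffset
  // is forced to 0.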
5803
5804 if (Overflow != 0) {
5805 if (!BaseReg) {
5806 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
5807 } else {
5808 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
5809 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
5810 }
5811 }
5812
5813 if (!BaseReg)
5814 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5815
5816 return std::pair(BaseReg, ImmOffset);
5817}
5818
5819/// Handle register layout differences for f16 images on some subtargets.
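/// Illustrative example (not from the source): on subtargets with unpacked
/// D16 VMEM, a <4 x s16> source becomes a <4 x s32> build_vector of
/// any-extended elements; packed subtargets keep <N x s16> data (only padding
/// <3 x s16> out to <4 x s16>), with extra repacking on subtargets that have
/// the image-store D16 bug.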
5820Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5821 MachineRegisterInfo &MRI,
5822 Register Reg,
5823 bool ImageStore) const {
5824 const LLT S16 = LLT::scalar(SizeInBits: 16);
5825 const LLT S32 = LLT::scalar(SizeInBits: 32);
5826 LLT StoreVT = MRI.getType(Reg);
5827 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5828
5829 if (ST.hasUnpackedD16VMem()) {
5830 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5831
5832 SmallVector<Register, 4> WideRegs;
5833 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5834 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5835
5836 int NumElts = StoreVT.getNumElements();
5837
5838 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
5839 .getReg(Idx: 0);
5840 }
5841
5842 if (ImageStore && ST.hasImageStoreD16Bug()) {
5843 if (StoreVT.getNumElements() == 2) {
5844 SmallVector<Register, 4> PackedRegs;
5845 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
5846 PackedRegs.push_back(Elt: Reg);
5847 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5848 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
5849 .getReg(Idx: 0);
5850 }
5851
5852 if (StoreVT.getNumElements() == 3) {
5853 SmallVector<Register, 4> PackedRegs;
5854 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5855 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5856 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5857 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
5858 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
5859 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5860 }
5861
5862 if (StoreVT.getNumElements() == 4) {
5863 SmallVector<Register, 4> PackedRegs;
5864 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5865 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
5866 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5867 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5868 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5869 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
5870 .getReg(Idx: 0);
5871 }
5872
5873 llvm_unreachable("invalid data type");
5874 }
5875
5876 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
5877 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
5878 .getReg(Idx: 0);
5879 }
5880 return Reg;
5881}
5882
5883Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
5884 Register VData, LLT MemTy,
5885 bool IsFormat) const {
5886 MachineRegisterInfo *MRI = B.getMRI();
5887 LLT Ty = MRI->getType(Reg: VData);
5888
5889 const LLT S16 = LLT::scalar(SizeInBits: 16);
5890
5891  // Fixup buffer resources themselves, which need to be cast to v4i32.
5892 if (hasBufferRsrcWorkaround(Ty))
5893 return castBufferRsrcToV4I32(Pointer: VData, B);
5894
5895 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5896 Ty = getBitcastRegisterType(Ty);
5897 VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
5898 }
5899  // Fixup illegal register types for 8-bit and 16-bit stores.
5900 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
5901 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
5902 return AnyExt;
5903 }
5904
5905 if (Ty.isVector()) {
5906 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5907 if (IsFormat)
5908 return handleD16VData(B, MRI&: *MRI, Reg: VData);
5909 }
5910 }
5911
5912 return VData;
5913}
5914
5915bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5916 LegalizerHelper &Helper,
5917 bool IsTyped,
5918 bool IsFormat) const {
5919 MachineIRBuilder &B = Helper.MIRBuilder;
5920 MachineRegisterInfo &MRI = *B.getMRI();
5921
5922 Register VData = MI.getOperand(i: 1).getReg();
5923 LLT Ty = MRI.getType(Reg: VData);
5924 LLT EltTy = Ty.getScalarType();
5925 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5926 const LLT S32 = LLT::scalar(SizeInBits: 32);
5927
5928 MachineMemOperand *MMO = *MI.memoperands_begin();
5929 const int MemSize = MMO->getSize().getValue();
5930 LLT MemTy = MMO->getMemoryType();
5931
5932 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5933
5934 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
5935 Register RSrc = MI.getOperand(i: 2).getReg();
5936
5937 unsigned ImmOffset;
5938
5939 // The typed intrinsics add an immediate after the registers.
5940 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5941
5942 // The struct intrinsic variants add one additional operand over raw.
5943 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5944 Register VIndex;
5945 int OpOffset = 0;
5946 if (HasVIndex) {
5947 VIndex = MI.getOperand(i: 3).getReg();
5948 OpOffset = 1;
5949 } else {
5950 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5951 }
5952
5953 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5954 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5955
5956 unsigned Format = 0;
5957 if (IsTyped) {
5958 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5959 ++OpOffset;
5960 }
5961
5962 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5963
5964 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5965
5966 unsigned Opc;
5967 if (IsTyped) {
5968 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5969 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5970 } else if (IsFormat) {
5971 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5972 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5973 } else {
5974 switch (MemSize) {
5975 case 1:
5976 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5977 break;
5978 case 2:
5979 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5980 break;
5981 default:
5982 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5983 break;
5984 }
5985 }
5986
5987 auto MIB = B.buildInstr(Opcode: Opc)
5988 .addUse(RegNo: VData) // vdata
5989 .addUse(RegNo: RSrc) // rsrc
5990 .addUse(RegNo: VIndex) // vindex
5991 .addUse(RegNo: VOffset) // voffset
5992 .addUse(RegNo: SOffset) // soffset
5993 .addImm(Val: ImmOffset); // offset(imm)
5994
5995 if (IsTyped)
5996 MIB.addImm(Val: Format);
5997
5998 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5999 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6000 .addMemOperand(MMO);
6001
6002 MI.eraseFromParent();
6003 return true;
6004}
6005
6006static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6007 Register VIndex, Register VOffset, Register SOffset,
6008 unsigned ImmOffset, unsigned Format,
6009 unsigned AuxiliaryData, MachineMemOperand *MMO,
6010 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6011 auto MIB = B.buildInstr(Opcode: Opc)
6012 .addDef(RegNo: LoadDstReg) // vdata
6013 .addUse(RegNo: RSrc) // rsrc
6014 .addUse(RegNo: VIndex) // vindex
6015 .addUse(RegNo: VOffset) // voffset
6016 .addUse(RegNo: SOffset) // soffset
6017 .addImm(Val: ImmOffset); // offset(imm)
6018
6019 if (IsTyped)
6020 MIB.addImm(Val: Format);
6021
6022 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6023 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6024 .addMemOperand(MMO);
6025}
6026
6027bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6028 LegalizerHelper &Helper,
6029 bool IsFormat,
6030 bool IsTyped) const {
6031 MachineIRBuilder &B = Helper.MIRBuilder;
6032 MachineRegisterInfo &MRI = *B.getMRI();
6033 GISelChangeObserver &Observer = Helper.Observer;
6034
6035 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6036 MachineMemOperand *MMO = *MI.memoperands_begin();
6037 const LLT MemTy = MMO->getMemoryType();
6038 const LLT S32 = LLT::scalar(SizeInBits: 32);
6039
6040 Register Dst = MI.getOperand(i: 0).getReg();
6041
6042 Register StatusDst;
6043 int OpOffset = 0;
6044 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6045 bool IsTFE = MI.getNumExplicitDefs() == 2;
6046 if (IsTFE) {
6047 StatusDst = MI.getOperand(i: 1).getReg();
6048 ++OpOffset;
6049 }
6050
6051 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
6052 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
6053
6054 // The typed intrinsics add an immediate after the registers.
6055 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6056
6057 // The struct intrinsic variants add one additional operand over raw.
6058 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6059 Register VIndex;
6060 if (HasVIndex) {
6061 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
6062 ++OpOffset;
6063 } else {
6064 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6065 }
6066
6067 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
6068 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6069
6070 unsigned Format = 0;
6071 if (IsTyped) {
6072 Format = MI.getOperand(i: 5 + OpOffset).getImm();
6073 ++OpOffset;
6074 }
6075
6076 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
6077 unsigned ImmOffset;
6078
6079 LLT Ty = MRI.getType(Reg: Dst);
6080  // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6081 // logic doesn't have to handle that case.
6082 if (hasBufferRsrcWorkaround(Ty)) {
6083 Observer.changingInstr(MI);
6084 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
6085 Observer.changedInstr(MI);
6086 Dst = MI.getOperand(i: 0).getReg();
6087 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6088 }
6089 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6090 Ty = getBitcastRegisterType(Ty);
6091 Observer.changingInstr(MI);
6092 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6093 Observer.changedInstr(MI);
6094 Dst = MI.getOperand(i: 0).getReg();
6095 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6096 }
6097
6098 LLT EltTy = Ty.getScalarType();
6099 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6100 const bool Unpacked = ST.hasUnpackedD16VMem();
6101
6102 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6103
6104 unsigned Opc;
6105
6106 // TODO: Support TFE for typed and narrow loads.
6107 if (IsTyped) {
6108 if (IsTFE)
6109 return false;
6110 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6111 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6112 } else if (IsFormat) {
6113 if (IsD16) {
6114 if (IsTFE)
6115 return false;
6116 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6117 } else {
6118 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6119 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6120 }
6121 } else {
6122 switch (MemTy.getSizeInBits()) {
6123 case 8:
6124 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6125 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6126 break;
6127 case 16:
6128 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6129 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6130 break;
6131 default:
6132 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6133 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6134 break;
6135 }
6136 }
6137
6138 if (IsTFE) {
6139 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
6140 unsigned NumLoadDWords = NumValueDWords + 1;
6141 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
6142 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
6143 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6144 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6145 if (MemTy.getSizeInBits() < 32) {
6146 Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6147 B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
6148 B.buildTrunc(Res: Dst, Op: ExtDst);
6149 } else if (NumValueDWords == 1) {
6150 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
6151 } else {
6152 SmallVector<Register, 5> LoadElts;
6153 for (unsigned I = 0; I != NumValueDWords; ++I)
6154 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
6155 LoadElts.push_back(Elt: StatusDst);
6156 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
6157 LoadElts.truncate(N: NumValueDWords);
6158 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
6159 }
6160 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6161 (IsD16 && !Ty.isVector())) {
6162 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6163 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6164 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6165 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6166 B.buildTrunc(Res: Dst, Op: LoadDstReg);
6167 } else if (Unpacked && IsD16 && Ty.isVector()) {
6168 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
6169 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
6170 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6171 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6172 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6173 // FIXME: G_TRUNC should work, but legalization currently fails
6174 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
6175 SmallVector<Register, 4> Repack;
6176 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6177 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6178 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
6179 } else {
6180 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6181 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6182 }
6183
6184 MI.eraseFromParent();
6185 return true;
6186}
6187
6188static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6189 switch (IntrID) {
6190 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6192 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6193 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6194 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6195 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6196 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6197 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6198 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6199 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6200 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6201 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6202 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6203 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6204 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6205 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6207 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6209 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6210 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6212 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6214 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6215 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6217 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6218 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6220 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6221 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6222 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6223 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6224 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6225 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6227 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6229 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6230 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6231 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6232 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6233 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6235 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6236 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6237 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6239 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6240 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6241 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6242 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6243 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6244 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6245 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6246 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6247 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6248 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6249 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6250 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6252 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6254 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6255 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6256 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6257 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6259 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6260 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6261 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6262 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6264 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6265 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6266 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6267 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6268 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6269 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6270 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6271 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6272 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6273 default:
6274 llvm_unreachable("unhandled atomic opcode");
6275 }
6276}
6277
6278bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6279 MachineIRBuilder &B,
6280 Intrinsic::ID IID) const {
6281 const bool IsCmpSwap =
6282 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6283 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6284 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6285 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6286
6287 Register Dst = MI.getOperand(i: 0).getReg();
6288 // Since we don't have 128-bit atomics, we don't need to handle the case of
6289  // p8 arguments to the atomic itself.
6290 Register VData = MI.getOperand(i: 2).getReg();
6291
6292 Register CmpVal;
6293 int OpOffset = 0;
6294
6295 if (IsCmpSwap) {
6296 CmpVal = MI.getOperand(i: 3).getReg();
6297 ++OpOffset;
6298 }
6299
6300 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6301 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6302 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6303
6304 // The struct intrinsic variants add one additional operand over raw.
6305 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6306 Register VIndex;
6307 if (HasVIndex) {
6308 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6309 ++OpOffset;
6310 } else {
6311 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6312 }
6313
6314 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6315 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6316 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6317
6318 MachineMemOperand *MMO = *MI.memoperands_begin();
6319
6320 unsigned ImmOffset;
6321 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6322
6323 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6324 .addDef(RegNo: Dst)
6325 .addUse(RegNo: VData); // vdata
6326
6327 if (IsCmpSwap)
6328 MIB.addReg(RegNo: CmpVal);
6329
6330 MIB.addUse(RegNo: RSrc) // rsrc
6331 .addUse(RegNo: VIndex) // vindex
6332 .addUse(RegNo: VOffset) // voffset
6333 .addUse(RegNo: SOffset) // soffset
6334 .addImm(Val: ImmOffset) // offset(imm)
6335 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6336 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6337 .addMemOperand(MMO);
6338
6339 MI.eraseFromParent();
6340 return true;
6341}
6342
6343/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6344/// vector with s16 typed elements.
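/// Illustrative example (not from the source): with A16 enabled, three s16
/// coordinates {x, y, z} are packed into two v2s16 dwords, {x, y} and
/// {z, undef}; an odd trailing gradient is padded with undef in the same way.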
6345static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6346 SmallVectorImpl<Register> &PackedAddrs,
6347 unsigned ArgOffset,
6348 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6349 bool IsA16, bool IsG16) {
6350 const LLT S16 = LLT::scalar(SizeInBits: 16);
6351 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6352 auto EndIdx = Intr->VAddrEnd;
6353
6354 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6355 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6356 if (!SrcOp.isReg())
6357 continue; // _L to _LZ may have eliminated this.
6358
6359 Register AddrReg = SrcOp.getReg();
6360
6361 if ((I < Intr->GradientStart) ||
6362 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6363 (I >= Intr->CoordStart && !IsA16)) {
6364 if ((I < Intr->GradientStart) && IsA16 &&
6365 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6366 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6367 // Special handling of bias when A16 is on. Bias is of type half but
6368        // occupies a full 32-bit slot.
6369 PackedAddrs.push_back(
6370 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6371 .getReg(Idx: 0));
6372 } else {
6373 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6374 "Bias needs to be converted to 16 bit in A16 mode");
6375 // Handle any gradient or coordinate operands that should not be packed
6376 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6377 PackedAddrs.push_back(Elt: AddrReg);
6378 }
6379 } else {
6380 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6381 // derivatives dx/dh and dx/dv are packed with undef.
6382 if (((I + 1) >= EndIdx) ||
6383 ((Intr->NumGradients / 2) % 2 == 1 &&
6384 (I == static_cast<unsigned>(Intr->GradientStart +
6385 (Intr->NumGradients / 2) - 1) ||
6386 I == static_cast<unsigned>(Intr->GradientStart +
6387 Intr->NumGradients - 1))) ||
6388 // Check for _L to _LZ optimization
6389 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6390 PackedAddrs.push_back(
6391 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6392 .getReg(Idx: 0));
6393 } else {
6394 PackedAddrs.push_back(
6395 Elt: B.buildBuildVector(
6396 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6397 .getReg(Idx: 0));
6398 ++I;
6399 }
6400 }
6401 }
6402}
6403
6404/// Convert from separate vaddr components to a single vector address register,
6405/// and replace the remaining operands with $noreg.
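/// Illustrative example (not from the source): four s32 vaddr operands are
/// replaced by one <4 x s32> build_vector in the first vaddr slot, and the
/// remaining three operands are set to $noreg.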
6406static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6407 int DimIdx, int NumVAddrs) {
6408 const LLT S32 = LLT::scalar(SizeInBits: 32);
6409 (void)S32;
6410 SmallVector<Register, 8> AddrRegs;
6411 for (int I = 0; I != NumVAddrs; ++I) {
6412 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6413 if (SrcOp.isReg()) {
6414 AddrRegs.push_back(Elt: SrcOp.getReg());
6415 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6416 }
6417 }
6418
6419 int NumAddrRegs = AddrRegs.size();
6420 if (NumAddrRegs != 1) {
6421 auto VAddr =
6422 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6423 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6424 }
6425
6426 for (int I = 1; I != NumVAddrs; ++I) {
6427 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6428 if (SrcOp.isReg())
6429 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
6430 }
6431}
6432
6433/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6434///
6435/// Depending on the subtarget, load/store with 16-bit element data need to be
6436/// rewritten to use the low half of 32-bit registers, or directly use a packed
6437/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6438/// registers.
6439///
6440/// We don't want to directly select image instructions just yet, but also want
6441/// to expose all register repacking to the legalizer/combiners. We also don't
6442/// want a selected instruction entering RegBankSelect. In order to avoid
6443/// defining a multitude of intermediate image instructions, directly hack on
6444/// the intrinsic's arguments. In cases like a16 addresses, this requires
6445/// padding now unnecessary arguments with $noreg.
6446bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6447 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6448 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6449
6450 const MachineFunction &MF = *MI.getMF();
6451 const unsigned NumDefs = MI.getNumExplicitDefs();
6452 const unsigned ArgOffset = NumDefs + 1;
6453 bool IsTFE = NumDefs == 2;
6454 // We are only processing the operands of d16 image operations on subtargets
6455 // that use the unpacked register layout, or need to repack the TFE result.
6456
6457 // TODO: Do we need to guard against already legalized intrinsics?
6458 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6459 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6460
6461 MachineRegisterInfo *MRI = B.getMRI();
6462 const LLT S32 = LLT::scalar(SizeInBits: 32);
6463 const LLT S16 = LLT::scalar(SizeInBits: 16);
6464 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6465
6466 unsigned DMask = 0;
6467 Register VData;
6468 LLT Ty;
6469
6470 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6471 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6472 Ty = MRI->getType(Reg: VData);
6473 }
6474
6475 const bool IsAtomicPacked16Bit =
6476 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6477 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6478
6479 // Check for 16 bit addresses and pack if true.
6480 LLT GradTy =
6481 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6482 LLT AddrTy =
6483 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6484 const bool IsG16 =
6485 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6486 const bool IsA16 = AddrTy == S16;
6487 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6488
6489 int DMaskLanes = 0;
6490 if (!BaseOpcode->Atomic) {
6491 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6492 if (BaseOpcode->Gather4) {
6493 DMaskLanes = 4;
6494 } else if (DMask != 0) {
6495 DMaskLanes = llvm::popcount(Value: DMask);
6496 } else if (!IsTFE && !BaseOpcode->Store) {
6497 // If dmask is 0, this is a no-op load. This can be eliminated.
6498 B.buildUndef(Res: MI.getOperand(i: 0));
6499 MI.eraseFromParent();
6500 return true;
6501 }
6502 }
6503
6504 Observer.changingInstr(MI);
6505 auto ChangedInstr = make_scope_exit(F: [&] { Observer.changedInstr(MI); });
6506
6507 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6508 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6509 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6510 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6511 unsigned NewOpcode = LoadOpcode;
6512 if (BaseOpcode->Store)
6513 NewOpcode = StoreOpcode;
6514 else if (BaseOpcode->NoReturn)
6515 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6516
6517 // Track that we legalized this
6518 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6519
6520  // Expecting to get an error flag since TFC is on and dmask is 0. Force
6521  // dmask to be at least 1, otherwise the instruction will fail.
6522 if (IsTFE && DMask == 0) {
6523 DMask = 0x1;
6524 DMaskLanes = 1;
6525 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6526 }
6527
6528 if (BaseOpcode->Atomic) {
6529 Register VData0 = MI.getOperand(i: 2).getReg();
6530 LLT Ty = MRI->getType(Reg: VData0);
6531
6532 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6533 if (Ty.isVector() && !IsAtomicPacked16Bit)
6534 return false;
6535
6536 if (BaseOpcode->AtomicX2) {
6537 Register VData1 = MI.getOperand(i: 3).getReg();
6538 // The two values are packed in one register.
6539 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6540 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6541 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6542 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
6543 }
6544 }
6545
6546 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6547
6548 // Rewrite the addressing register layout before doing anything else.
6549 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6550 // 16 bit gradients are supported, but are tied to the A16 control
6551 // so both gradients and addresses must be 16 bit
6552 return false;
6553 }
6554
6555 if (IsA16 && !ST.hasA16()) {
6556 // A16 not supported
6557 return false;
6558 }
6559
6560 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6561 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6562
6563 if (IsA16 || IsG16) {
6564 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6565 // instructions expect VGPR_32
6566 SmallVector<Register, 4> PackedRegs;
6567
6568 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6569
6570 // See also below in the non-a16 branch
6571 const bool UseNSA = ST.hasNSAEncoding() &&
6572 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6573 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6574 const bool UsePartialNSA =
6575 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6576
6577 if (UsePartialNSA) {
6578 // Pack registers that would go over NSAMaxSize into last VAddr register
6579 LLT PackedAddrTy =
6580 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6581 auto Concat = B.buildConcatVectors(
6582 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6583 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6584 PackedRegs.resize(N: NSAMaxSize);
6585 } else if (!UseNSA && PackedRegs.size() > 1) {
6586 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6587 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6588 PackedRegs[0] = Concat.getReg(Idx: 0);
6589 PackedRegs.resize(N: 1);
6590 }
6591
6592 const unsigned NumPacked = PackedRegs.size();
6593 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6594 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6595 if (!SrcOp.isReg()) {
6596 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6597 continue;
6598 }
6599
6600 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6601
6602 if (I - Intr->VAddrStart < NumPacked)
6603 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6604 else
6605 SrcOp.setReg(AMDGPU::NoRegister);
6606 }
6607 } else {
6608 // If the register allocator cannot place the address registers contiguously
6609 // without introducing moves, then using the non-sequential address encoding
6610 // is always preferable, since it saves VALU instructions and is usually a
6611 // wash in terms of code size or even better.
6612 //
6613 // However, we currently have no way of hinting to the register allocator
6614 // that MIMG addresses should be placed contiguously when it is possible to
6615 // do so, so force non-NSA for the common 2-address case as a heuristic.
6616 //
6617 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6618 // allocation when possible.
6619 //
6620 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6621 // set of the remaining addresses.
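    // Illustrative example (hypothetical numbers): with NSAMaxSize == 5 and 7
    // vaddr operands, partial NSA leaves the first four registers separate and
    // packs the remaining three into a single <3 x s32> operand.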
6622 const bool UseNSA = ST.hasNSAEncoding() &&
6623 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6624 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6625 const bool UsePartialNSA =
6626 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6627
6628 if (UsePartialNSA) {
6629 convertImageAddrToPacked(B, MI,
6630 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6631 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
6632 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6633 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
6634 NumVAddrs: Intr->NumVAddrs);
6635 }
6636 }
6637
6638 int Flags = 0;
6639 if (IsA16)
6640 Flags |= 1;
6641 if (IsG16)
6642 Flags |= 2;
6643 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
6644
6645 if (BaseOpcode->NoReturn) { // No TFE for stores?
6646 // TODO: Handle dmask trim
6647 if (!Ty.isVector() || !IsD16)
6648 return true;
6649
6650 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
6651 if (RepackedReg != VData) {
6652 MI.getOperand(i: 1).setReg(RepackedReg);
6653 }
6654
6655 return true;
6656 }
6657
6658 Register DstReg = MI.getOperand(i: 0).getReg();
6659 const LLT EltTy = Ty.getScalarType();
6660 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6661
6662 // Confirm that the return type is large enough for the dmask specified
6663 if (NumElts < DMaskLanes)
6664 return false;
6665
6666 if (NumElts > 4 || DMaskLanes > 4)
6667 return false;
6668
6669  // Image atomic instructions use DMask to specify how many bits the
6670  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6671 // DMaskLanes for image atomic has default value '0'.
6672 // We must be sure that atomic variants (especially packed) will not be
6673 // truncated from v2s16 or v4s16 to s16 type.
6674 //
6675  // ChangeElementCount will be needed for image loads, where Ty is always scalar.
6676 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6677 const LLT AdjustedTy =
6678 DMaskLanes == 0
6679 ? Ty
6680 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
6681
6682 // The raw dword aligned data component of the load. The only legal cases
6683 // where this matters should be when using the packed D16 format, for
6684  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6685 LLT RoundedTy;
6686
6687 // S32 vector to cover all data, plus TFE result element.
6688 LLT TFETy;
6689
6690 // Register type to use for each loaded component. Will be S32 or V2S16.
6691 LLT RegTy;
6692
6693 if (IsD16 && ST.hasUnpackedD16VMem()) {
6694 RoundedTy =
6695 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
6696 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
6697 RegTy = S32;
6698 } else {
6699 unsigned EltSize = EltTy.getSizeInBits();
6700 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6701 unsigned RoundedSize = 32 * RoundedElts;
6702 RoundedTy = LLT::scalarOrVector(
6703 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
6704 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
6705 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6706 }
6707
6708 // The return type does not need adjustment.
6709 // TODO: Should we change s16 case to s32 or <2 x s16>?
6710 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6711 return true;
6712
6713 Register Dst1Reg;
6714
6715 // Insert after the instruction.
6716 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
6717
6718 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6719 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6720 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6721 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6722
6723 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
6724
6725 MI.getOperand(i: 0).setReg(NewResultReg);
6726
6727 // In the IR, TFE is supposed to be used with a 2 element struct return
6728 // type. The instruction really returns these two values in one contiguous
6729 // register, with one additional dword beyond the loaded data. Rewrite the
6730 // return type to use a single register result.
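  // Illustrative example (not from the source): a TFE load of a single s32
  // produces one <2 x s32> register here, which is then unmerged into the data
  // dword and the status dword.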
6731
6732 if (IsTFE) {
6733 Dst1Reg = MI.getOperand(i: 1).getReg();
6734 if (MRI->getType(Reg: Dst1Reg) != S32)
6735 return false;
6736
6737 // TODO: Make sure the TFE operand bit is set.
6738 MI.removeOperand(OpNo: 1);
6739
6740 // Handle the easy case that requires no repack instructions.
6741 if (Ty == S32) {
6742 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
6743 return true;
6744 }
6745 }
6746
6747 // Now figure out how to copy the new result register back into the old
6748 // result.
6749 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6750
6751 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6752
6753 if (ResultNumRegs == 1) {
6754 assert(!IsTFE);
6755 ResultRegs[0] = NewResultReg;
6756 } else {
6757 // We have to repack into a new vector of some kind.
6758 for (int I = 0; I != NumDataRegs; ++I)
6759 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
6760 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
6761
6762 // Drop the final TFE element to get the data part. The TFE result is
6763 // directly written to the right place already.
6764 if (IsTFE)
6765 ResultRegs.resize(N: NumDataRegs);
6766 }
6767
6768 // For an s16 scalar result, we form an s32 result with a truncate regardless
6769 // of packed vs. unpacked.
6770 if (IsD16 && !Ty.isVector()) {
6771 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
6772 return true;
6773 }
6774
6775 // Avoid a build/concat_vector of 1 entry.
6776 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6777 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
6778 return true;
6779 }
6780
6781 assert(Ty.isVector());
6782
6783 if (IsD16) {
6784 // For packed D16 results with TFE enabled, all the data components are
6785 // S32. Cast back to the expected type.
6786 //
6787    // TODO: We don't really need to load s32 elements. We would only need one
6788 // cast for the TFE result if a multiple of v2s16 was used.
6789 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6790 for (Register &Reg : ResultRegs)
6791 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
6792 } else if (ST.hasUnpackedD16VMem()) {
6793 for (Register &Reg : ResultRegs)
6794 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
6795 }
6796 }
6797
6798 auto padWithUndef = [&](LLT Ty, int NumElts) {
6799 if (NumElts == 0)
6800 return;
6801 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
6802 for (int I = 0; I != NumElts; ++I)
6803 ResultRegs.push_back(Elt: Undef);
6804 };
6805
6806 // Pad out any elements eliminated due to the dmask.
6807 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
6808 if (!ResTy.isVector()) {
6809 padWithUndef(ResTy, NumElts - ResultRegs.size());
6810 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
6811 return true;
6812 }
6813
6814 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6815 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6816
6817 // Deal with the one annoying legal case.
6818 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
6819 if (Ty == V3S16) {
6820 if (IsTFE) {
6821 if (ResultRegs.size() == 1) {
6822 NewResultReg = ResultRegs[0];
6823 } else if (ResultRegs.size() == 2) {
6824 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
6825 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
6826 } else {
6827 return false;
6828 }
6829 }
6830
6831 if (MRI->getType(Reg: DstReg).getNumElements() <
6832 MRI->getType(Reg: NewResultReg).getNumElements()) {
6833 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
6834 } else {
6835 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
6836 }
6837 return true;
6838 }
6839
6840 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6841 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
6842 return true;
6843}
6844
6845bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6846 MachineInstr &MI) const {
6847 MachineIRBuilder &B = Helper.MIRBuilder;
6848 GISelChangeObserver &Observer = Helper.Observer;
6849
6850 Register OrigDst = MI.getOperand(i: 0).getReg();
6851 Register Dst;
6852 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
6853 unsigned Size = Ty.getSizeInBits();
6854 MachineFunction &MF = B.getMF();
6855 unsigned Opc = 0;
6856 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6857 assert(Size == 8 || Size == 16);
6858 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6859 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6860    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6861 // destination register.
6862 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
6863 } else {
6864 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6865 Dst = OrigDst;
6866 }
6867
6868 Observer.changingInstr(MI);
6869
6870 // Handle needing to s.buffer.load() a p8 value.
6871 if (hasBufferRsrcWorkaround(Ty)) {
6872 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
6873 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6874 }
6875 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
6876 Ty = getBitcastRegisterType(Ty);
6877 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6878 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6879 }
6880
6881 // FIXME: We don't really need this intermediate instruction. The intrinsic
6882 // should be fixed to have a memory operand. Since it's readnone, we're not
6883 // allowed to add one.
6884 MI.setDesc(B.getTII().get(Opcode: Opc));
6885 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
6886
6887 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6888 const unsigned MemSize = (Size + 7) / 8;
6889 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6890 Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
6891 MachineMemOperand *MMO = MF.getMachineMemOperand(
6892 PtrInfo: MachinePointerInfo(),
6893 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6894 MachineMemOperand::MOInvariant,
6895 Size: MemSize, BaseAlignment: MemAlign);
6896 MI.addMemOperand(MF, MO: MMO);
6897 if (Dst != OrigDst) {
6898 MI.getOperand(i: 0).setReg(Dst);
6899 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6900 B.buildTrunc(Res: OrigDst, Op: Dst);
6901 }
6902
6903 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6904 // always be legal. We may need to restore this to a 96-bit result if it turns
6905 // out this needs to be converted to a vector load during RegBankSelect.
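  // Illustrative example (not from the source): an s96 result is widened to
  // s128 here unless the subtarget has scalar dwordx3 loads, and a <3 x s32>
  // result is likewise widened to <4 x s32>.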
6906 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6907 if (Ty.isVector())
6908 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
6909 else
6910 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
6911 }
6912
6913 Observer.changedInstr(MI);
6914 return true;
6915}
6916
6917bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6918 MachineInstr &MI) const {
6919 MachineIRBuilder &B = Helper.MIRBuilder;
6920 GISelChangeObserver &Observer = Helper.Observer;
6921 Observer.changingInstr(MI);
6922 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6923 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
6924 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
6925 Observer.changedInstr(MI);
6926 return true;
6927}
6928
6929// TODO: Move to selection
6930bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6931 MachineRegisterInfo &MRI,
6932 MachineIRBuilder &B) const {
6933 if (!ST.isTrapHandlerEnabled() ||
6934 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6935 return legalizeTrapEndpgm(MI, MRI, B);
6936
6937 return ST.supportsGetDoorbellID() ?
6938 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6939}
6940
6941bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6942 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6943 const DebugLoc &DL = MI.getDebugLoc();
6944 MachineBasicBlock &BB = B.getMBB();
6945 MachineFunction *MF = BB.getParent();
6946
6947 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
6948 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6949 .addImm(Val: 0);
6950 MI.eraseFromParent();
6951 return true;
6952 }
6953
6954 // We need a block split to make the real endpgm a terminator. We also don't
6955 // want to break phis in successor blocks, so we can't just delete to the
6956 // end of the block.
6957 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6958 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6959 MF->push_back(MBB: TrapBB);
6960 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6961 .addImm(Val: 0);
6962 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6963 .addMBB(MBB: TrapBB);
6964
6965 BB.addSuccessor(Succ: TrapBB);
6966 MI.eraseFromParent();
6967 return true;
6968}
6969
6970bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6971 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6972 MachineFunction &MF = B.getMF();
6973 const LLT S64 = LLT::scalar(SizeInBits: 64);
6974
6975 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6976 // For code object version 5, queue_ptr is passed through implicit kernarg.
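  // Illustrative sketch (not from the source): the sequence below computes
  // kernarg_segment_ptr + the QUEUE_PTR offset, loads the 64-bit queue pointer
  // from that constant address, and copies it into SGPR0_SGPR1 before issuing
  // S_TRAP.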
6977 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
6978 AMDGPU::AMDHSA_COV5) {
6979 AMDGPUTargetLowering::ImplicitParameter Param =
6980 AMDGPUTargetLowering::QUEUE_PTR;
6981 uint64_t Offset =
6982 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
6983
6984 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6985 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6986
6987 if (!loadInputValue(DstReg: KernargPtrReg, B,
6988 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6989 return false;
6990
6991 // TODO: can we be smarter about machine pointer info?
6992 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6993 MachineMemOperand *MMO = MF.getMachineMemOperand(
6994 PtrInfo,
6995 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6996 MachineMemOperand::MOInvariant,
6997 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
6998
6999    // Compute the address of the queue pointer within the kernarg segment.
7000 Register LoadAddr = MRI.createGenericVirtualRegister(
7001 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7002 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
7003 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
7004    // Load the queue pointer from that address.
7005 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
7006 B.buildCopy(Res: SGPR01, Op: Temp);
7007 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7008 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7009 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
7010 MI.eraseFromParent();
7011 return true;
7012 }
7013
7014  // Pass queue pointer to trap handler as input, and insert trap instruction.
7015 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7016 Register LiveIn =
7017 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7018 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
7019 return false;
7020
7021 B.buildCopy(Res: SGPR01, Op: LiveIn);
7022 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7023 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7024 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
7025
7026 MI.eraseFromParent();
7027 return true;
7028}
7029
7030bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7031 MachineRegisterInfo &MRI,
7032 MachineIRBuilder &B) const {
7033 // We need to simulate the 's_trap 2' instruction on targets that run in
7034 // PRIV=1 (where it is treated as a nop).
7035 if (ST.hasPrivEnabledTrap2NopBug()) {
7036 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7037 DL: MI.getDebugLoc());
7038 MI.eraseFromParent();
7039 return true;
7040 }
7041
7042 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7043 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7044 MI.eraseFromParent();
7045 return true;
7046}
7047
7048bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7049 MachineRegisterInfo &MRI,
7050 MachineIRBuilder &B) const {
7051  // If this is a non-HSA path or the trap handler is disabled, report a
7052  // warning accordingly.
7053 if (!ST.isTrapHandlerEnabled() ||
7054 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7055 Function &Fn = B.getMF().getFunction();
7056 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7057 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7058 } else {
7059 // Insert debug-trap instruction
7060 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7061 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7062 }
7063
7064 MI.eraseFromParent();
7065 return true;
7066}
7067
7068bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7069 MachineInstr &MI, MachineIRBuilder &B) const {
7070 MachineRegisterInfo &MRI = *B.getMRI();
7071 const LLT S16 = LLT::scalar(SizeInBits: 16);
7072 const LLT S32 = LLT::scalar(SizeInBits: 32);
7073 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
7074 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
7075
7076 Register DstReg = MI.getOperand(i: 0).getReg();
7077 Register NodePtr = MI.getOperand(i: 2).getReg();
7078 Register RayExtent = MI.getOperand(i: 3).getReg();
7079 Register RayOrigin = MI.getOperand(i: 4).getReg();
7080 Register RayDir = MI.getOperand(i: 5).getReg();
7081 Register RayInvDir = MI.getOperand(i: 6).getReg();
7082 Register TDescr = MI.getOperand(i: 7).getReg();
7083
7084 if (!ST.hasGFX10_AEncoding()) {
7085 Function &Fn = B.getMF().getFunction();
7086 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7087 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7088 return false;
7089 }
7090
7091 const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
7092 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
7093 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
7094 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
7095 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
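  // The result is always 4 dwords. The address operands are the node pointer
  // (1 or 2 dwords), the ray extent (1 dword), the ray origin (3 dwords), and
  // the ray dir/inv_dir (3 dwords each as f32, or 3 dwords total when packed
  // as A16 halves). On GFX11+ NSA these dwords are grouped into whole-vector
  // operands (node pointer, extent, origin, dir, inv_dir; dir and inv_dir are
  // combined when A16), giving 5 or 4 operands.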
7096 const unsigned NumVDataDwords = 4;
7097 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7098 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7099 const bool UseNSA =
7100 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7101
7102 const unsigned BaseOpcodes[2][2] = {
7103 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7104 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7105 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7106 int Opcode;
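  // Select the MIMG opcode variant matching the subtarget encoding
  // (GFX10/GFX11 NSA, GFX12, or the non-NSA default forms).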
7107 if (UseNSA) {
7108 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7109 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7110 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7111 : AMDGPU::MIMGEncGfx10NSA,
7112 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7113 } else {
7114 assert(!IsGFX12Plus);
7115 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7116 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7117 : AMDGPU::MIMGEncGfx10Default,
7118 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7119 }
7120 assert(Opcode != -1);
7121
7122 SmallVector<Register, 12> Ops;
7123 if (UseNSA && IsGFX11Plus) {
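    // Rebuild each three-component vector operand as a single <3 x s32> value
    // from its scalarized components.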
7124 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7125 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7126 auto Merged = B.buildMergeLikeInstr(
7127 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
7128 Ops.push_back(Elt: Merged.getReg(Idx: 0));
7129 };
7130
7131 Ops.push_back(Elt: NodePtr);
7132 Ops.push_back(Elt: RayExtent);
7133 packLanes(RayOrigin);
7134
7135 if (IsA16) {
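      // dir and inv_dir are f16 vectors; pack one inv_dir/dir component pair
      // into each dword, forming a single <3 x s32> operand.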
7136 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7137 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7138 auto MergedDir = B.buildMergeLikeInstr(
7139 Res: V3S32,
7140 Ops: {B.buildBitcast(
7141 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
7142 UnmergeRayDir.getReg(Idx: 0)}))
7143 .getReg(Idx: 0),
7144 B.buildBitcast(
7145 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
7146 UnmergeRayDir.getReg(Idx: 1)}))
7147 .getReg(Idx: 0),
7148 B.buildBitcast(
7149 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
7150 UnmergeRayDir.getReg(Idx: 2)}))
7151 .getReg(Idx: 0)});
7152 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
7153 } else {
7154 packLanes(RayDir);
7155 packLanes(RayInvDir);
7156 }
7157 } else {
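    // For GFX10 NSA and the non-NSA forms, build the address as a flat list
    // of dword values (merged into one vector below when NSA is not used).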
7158 if (Is64) {
7159 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
7160 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7161 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7162 } else {
7163 Ops.push_back(Elt: NodePtr);
7164 }
7165 Ops.push_back(Elt: RayExtent);
7166
7167 auto packLanes = [&Ops, &S32, &B](Register Src) {
7168 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7169 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7170 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7171 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
7172 };
7173
7174 packLanes(RayOrigin);
7175 if (IsA16) {
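      // Pack the six f16 dir/inv_dir components into three dwords:
      // {dir.x, dir.y}, {dir.z, inv_dir.x}, {inv_dir.y, inv_dir.z}.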
7176 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7177 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7178 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
7179 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
7180 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
7181 B.buildMergeLikeInstr(Res: R1,
7182 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
7183 B.buildMergeLikeInstr(
7184 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
7185 B.buildMergeLikeInstr(
7186 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
7187 Ops.push_back(Elt: R1);
7188 Ops.push_back(Elt: R2);
7189 Ops.push_back(Elt: R3);
7190 } else {
7191 packLanes(RayDir);
7192 packLanes(RayInvDir);
7193 }
7194 }
7195
7196 if (!UseNSA) {
7197    // Without NSA, all address operands must be merged into one contiguous vector register.
7198 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
7199 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
7200 Ops.clear();
7201 Ops.push_back(Elt: MergedOps);
7202 }
7203
7204 auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7205 .addDef(RegNo: DstReg)
7206 .addImm(Val: Opcode);
7207
7208 for (Register R : Ops) {
7209 MIB.addUse(RegNo: R);
7210 }
7211
7212 MIB.addUse(RegNo: TDescr)
7213 .addImm(Val: IsA16 ? 1 : 0)
7214 .cloneMemRefs(OtherMI: MI);
7215
7216 MI.eraseFromParent();
7217 return true;
7218}
7219
7220bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7221 MachineInstr &MI, MachineIRBuilder &B) const {
7222 const LLT S32 = LLT::scalar(SizeInBits: 32);
7223 const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
7224
7225 Register DstReg = MI.getOperand(i: 0).getReg();
7226 Register DstOrigin = MI.getOperand(i: 1).getReg();
7227 Register DstDir = MI.getOperand(i: 2).getReg();
7228 Register NodePtr = MI.getOperand(i: 4).getReg();
7229 Register RayExtent = MI.getOperand(i: 5).getReg();
7230 Register InstanceMask = MI.getOperand(i: 6).getReg();
7231 Register RayOrigin = MI.getOperand(i: 7).getReg();
7232 Register RayDir = MI.getOperand(i: 8).getReg();
7233 Register Offsets = MI.getOperand(i: 9).getReg();
7234 Register TDescr = MI.getOperand(i: 10).getReg();
7235
7236 if (!ST.hasBVHDualAndBVH8Insts()) {
7237 Function &Fn = B.getMF().getFunction();
7238 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7239 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7240 return false;
7241 }
7242
7243 bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
7244 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7245 const unsigned NumVDataDwords = 10;
7246 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7247 int Opcode = AMDGPU::getMIMGOpcode(
7248 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7249 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7250 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7251 assert(Opcode != -1);
7252
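  // Pack the ray extent and the instance mask (any-extended to 32 bits) into
  // a single <2 x s32> address operand.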
7253 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7254 Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});
7255
7256 B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7257 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7258 .addDef(RegNo: DstReg)
7259 .addDef(RegNo: DstOrigin)
7260 .addDef(RegNo: DstDir)
7261 .addImm(Val: Opcode)
7262 .addUse(RegNo: NodePtr)
7263 .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
7264 .addUse(RegNo: RayOrigin)
7265 .addUse(RegNo: RayDir)
7266 .addUse(RegNo: Offsets)
7267 .addUse(RegNo: TDescr)
7268 .cloneMemRefs(OtherMI: MI);
7269
7270 MI.eraseFromParent();
7271 return true;
7272}
7273
7274bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7275 MachineIRBuilder &B) const {
7276 const SITargetLowering *TLI = ST.getTargetLowering();
7277 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7278 Register DstReg = MI.getOperand(i: 0).getReg();
7279 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7280 MI.eraseFromParent();
7281 return true;
7282}
7283
7284bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7285 MachineIRBuilder &B) const {
7286 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7287 if (!ST.hasArchitectedSGPRs())
7288 return false;
7289 LLT S32 = LLT::scalar(SizeInBits: 32);
7290 Register DstReg = MI.getOperand(i: 0).getReg();
7291 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7292 auto LSB = B.buildConstant(Res: S32, Val: 25);
7293 auto Width = B.buildConstant(Res: S32, Val: 5);
7294 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7295 MI.eraseFromParent();
7296 return true;
7297}
7298
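// Hardware register fields accessed via s_getreg/s_setreg for the FP
// environment: the low 23 bits of the MODE register and the low 5
// exception-status bits of TRAPSTS.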
7299static constexpr unsigned FPEnvModeBitField =
7300 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
7301
7302static constexpr unsigned FPEnvTrapBitField =
7303 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
7304
7305bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7306 MachineRegisterInfo &MRI,
7307 MachineIRBuilder &B) const {
7308 Register Src = MI.getOperand(i: 0).getReg();
7309 if (MRI.getType(Reg: Src) != S64)
7310 return false;
7311
7312 auto ModeReg =
7313 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7314 /*HasSideEffects=*/true, /*isConvergent=*/false)
7315 .addImm(Val: FPEnvModeBitField);
7316 auto TrapReg =
7317 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7318 /*HasSideEffects=*/true, /*isConvergent=*/false)
7319 .addImm(Val: FPEnvTrapBitField);
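  // The 64-bit FP environment value packs the MODE bits in the low half and
  // the TRAPSTS bits in the high half.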
7320 B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
7321 MI.eraseFromParent();
7322 return true;
7323}
7324
7325bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7326 MachineRegisterInfo &MRI,
7327 MachineIRBuilder &B) const {
7328 Register Src = MI.getOperand(i: 0).getReg();
7329 if (MRI.getType(Reg: Src) != S64)
7330 return false;
7331
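  // Split the value into the MODE (low) and TRAPSTS (high) halves and write
  // each with s_setreg.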
7332 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7333 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7334 /*HasSideEffects=*/true, /*isConvergent=*/false)
7335 .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
7336 .addReg(RegNo: Unmerge.getReg(Idx: 0));
7337 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7338 /*HasSideEffects=*/true, /*isConvergent=*/false)
7339 .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
7340 .addReg(RegNo: Unmerge.getReg(Idx: 1));
7341 MI.eraseFromParent();
7342 return true;
7343}
7344
7345bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7346 MachineInstr &MI) const {
7347 MachineIRBuilder &B = Helper.MIRBuilder;
7348 MachineRegisterInfo &MRI = *B.getMRI();
7349
7350  // Replace the G_BRCOND use with the exec-manipulating control-flow and branch pseudos.
7351 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
7352 switch (IntrID) {
7353 case Intrinsic::amdgcn_if:
7354 case Intrinsic::amdgcn_else: {
7355 MachineInstr *Br = nullptr;
7356 MachineBasicBlock *UncondBrTarget = nullptr;
7357 bool Negated = false;
7358 if (MachineInstr *BrCond =
7359 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7360 const SIRegisterInfo *TRI
7361 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7362
7363 Register Def = MI.getOperand(i: 1).getReg();
7364 Register Use = MI.getOperand(i: 3).getReg();
7365
7366 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7367
7368 if (Negated)
7369 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7370
7371 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7372 if (IntrID == Intrinsic::amdgcn_if) {
7373 B.buildInstr(Opcode: AMDGPU::SI_IF)
7374 .addDef(RegNo: Def)
7375 .addUse(RegNo: Use)
7376 .addMBB(MBB: UncondBrTarget);
7377 } else {
7378 B.buildInstr(Opcode: AMDGPU::SI_ELSE)
7379 .addDef(RegNo: Def)
7380 .addUse(RegNo: Use)
7381 .addMBB(MBB: UncondBrTarget);
7382 }
7383
7384 if (Br) {
7385 Br->getOperand(i: 0).setMBB(CondBrTarget);
7386 } else {
7387 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7388 // since we're swapping branch targets it needs to be reinserted.
7389 // FIXME: IRTranslator should probably not do this
7390 B.buildBr(Dest&: *CondBrTarget);
7391 }
7392
7393 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
7394 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
7395 MI.eraseFromParent();
7396 BrCond->eraseFromParent();
7397 return true;
7398 }
7399
7400 return false;
7401 }
7402 case Intrinsic::amdgcn_loop: {
7403 MachineInstr *Br = nullptr;
7404 MachineBasicBlock *UncondBrTarget = nullptr;
7405 bool Negated = false;
7406 if (MachineInstr *BrCond =
7407 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7408 const SIRegisterInfo *TRI
7409 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7410
7411 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7412 Register Reg = MI.getOperand(i: 2).getReg();
7413
7414 if (Negated)
7415 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7416
7417 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7418 B.buildInstr(Opcode: AMDGPU::SI_LOOP)
7419 .addUse(RegNo: Reg)
7420 .addMBB(MBB: UncondBrTarget);
7421
7422 if (Br)
7423 Br->getOperand(i: 0).setMBB(CondBrTarget);
7424 else
7425 B.buildBr(Dest&: *CondBrTarget);
7426
7427 MI.eraseFromParent();
7428 BrCond->eraseFromParent();
7429 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
7430 return true;
7431 }
7432
7433 return false;
7434 }
7435 case Intrinsic::amdgcn_addrspacecast_nonnull:
7436 return legalizeAddrSpaceCast(MI, MRI, B);
7437 case Intrinsic::amdgcn_make_buffer_rsrc:
7438 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7439 case Intrinsic::amdgcn_kernarg_segment_ptr:
7440 if (!AMDGPU::isKernel(CC: B.getMF().getFunction().getCallingConv())) {
7441 // This only makes sense to call in a kernel, so just lower to null.
7442 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
7443 MI.eraseFromParent();
7444 return true;
7445 }
7446
7447 return legalizePreloadedArgIntrin(
7448 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7449 case Intrinsic::amdgcn_implicitarg_ptr:
7450 return legalizeImplicitArgPtr(MI, MRI, B);
7451 case Intrinsic::amdgcn_workitem_id_x:
7452 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
7453 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7454 case Intrinsic::amdgcn_workitem_id_y:
7455 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
7456 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7457 case Intrinsic::amdgcn_workitem_id_z:
7458 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
7459 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7460 case Intrinsic::amdgcn_workgroup_id_x:
7461 return legalizePreloadedArgIntrin(MI, MRI, B,
7462 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7463 case Intrinsic::amdgcn_workgroup_id_y:
7464 return legalizePreloadedArgIntrin(MI, MRI, B,
7465 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7466 case Intrinsic::amdgcn_workgroup_id_z:
7467 return legalizePreloadedArgIntrin(MI, MRI, B,
7468 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7469 case Intrinsic::amdgcn_wave_id:
7470 return legalizeWaveID(MI, B);
7471 case Intrinsic::amdgcn_lds_kernel_id:
7472 return legalizePreloadedArgIntrin(MI, MRI, B,
7473 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7474 case Intrinsic::amdgcn_dispatch_ptr:
7475 return legalizePreloadedArgIntrin(MI, MRI, B,
7476 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
7477 case Intrinsic::amdgcn_queue_ptr:
7478 return legalizePreloadedArgIntrin(MI, MRI, B,
7479 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
7480 case Intrinsic::amdgcn_implicit_buffer_ptr:
7481 return legalizePreloadedArgIntrin(
7482 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7483 case Intrinsic::amdgcn_dispatch_id:
7484 return legalizePreloadedArgIntrin(MI, MRI, B,
7485 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
7486 case Intrinsic::r600_read_ngroups_x:
7487    // TODO: Emit error for HSA
7488 return legalizeKernargMemParameter(MI, B,
7489 Offset: SI::KernelInputOffsets::NGROUPS_X);
7490 case Intrinsic::r600_read_ngroups_y:
7491 return legalizeKernargMemParameter(MI, B,
7492 Offset: SI::KernelInputOffsets::NGROUPS_Y);
7493 case Intrinsic::r600_read_ngroups_z:
7494 return legalizeKernargMemParameter(MI, B,
7495 Offset: SI::KernelInputOffsets::NGROUPS_Z);
7496 case Intrinsic::r600_read_local_size_x:
7497 // TODO: Could insert G_ASSERT_ZEXT from s16
7498 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
7499 case Intrinsic::r600_read_local_size_y:
7500 // TODO: Could insert G_ASSERT_ZEXT from s16
7501 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
7502  case Intrinsic::r600_read_local_size_z:
7503    // TODO: Could insert G_ASSERT_ZEXT from s16
7504    return legalizeKernargMemParameter(MI, B,
7505 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
7506 case Intrinsic::amdgcn_fdiv_fast:
7507 return legalizeFDIVFastIntrin(MI, MRI, B);
7508 case Intrinsic::amdgcn_is_shared:
7509 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
7510 case Intrinsic::amdgcn_is_private:
7511 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
7512 case Intrinsic::amdgcn_wavefrontsize: {
7513 B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
7514 MI.eraseFromParent();
7515 return true;
7516 }
7517 case Intrinsic::amdgcn_s_buffer_load:
7518 return legalizeSBufferLoad(Helper, MI);
7519 case Intrinsic::amdgcn_raw_buffer_store:
7520 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7521 case Intrinsic::amdgcn_struct_buffer_store:
7522 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7523 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
7524 case Intrinsic::amdgcn_raw_buffer_store_format:
7525 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7526 case Intrinsic::amdgcn_struct_buffer_store_format:
7527 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7528 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
7529 case Intrinsic::amdgcn_raw_tbuffer_store:
7530 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7531 case Intrinsic::amdgcn_struct_tbuffer_store:
7532 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7533 return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
7534 case Intrinsic::amdgcn_raw_buffer_load:
7535 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7536 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7537 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7538 case Intrinsic::amdgcn_struct_buffer_load:
7539 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7540 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7541 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7542 return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
7543 case Intrinsic::amdgcn_raw_buffer_load_format:
7544 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7545 case Intrinsic::amdgcn_struct_buffer_load_format:
7546 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7547 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
7548 case Intrinsic::amdgcn_raw_tbuffer_load:
7549 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7550 case Intrinsic::amdgcn_struct_tbuffer_load:
7551 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7552 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
7553 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7554 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7555 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7556 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7557 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7558 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7559 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7560 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7561 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7562 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7563 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7564 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7565 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7567 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7568 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7569 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7570 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7571 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7572 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7573 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7574 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7575 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7576 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7577 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7579 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7580 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7581 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7583 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7585 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7586 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7587 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7588 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7589 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7590 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7591 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7592 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7593 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7594 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7595 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7596 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7597 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7598 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7599 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7600 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7601 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7603 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7605 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7606 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7607 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7608 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7609 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7610 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7611 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7612 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7613 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7614 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7615 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7617 return legalizeBufferAtomic(MI, B, IID: IntrID);
7618 case Intrinsic::amdgcn_rsq_clamp:
7619 return legalizeRsqClampIntrinsic(MI, MRI, B);
7620 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7621 return legalizeBVHIntersectRayIntrinsic(MI, B);
7622 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7623 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7624 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
7625 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7626 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7627 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7628 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7629 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7630 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7631 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7632 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
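    // The sparsity index operand (operand 5 here) must be a 32-bit value;
    // any-extend narrower indices.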
7633 Register Index = MI.getOperand(i: 5).getReg();
7634 LLT S32 = LLT::scalar(SizeInBits: 32);
7635 if (MRI.getType(Reg: Index) != S32)
7636 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
7637 return true;
7638 }
7639 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7640 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7641 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
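    // Same as above, but the index is operand 7 for the integer variants.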
7642 Register Index = MI.getOperand(i: 7).getReg();
7643 LLT S32 = LLT::scalar(SizeInBits: 32);
7644 if (MRI.getType(Reg: Index) != S32)
7645 MI.getOperand(i: 7).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
7646 return true;
7647 }
7648 case Intrinsic::amdgcn_fmed3: {
7649 GISelChangeObserver &Observer = Helper.Observer;
7650
7651    // FIXME: This is to work around the inability of tablegen match combiners
7652    // to match intrinsics in patterns.
7653 Observer.changingInstr(MI);
7654 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
7655 MI.removeOperand(OpNo: 1);
7656 Observer.changedInstr(MI);
7657 return true;
7658 }
7659 case Intrinsic::amdgcn_readlane:
7660 case Intrinsic::amdgcn_writelane:
7661 case Intrinsic::amdgcn_readfirstlane:
7662 case Intrinsic::amdgcn_permlane16:
7663 case Intrinsic::amdgcn_permlanex16:
7664 case Intrinsic::amdgcn_permlane64:
7665 case Intrinsic::amdgcn_set_inactive:
7666 case Intrinsic::amdgcn_set_inactive_chain_arg:
7667 case Intrinsic::amdgcn_mov_dpp8:
7668 case Intrinsic::amdgcn_update_dpp:
7669 return legalizeLaneOp(Helper, MI, IID: IntrID);
7670 case Intrinsic::amdgcn_s_buffer_prefetch_data:
7671 return legalizeSBufferPrefetch(Helper, MI);
7672 case Intrinsic::amdgcn_dead: {
7673 // TODO: Use poison instead of undef
7674 for (const MachineOperand &Def : MI.defs())
7675 B.buildUndef(Res: Def);
7676 MI.eraseFromParent();
7677 return true;
7678 }
7679 default: {
7680 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7681 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
7682 return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
7683 return true;
7684 }
7685 }
7686
7687 return true;
7688}
7689