//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
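
// For illustration (worked examples, not an exhaustive list): the helpers
// above map <3 x s16> to <4 x s16> and s48 to s64, since 3 rounds up to 4
// elements and 48 bits rounds up to 64 bits.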

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
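
// For example: <3 x s16> (48 bits) matches and will gain one element, while
// <3 x s32> does not (32-bit elements) and <4 x s16> does not (even element
// count, 64 bits total).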
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
119 };
120}
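
// For illustration: under this mutation a 128-bit <4 x s32> is split into two
// pieces and becomes <2 x s32>, and a 96-bit <3 x s32> likewise maps to
// <2 x s32>.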
121
// Increase the number of vector elements until the total size reaches the
// next multiple of 32 bits.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
137 };
138}
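
// For example: <3 x s16> (48 bits) is padded to <4 x s16> (64 bits), and
// <5 x s8> (40 bits) is padded to <8 x s8> (64 bits), the next multiple of 32.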
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarSizeInBits: EltSize));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(SizeInBits: 128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(SizeInBits: Size);
183 }
184
185 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
186}
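
// For sizes above 32 bits this produces a vector of 32-bit elements, e.g.
// <6 x s16> (96 bits) -> <3 x s32> and <8 x s16> (128 bits) -> <4 x s32>.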
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Size: Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
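
// For example: s32, s96, <2 x s16> and <4 x s32> are register types, while
// <3 x s16> (48 bits, not a multiple of 32) and <4 x s8> (8-bit elements)
// are not.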
252
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Ty: Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(SizeInBits: 1);
283static const LLT S8 = LLT::scalar(SizeInBits: 8);
284static const LLT S16 = LLT::scalar(SizeInBits: 16);
285static const LLT S32 = LLT::scalar(SizeInBits: 32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(SizeInBits: 64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(SizeInBits: 96);
290static const LLT S128 = LLT::scalar(SizeInBits: 128);
291static const LLT S160 = LLT::scalar(SizeInBits: 160);
292static const LLT S224 = LLT::scalar(SizeInBits: 224);
293static const LLT S256 = LLT::scalar(SizeInBits: 256);
294static const LLT S512 = LLT::scalar(SizeInBits: 512);
295static const LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
296
297static const LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
298static const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
299static const LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
300static const LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
301static const LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
302static const LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
303static const LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
304static const LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
305
306static const LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
310static const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
311static const LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
312static const LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
313static const LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
314static const LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
315static const LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
316static const LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
317static const LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
318static const LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
319static const LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
320static const LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
321static const LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
322
323static const LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
324static const LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
325static const LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
326static const LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
327static const LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
328static const LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
329static const LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
330static const LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
331
332static const LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
333static const LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346 V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
350 if (Ty.isPointerOrPointerVector())
351 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
352
353 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
354 is_contained(Set: AllScalarTypes, Element: Ty) || is_contained(Set: AllS16Vectors, Element: Ty);
355}
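
// Pointers are classified by their bit pattern, e.g. a 64-bit flat pointer is
// treated as s64 and <2 x p3> (two 32-bit local pointers) as <2 x s32>, both
// of which appear in the lists above.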
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Ty: Query.Types[TypeIdx]);
360 };
361}
362
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366 return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
372
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379 case AMDGPUAS::PRIVATE_ADDRESS:
380 // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS:
383 return ST.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS:
385 case AMDGPUAS::CONSTANT_ADDRESS:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387 case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
402
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 IsAtomic: Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
464 Alignment: Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
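
// For example: an extending s8 or s16 load into an s32 register is size-legal
// here, but a vector extending load (say <2 x s16> loaded into <2 x s32>) is
// rejected above, as is a 96-bit access on subtargets without dwordx3
// load/store support.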
470
// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478 return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(Ty: ElemTy);
482 }
483 return false;
484}
485
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491 if (EnableNewLegality)
492 return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty))
499 return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
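
// For example: <8 x s16> (128 bits) hits the workaround and will be bitcast
// (to <4 x s32> by the load/store rules below), while <4 x s32> itself does
// not.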
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526 return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531 !isRegisterVectorElementType(EltTy: Ty.getElementType());
532}
533
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(Value: SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565 return TLI->allowsMisalignedMemoryAccessesImpl(
566 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
567 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
568 Fast;
569}
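
// For example: a 96-bit global load that is 128-bit aligned may be widened to
// 128 bits on subtargets without dwordx3 load/store support, provided the
// target reports the wider access as fast; a 64-bit load is never widened
// since its size is already a power of two.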
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
577 AlignInBits: Query.MMODescrs[0].AlignInBits,
578 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
579}
580
/// Mutates the IR (typically a load instruction) to use a <4 x s32> as the
/// initial type of the operand `idx` and then to transform it to a `p8` via
/// bitcasts and inttoptr. In addition, handle vectors of p8. Returns the new
/// type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585 MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(i: Idx);
587
588 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
592 return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(SizeInBits: 32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
607 B.buildMergeValues(Res: MO, Ops: VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
612 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
614 B.buildIntToPtr(Dst: MO, Src: Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626 MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Reg: Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
638 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
639 }
640 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
641 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
642}
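
// Roughly, for the non-vector case this emits (sketch):
//   %p0:_(s32), %p1:_(s32), %p2:_(s32), %p3:_(s32) = G_UNMERGE_VALUES %rsrc(p8)
//   %vec:_(<4 x s32>) = G_BUILD_VECTOR %p0, %p1, %p2, %p3
// and for vectors of p8 it goes through a ptrtoint/bitcast of the scalar type.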
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645 unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(i: Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
651 return;
652 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656 const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707 getActionDefinitionsBuilder(Opcode: G_PHI)
708 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709 .legalFor(Types: AllS32Vectors)
710 .legalFor(Types: AllS64Vectors)
711 .legalFor(Types: AddrSpaces64)
712 .legalFor(Types: AddrSpaces32)
713 .legalFor(Types: AddrSpaces128)
714 .legalIf(Predicate: isPointer(TypeIdx: 0))
715 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
716 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
717 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
718 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
719 .scalarize(TypeIdx: 0);
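
  // For example, a G_PHI of <3 x s16> is first widened to <4 x s16> by the
  // isSmallOddVector rule above, and odd scalar widths such as s48 are rounded
  // up to the next power of two at least 32 bits wide.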
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
725 .legalFor(Types: {S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
727 .scalarize(TypeIdx: 0)
728 .minScalar(TypeIdx: 0, Ty: S16)
729 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
730 .maxScalar(TypeIdx: 0, Ty: S32);
731 } else {
732 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
733 .legalFor(Types: {S32, S16, V2S16})
734 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
735 .scalarize(TypeIdx: 0)
736 .minScalar(TypeIdx: 0, Ty: S16)
737 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
738 .maxScalar(TypeIdx: 0, Ty: S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
742 getActionDefinitionsBuilder(Opcode: G_MUL)
743 .legalFor(Types: {S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
745 .scalarize(TypeIdx: 0)
746 .minScalar(TypeIdx: 0, Ty: S16)
747 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
748 .custom();
749 } else {
750 getActionDefinitionsBuilder(Opcode: G_MUL)
751 .legalFor(Types: {S32, S16, V2S16})
752 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
753 .scalarize(TypeIdx: 0)
754 .minScalar(TypeIdx: 0, Ty: S16)
755 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(TypeIdx: 0, Ty: S16)
763 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
764 .scalarize(TypeIdx: 0)
765 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
769 .legalFor(Types: {S32, S16})
770 .minScalar(TypeIdx: 0, Ty: S16)
771 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
772 .maxScalar(TypeIdx: 0, Ty: S32)
773 .scalarize(TypeIdx: 0);
774
775 getActionDefinitionsBuilder(Opcode: G_MUL)
776 .legalFor(Types: {S32, S16})
777 .scalarize(TypeIdx: 0)
778 .minScalar(TypeIdx: 0, Ty: S16)
779 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
786 .legalFor(Types: {S32, S16}) // Clamp modifier
787 .minScalar(TypeIdx: 0, Ty: S16)
788 .scalarize(TypeIdx: 0)
789 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
795 .minScalar(TypeIdx: 0, Ty: S16)
796 .scalarize(TypeIdx: 0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
800 .legalFor(Types: {S32})
801 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
802 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
803 .scalarize(TypeIdx: 0);
804
805 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
806 .legalFor(Types: {S32})
807 .scalarize(TypeIdx: 0)
808 .minScalar(TypeIdx: 0, Ty: S32)
809 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(TypeIdx: 0, Ty: S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
818 .legalFor(Types: {S32}) // Clamp modifier.
819 .scalarize(TypeIdx: 0)
820 .minScalarOrElt(TypeIdx: 0, Ty: S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
825 .minScalar(TypeIdx: 0, Ty: S32)
826 .scalarize(TypeIdx: 0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
833 .minScalar(TypeIdx: 0, Ty: S32)
834 .scalarize(TypeIdx: 0)
835 .lower();
836 }
837
838 getActionDefinitionsBuilder(
839 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor(Types: {S32, S64})
841 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
842 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
843 .scalarize(TypeIdx: 0);
844
845 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
846 .legalFor(Types: {S32})
847 .maxScalar(TypeIdx: 0, Ty: S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
852 .lowerFor(Types: {V2S8});
853 }
854
855 Mulh
856 .scalarize(TypeIdx: 0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
862 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
864 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
865 .fewerElementsIf(Predicate: vectorWiderThan(TypeIdx: 0, Size: 64), Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
866 .widenScalarToNextPow2(TypeIdx: 0)
867 .scalarize(TypeIdx: 0);
868
869 getActionDefinitionsBuilder(
870 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor(Types: {{S32, S1}, {S32, S32}})
872 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
873 .scalarize(TypeIdx: 0);
874
875 getActionDefinitionsBuilder(Opcode: G_BITCAST)
876 // Don't worry about the size constraint.
877 .legalIf(Predicate: all(P0: isRegisterClassType(TypeIdx: 0), P1: isRegisterClassType(TypeIdx: 1)))
878 .lower();
879
880 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
881 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(Predicate: isPointer(TypeIdx: 0))
884 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
885 .widenScalarToNextPow2(TypeIdx: 0);
886
887 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
888 .legalFor(Types: {S32, S64, S16})
889 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
890
891 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(Predicate: isRegisterType(TypeIdx: 0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor(Types: {S1, S16})
896 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
897 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
898 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
899 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
900
901 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
906 .legalFor(Types: {{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
909 .customFor(Types: {PrivatePtr});
910 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
911 .legalFor(Types: {PrivatePtr});
912
913 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
914
915 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
916 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
917
918 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor(Types: {S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
925 .customFor(Types: {S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
927 .customFor(Types: {S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor(Types: {S16, V2S16});
932 else
933 FPOpActions.legalFor(Types: {S16});
934
935 TrigActions.customFor(Types: {S16});
936 FDIVActions.customFor(Types: {S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor(Types: {V2S32});
941 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder(Opcodes: {
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(Types: FPTypesPK16)
949 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
950 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
951 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
952 .scalarize(TypeIdx: 0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(Types: FPTypes16)
955 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
956 .scalarize(TypeIdx: 0);
957 } else {
958 MinNumMaxNum.customFor(Types: FPTypesBase)
959 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
960 .scalarize(TypeIdx: 0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
965
966 FPOpActions
967 .scalarize(TypeIdx: 0)
968 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
969
970 TrigActions
971 .scalarize(TypeIdx: 0)
972 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
973
974 FDIVActions
975 .scalarize(TypeIdx: 0)
976 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
977
978 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
979 .legalFor(Types: FPTypesPK16)
980 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
981 .scalarize(TypeIdx: 0)
982 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
983
984 if (ST.has16BitInsts()) {
985 getActionDefinitionsBuilder(Opcode: G_FSQRT)
986 .legalFor(Types: {S16})
987 .customFor(Types: {S32, S64})
988 .scalarize(TypeIdx: 0)
989 .unsupported();
990 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
991 .legalFor(Types: {S32, S64, S16})
992 .scalarize(TypeIdx: 0)
993 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
994
995 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(TypeIdx: 0)
998 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
999 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1000 .lower();
1001
1002 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1003 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(TypeIdx: 0)
1005 .lower();
1006 } else {
1007 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1008 .customFor(Types: {S32, S64, S16})
1009 .scalarize(TypeIdx: 0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1014 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1015 .customFor(Types: {S64})
1016 .legalFor(Types: {S32, S64})
1017 .scalarize(TypeIdx: 0)
1018 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1019 } else {
1020 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1021 .legalFor(Types: {S32, S64})
1022 .scalarize(TypeIdx: 0)
1023 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1024 }
1025
1026 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor(Types: {{S32, S32}, {S64, S32}})
1028 .scalarize(TypeIdx: 0)
1029 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1030 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1031 .lower();
1032
1033 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1034 .customFor(Types: {{S32, S32}, {S64, S32}})
1035 .scalarize(TypeIdx: 0)
1036 .minScalar(TypeIdx: 0, Ty: S32)
1037 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1038 .lower();
1039 }
1040
1041 getActionDefinitionsBuilder(Opcode: G_FPTRUNC)
1042 .legalFor(Types: {{S32, S64}, {S16, S32}})
1043 .scalarize(TypeIdx: 0)
1044 .lower();
1045
1046 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1047 .legalFor(Types: {{S64, S32}, {S32, S16}})
1048 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1049 .scalarize(TypeIdx: 0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor(Types: {S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor(Types: {S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor(Types: {S32})
1062 // Must use fadd + fneg
1063 .lowerFor(Types: {S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(TypeIdx: 0)
1068 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor(Types: {S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor(Types: {S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor(Types: {S16});
1078 FMad.scalarize(TypeIdx: 0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor(Types: {S16, S32, S64});
1084 } else {
1085 FRem.minScalar(TypeIdx: 0, Ty: S32)
1086 .customFor(Types: {S32, S64});
1087 }
1088 FRem.scalarize(TypeIdx: 0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1091 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1092 .legalIf(Predicate: isScalar(TypeIdx: 0))
1093 .legalFor(Types: {{V2S16, V2S32}})
1094 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1098 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(TypeIdx: 0)
1105 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1106 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1110 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1112 .customFor(Types: {{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor(Types: {{S16, S16}});
1115 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1116 .minScalar(TypeIdx: 0, Ty: S32)
1117 .scalarize(TypeIdx: 0)
1118 .widenScalarToNextPow2(TypeIdx: 1);
1119
1120 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1121 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor(Types: {{S64, S32}, {S64, S64}})
1123 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor(Types: {{S16, S16}});
1126 else
1127 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1128
1129 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1130 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1131 .scalarize(TypeIdx: 0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1135 .customFor(Types: {S16, S32})
1136 .scalarize(TypeIdx: 0)
1137 .lower();
1138
1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141 .scalarize(TypeIdx: 0)
1142 .lower();
1143
1144 if (ST.has16BitInsts()) {
1145 getActionDefinitionsBuilder(
1146 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147 .legalFor(Types: {S16, S32, S64})
1148 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1149 .scalarize(TypeIdx: 0);
1150 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1151 getActionDefinitionsBuilder(
1152 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1153 .legalFor(Types: {S32, S64})
1154 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1155 .scalarize(TypeIdx: 0);
1156 } else {
1157 getActionDefinitionsBuilder(
1158 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159 .legalFor(Types: {S32})
1160 .customFor(Types: {S64})
1161 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1162 .scalarize(TypeIdx: 0);
1163 }
1164
1165 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1166 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1168 .scalarize(TypeIdx: 0)
1169 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1170
1171 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1172 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1173 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1174 .scalarize(TypeIdx: 0);
1175
1176 auto &CmpBuilder =
1177 getActionDefinitionsBuilder(Opcode: G_ICMP)
1178 // The compare output type differs based on the register bank of the output,
1179 // so make both s1 and s32 legal.
1180 //
1181 // Scalar compares producing output in scc will be promoted to s32, as that
1182 // is the allocatable register type that will be needed for the copy from
1183 // scc. This will be promoted during RegBankSelect, and we assume something
1184 // before that won't try to use s32 result types.
1185 //
1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187 // bank.
1188 .legalForCartesianProduct(
1189 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190 .legalForCartesianProduct(
1191 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1192 if (ST.has16BitInsts()) {
1193 CmpBuilder.legalFor(Types: {{S1, S16}});
1194 }
1195
1196 CmpBuilder
1197 .widenScalarToNextPow2(TypeIdx: 1)
1198 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1199 .scalarize(TypeIdx: 0)
1200 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1201
1202 auto &FCmpBuilder =
1203 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1204 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1205
1206 if (ST.hasSALUFloatInsts())
1207 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1208
1209 FCmpBuilder
1210 .widenScalarToNextPow2(TypeIdx: 1)
1211 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1212 .scalarize(TypeIdx: 0);
1213
1214 // FIXME: fpow has a selection pattern that should move to custom lowering.
1215 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1216 if (ST.has16BitInsts())
1217 ExpOps.customFor(Types: {{S32}, {S16}});
1218 else
1219 ExpOps.customFor(Types: {S32});
1220 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1221 .scalarize(TypeIdx: 0);
1222
1223 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1224 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1225 .lower();
1226
1227 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1228 Log2Ops.customFor(Types: {S32});
1229 if (ST.has16BitInsts())
1230 Log2Ops.legalFor(Types: {S16});
1231 else
1232 Log2Ops.customFor(Types: {S16});
1233 Log2Ops.scalarize(TypeIdx: 0)
1234 .lower();
1235
1236 auto &LogOps =
1237 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1238 LogOps.customFor(Types: {S32, S16});
1239 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1240 .scalarize(TypeIdx: 0);
1241
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1243 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1244 .legalFor(Types: {{S32, S32}, {S32, S64}})
1245 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1246 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1247 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1248 .scalarize(TypeIdx: 0)
1249 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1250
  // If no 16-bit instruction is available, lower into different instructions.
1252 if (ST.has16BitInsts())
1253 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1254 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1255 .widenScalarToNextPow2(TypeIdx: 1)
1256 .scalarize(TypeIdx: 0)
1257 .lower();
1258 else
1259 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1260 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1261 .lowerFor(Types: {S1, S16})
1262 .widenScalarToNextPow2(TypeIdx: 1)
1263 .scalarize(TypeIdx: 0)
1264 .lower();
1265
1266 // The hardware instructions return a different result on 0 than the generic
1267 // instructions expect. The hardware produces -1, but these produce the
1268 // bitwidth.
1269 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1270 .scalarize(TypeIdx: 0)
1271 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1272 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1273 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1274 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1275 .custom();
1276
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1278 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1279 .legalFor(Types: {{S32, S32}, {S32, S64}})
1280 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1281 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1282 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1283 .scalarize(TypeIdx: 0)
1284 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1285 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1286
1287 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1288 .legalFor(Types: {{S32, S32}, {S32, S64}})
1289 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1290 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1291 .scalarize(TypeIdx: 0)
1292 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1293 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1294
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296 // RegBankSelect.
1297 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1298 .legalFor(Types: {S32, S64})
1299 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1300 .scalarize(TypeIdx: 0)
1301 .widenScalarToNextPow2(TypeIdx: 0);
1302
1303 if (ST.has16BitInsts()) {
1304 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1305 .legalFor(Types: {S16, S32, V2S16})
1306 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1309 .widenScalarToNextPow2(TypeIdx: 0)
1310 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1311 .scalarize(TypeIdx: 0);
1312
1313 if (ST.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1315 .legalFor(Types: {S32, S16, V2S16})
1316 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1317 .minScalar(TypeIdx: 0, Ty: S16)
1318 .widenScalarToNextPow2(TypeIdx: 0)
1319 .scalarize(TypeIdx: 0)
1320 .lower();
1321 } else {
1322 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1323 .legalFor(Types: {S32, S16})
1324 .widenScalarToNextPow2(TypeIdx: 0)
1325 .minScalar(TypeIdx: 0, Ty: S16)
1326 .scalarize(TypeIdx: 0)
1327 .lower();
1328 }
1329 } else {
1330 // TODO: Should have same legality without v_perm_b32
1331 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1332 .legalFor(Types: {S32})
1333 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1336 .widenScalarToNextPow2(TypeIdx: 0)
1337 .maxScalar(TypeIdx: 0, Ty: S32)
1338 .scalarize(TypeIdx: 0)
1339 .lower();
1340
1341 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1342 .legalFor(Types: {S32})
1343 .minScalar(TypeIdx: 0, Ty: S32)
1344 .widenScalarToNextPow2(TypeIdx: 0)
1345 .scalarize(TypeIdx: 0)
1346 .lower();
1347 }
1348
1349 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1350 // List the common cases
1351 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1352 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1353 .scalarize(TypeIdx: 0)
1354 // Accept any address space as long as the size matches
1355 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1356 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1357 Mutation: [](const LegalityQuery &Query) {
1358 return std::pair(
1359 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1360 })
1361 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1362 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1363 });
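
  // For example, G_INTTOPTR of an s32 into a 64-bit flat pointer first widens
  // the integer operand to s64, while an s128 source is narrowed to s64; the
  // pointer-sized case is legal directly.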
1364
1365 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1366 // List the common cases
1367 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1368 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1369 .scalarize(TypeIdx: 0)
1370 // Accept any address space as long as the size matches
1371 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1372 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1373 Mutation: [](const LegalityQuery &Query) {
1374 return std::pair(
1375 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1376 })
1377 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1378 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1379 });
1380
1381 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1382 .scalarize(TypeIdx: 0)
1383 .custom();
1384
1385 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1386 bool IsLoad) -> bool {
1387 const LLT DstTy = Query.Types[0];
1388
1389 // Split vector extloads.
1390 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1391
1392 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1393 return true;
1394
1395 const LLT PtrTy = Query.Types[1];
1396 unsigned AS = PtrTy.getAddressSpace();
1397 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1398 IsAtomic: Query.MMODescrs[0].Ordering !=
1399 AtomicOrdering::NotAtomic))
1400 return true;
1401
1402 // Catch weird sized loads that don't evenly divide into the access sizes
1403 // TODO: May be able to widen depending on alignment etc.
1404 unsigned NumRegs = (MemSize + 31) / 32;
1405 if (NumRegs == 3) {
1406 if (!ST.hasDwordx3LoadStores())
1407 return true;
1408 } else {
1409 // If the alignment allows, these should have been widened.
1410 if (!isPowerOf2_32(Value: NumRegs))
1411 return true;
1412 }
1413
1414 return false;
1415 };
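
  // For example, a 96-bit access must be split on subtargets without dwordx3
  // load/store support, and a 160-bit access (five dwords, not a power of two)
  // is split here as well.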
1416
1417 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1420
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1422 // LDS
1423 // TODO: Unsupported flat for SI.
1424
1425 for (unsigned Op : {G_LOAD, G_STORE}) {
1426 const bool IsStore = Op == G_STORE;
1427
1428 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1429 // Explicitly list some common cases.
1430 // TODO: Does this help compile time at all?
1431 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1432 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1433 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1434 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1435 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1436 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1437 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1438 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1439
1440 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1441 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1442 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1443 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1444 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1445 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1446
1447 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1448 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1449 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1450 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1451
1452 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1453 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1454 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1455 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1456 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1457 Actions.legalIf(
1458 Predicate: [=](const LegalityQuery &Query) -> bool {
1459 return isLoadStoreLegal(ST, Query);
1460 });
1461
1462 // The custom pointers (fat pointers, buffer resources) don't work with load
1463 // and store at this level. Fat pointers should have been lowered to
1464 // intrinsics before the translation to MIR.
1465 Actions.unsupportedIf(
1466 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1467
1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469 // ptrtoint. This is needed to account for the fact that we can't have i128
1470 // as a register class for SelectionDAG reasons.
1471 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1472 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1473 });
1474
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1477 //
1478 // TODO: Should generalize bitcast action into coerce, which will also cover
1479 // inserting addrspacecasts.
1480 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1481
1482 // Turn any illegal element vectors into something easier to deal
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1484 // parts anyway.
1485 //
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1488 Actions.bitcastIf(
1489 Predicate: [=](const LegalityQuery &Query) -> bool {
1490 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1491 MemTy: Query.MMODescrs[0].MemoryTy);
1492 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1493
1494 if (!IsStore) {
1495 // Widen suitably aligned loads by loading extra bytes. The standard
1496 // legalization actions can't properly express widening memory operands.
1497 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1498 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1499 });
1500 }
1501
1502 // FIXME: load/store narrowing should be moved to lower action
1503 Actions
1504 .narrowScalarIf(
1505 Predicate: [=](const LegalityQuery &Query) -> bool {
1506 return !Query.Types[0].isVector() &&
1507 needToSplitMemOp(Query, Op == G_LOAD);
1508 },
1509 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1510 const LLT DstTy = Query.Types[0];
1511 const LLT PtrTy = Query.Types[1];
1512
1513 const unsigned DstSize = DstTy.getSizeInBits();
1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1515
1516 // Split extloads.
1517 if (DstSize > MemSize)
1518 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1519
1520 unsigned MaxSize = maxSizeForAddrSpace(
1521 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1522 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1523 if (MemSize > MaxSize)
1524 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1525
1526 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527 return std::pair(0, LLT::scalar(SizeInBits: Align));
1528 })
1529 .fewerElementsIf(
1530 Predicate: [=](const LegalityQuery &Query) -> bool {
1531 return Query.Types[0].isVector() &&
1532 needToSplitMemOp(Query, Op == G_LOAD);
1533 },
1534 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1535 const LLT DstTy = Query.Types[0];
1536 const LLT PtrTy = Query.Types[1];
1537
1538 LLT EltTy = DstTy.getElementType();
1539 unsigned MaxSize = maxSizeForAddrSpace(
1540 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1541 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1542
1543              // FIXME: Better handle results that were widened to a power of 2.
1544              // This ends up scalarizing.
1545              // FIXME: 3-element stores are scalarized on SI.
1546
1547 // Split if it's too large for the address space.
1548 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549 if (MemSize > MaxSize) {
1550 unsigned NumElts = DstTy.getNumElements();
1551 unsigned EltSize = EltTy.getSizeInBits();
1552
1553 if (MaxSize % EltSize == 0) {
1554 return std::pair(
1555 0, LLT::scalarOrVector(
1556 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1557 }
1558
1559 unsigned NumPieces = MemSize / MaxSize;
1560
1561 // FIXME: Refine when odd breakdowns handled
1562 // The scalars will need to be re-legalized.
1563 if (NumPieces == 1 || NumPieces >= NumElts ||
1564 NumElts % NumPieces != 0)
1565 return std::pair(0, EltTy);
1566
1567 return std::pair(0,
1568 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1569 }
1570
1571 // FIXME: We could probably handle weird extending loads better.
1572 if (DstTy.getSizeInBits() > MemSize)
1573 return std::pair(0, EltTy);
1574
1575 unsigned EltSize = EltTy.getSizeInBits();
1576 unsigned DstSize = DstTy.getSizeInBits();
1577 if (!isPowerOf2_32(Value: DstSize)) {
1578 // We're probably decomposing an odd sized store. Try to split
1579 // to the widest type. TODO: Account for alignment. As-is it
1580 // should be OK, since the new parts will be further legalized.
1581 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1582 return std::pair(
1583 0, LLT::scalarOrVector(
1584 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1585 }
1586
1587 // May need relegalization for the scalars.
1588 return std::pair(0, EltTy);
1589 })
1590 .minScalar(TypeIdx: 0, Ty: S32)
1591 .narrowScalarIf(Predicate: isWideScalarExtLoadTruncStore(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: S32))
1592 .widenScalarToNextPow2(TypeIdx: 0)
1593 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1594 .lower();
1595 }
1596
1597 // FIXME: Unaligned accesses not lowered.
1598 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1599 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1600 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1601 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1602 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1603 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1604 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1605 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1606 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1607 .legalIf(
1608 Predicate: [=](const LegalityQuery &Query) -> bool {
1609 return isLoadStoreLegal(ST, Query);
1610 });
1611
1612 if (ST.hasFlatAddressSpace()) {
1613 ExtLoads.legalForTypesWithMemDesc(
1614 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1615 }
1616
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1619 //
1620 // TODO: Should generalize bitcast action into coerce, which will also cover
1621 // inserting addrspacecasts.
1622 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1623
1624 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1625 .widenScalarToNextPow2(TypeIdx: 0)
1626 .lower();
1627
1628 auto &Atomics = getActionDefinitionsBuilder(
1629 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1630 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1631 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1632 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1633 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1634 {S64, GlobalPtr}, {S64, LocalPtr},
1635 {S32, RegionPtr}, {S64, RegionPtr}});
1636 if (ST.hasFlatAddressSpace()) {
1637 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1638 }
1639
1640 // TODO: v2bf16 operations, and fat buffer pointer support.
1641 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1642 if (ST.hasLDSFPAtomicAddF32()) {
1643 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1644 if (ST.hasLdsAtomicAddF64())
1645 Atomic.legalFor(Types: {{S64, LocalPtr}});
1646 if (ST.hasAtomicDsPkAdd16Insts())
1647 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1648 }
1649 if (ST.hasAtomicFaddInsts())
1650 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1651 if (ST.hasFlatAtomicFaddF32Inst())
1652 Atomic.legalFor(Types: {{S32, FlatPtr}});
1653
1654 if (ST.hasGFX90AInsts()) {
1655 // These are legal with some caveats, and should have undergone expansion in
1656 // the IR in most situations
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor(Types: {
1659 {S32, GlobalPtr},
1660 {S64, GlobalPtr},
1661 {S64, FlatPtr}
1662 });
1663 }
1664
1665 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666 ST.hasAtomicBufferGlobalPkAddF16Insts())
1667 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668 if (ST.hasAtomicGlobalPkAddBF16Inst())
1669 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1670 if (ST.hasAtomicFlatPkAdd16Insts())
1671 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1672
1673
1674 // Most of the legalization work here is done by AtomicExpand. We could
1675 // probably use a simpler legality rule that just assumes anything is OK.
1676 auto &AtomicFMinFMax =
1677 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1678 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1679
1680 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1681 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1682 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1683 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1684 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1685 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1686 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1687 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1688
1689 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1690 // demarshalling
1691 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1692 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1693 {S32, FlatPtr}, {S64, FlatPtr}})
1694 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1695 {S32, RegionPtr}, {S64, RegionPtr}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1697
1698 // Condition should be s32 for scalar, s1 for vector.
1699 getActionDefinitionsBuilder(Opcode: G_SELECT)
1700 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1701 LocalPtr, FlatPtr, PrivatePtr,
1702 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1703 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1704 Types1: {S1, S32})
1705 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1706 .scalarize(TypeIdx: 1)
1707 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1708 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1709 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1710 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1711 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1712 .scalarize(TypeIdx: 0)
1713 .widenScalarToNextPow2(TypeIdx: 0)
1714 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1715
1716 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1717 // be more flexible with the shift amount type.
1718 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1719 .legalFor(Types: {{S32, S32}, {S64, S32}});
1720 if (ST.has16BitInsts()) {
1721 if (ST.hasVOP3PInsts()) {
1722 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1723 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1724 } else
1725 Shifts.legalFor(Types: {{S16, S16}});
1726
1727 // TODO: Support 16-bit shift amounts for all types
1728 Shifts.widenScalarIf(
1729 Predicate: [=](const LegalityQuery &Query) {
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731 // 32-bit amount.
1732 const LLT ValTy = Query.Types[0];
1733 const LLT AmountTy = Query.Types[1];
1734 return ValTy.getSizeInBits() <= 16 &&
1735 AmountTy.getSizeInBits() < 16;
1736 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1737 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1738 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1739 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1740 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1741
1742 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1743 .minScalar(TypeIdx: 0, Ty: S16)
1744 .scalarize(TypeIdx: 0)
1745 .lower();
1746 } else {
1747 // Make sure we legalize the shift amount type first, as the general
1748 // expansion for the shifted type will produce much worse code if it hasn't
1749 // been truncated already.
1750 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1751 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1752 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1753
1754 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1755 .minScalar(TypeIdx: 0, Ty: S32)
1756 .scalarize(TypeIdx: 0)
1757 .lower();
1758 }
1759 Shifts.scalarize(TypeIdx: 0);
1760
1761 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1762 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1763 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1764 unsigned IdxTypeIdx = 2;
1765
1766 getActionDefinitionsBuilder(Opcode: Op)
1767 .customIf(Predicate: [=](const LegalityQuery &Query) {
1768 const LLT EltTy = Query.Types[EltTypeIdx];
1769 const LLT VecTy = Query.Types[VecTypeIdx];
1770 const LLT IdxTy = Query.Types[IdxTypeIdx];
1771 const unsigned EltSize = EltTy.getSizeInBits();
1772 const bool isLegalVecType =
1773 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1775 // below will try to bitcast them to 2N x s64, which will fail.
1776          // Therefore, as an intermediate step, ptrtoint the vector and scalar
1777          // arguments (and inttoptr the extraction result) in order to produce
1778          // a vector operation on integers that can be handled by the logic
1779          // below.
1780 if (EltTy.isPointer() && EltSize > 64)
1781 return true;
1782 return (EltSize == 32 || EltSize == 64) &&
1783 VecTy.getSizeInBits() % 32 == 0 &&
1784 VecTy.getSizeInBits() <= MaxRegisterSize &&
1785 IdxTy.getSizeInBits() == 32 &&
1786 isLegalVecType;
1787 })
1788 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1789 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1790 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1791 .bitcastIf(
1792 Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1793 Mutation: [=](const LegalityQuery &Query) {
1794 // For > 64-bit element types, try to turn this into a 64-bit
1795 // element vector since we may be able to do better indexing
1796 // if this is scalar. If not, fall back to 32.
1797 const LLT EltTy = Query.Types[EltTypeIdx];
1798 const LLT VecTy = Query.Types[VecTypeIdx];
1799 const unsigned DstEltSize = EltTy.getSizeInBits();
1800 const unsigned VecSize = VecTy.getSizeInBits();
1801
1802 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1803 return std::pair(
1804 VecTypeIdx,
1805 LLT::fixed_vector(NumElements: VecSize / TargetEltSize, ScalarSizeInBits: TargetEltSize));
1806 })
1807 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1808 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1809 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1810 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1811 // TODO: Clamp elements for 64-bit vectors?
1812 .moreElementsIf(
1813 Predicate: isIllegalRegisterType(TypeIdx: VecTypeIdx),
1814 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1815 // It should only be necessary with variable indexes.
1816 // As a last resort, lower to the stack
1817 .lower();
1818 }
1819
1820 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1821 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1822 const LLT &EltTy = Query.Types[1].getElementType();
1823 return Query.Types[0] != EltTy;
1824 });
1825
1826 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1827 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1828 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1829
1830 // FIXME: Doesn't handle extract of illegal sizes.
1831 getActionDefinitionsBuilder(Opcode: Op)
1832 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1833 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1834          // Sub-vector (or single element) insert and extract.
1835 // TODO: verify immediate offset here since lower only works with
1836 // whole elements.
1837 const LLT BigTy = Query.Types[BigTyIdx];
1838 return BigTy.isVector();
1839 })
1840 // FIXME: Multiples of 16 should not be legal.
1841 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1842 const LLT BigTy = Query.Types[BigTyIdx];
1843 const LLT LitTy = Query.Types[LitTyIdx];
1844 return (BigTy.getSizeInBits() % 32 == 0) &&
1845 (LitTy.getSizeInBits() % 16 == 0);
1846 })
1847 .widenScalarIf(
1848 Predicate: [=](const LegalityQuery &Query) {
1849 const LLT BigTy = Query.Types[BigTyIdx];
1850 return (BigTy.getScalarSizeInBits() < 16);
1851 },
1852 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1853 .widenScalarIf(
1854 Predicate: [=](const LegalityQuery &Query) {
1855 const LLT LitTy = Query.Types[LitTyIdx];
1856 return (LitTy.getScalarSizeInBits() < 16);
1857 },
1858 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1859 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1860 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1861
1862 }
1863
1864 auto &BuildVector = getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1865 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1866 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1867 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1868 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1869 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1870 .moreElementsIf(
1871 Predicate: isIllegalRegisterType(TypeIdx: 0),
1872 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1873
1874 if (ST.hasScalarPackInsts()) {
1875 BuildVector
1876 // FIXME: Should probably widen s1 vectors straight to s32
1877 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1878 .minScalar(TypeIdx: 1, Ty: S16);
1879
1880 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1881 .legalFor(Types: {V2S16, S32})
1882 .lower();
1883 } else {
1884 BuildVector.customFor(Types: {V2S16, S16});
1885 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1886
1887 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1888 .customFor(Types: {V2S16, S32})
1889 .lower();
1890 }
1891
1892 BuildVector.legalIf(Predicate: isRegisterType(TypeIdx: 0));
1893
1894 // FIXME: Clamp maximum size
1895 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1896 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1897 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
1898 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
1899 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
1900
1901 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
1902
1903 // Merge/Unmerge
1904 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1905 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1906 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1907
1908 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1909 const LLT Ty = Query.Types[TypeIdx];
1910 if (Ty.isVector()) {
1911 const LLT &EltTy = Ty.getElementType();
1912 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1913 return true;
1914 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
1915 return true;
1916 }
1917 return false;
1918 };
1919
1920 auto &Builder = getActionDefinitionsBuilder(Opcode: Op)
1921 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1922 .lowerFor(Types: {{S16, V2S16}})
1923 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1924 const LLT BigTy = Query.Types[BigTyIdx];
1925 return BigTy.getSizeInBits() == 32;
1926 })
1927 // Try to widen to s16 first for small types.
1928 // TODO: Only do this on targets with legal s16 shifts
1929 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
1930 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
1931 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1932 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
1933 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
1934 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
1935        // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1936 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1937 // valid.
1938 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
1939 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
1940 // Break up vectors with weird elements into scalars
1941 .fewerElementsIf(
1942 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1943 Mutation: scalarize(TypeIdx: 0))
1944 .fewerElementsIf(
1945 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1946 Mutation: scalarize(TypeIdx: 1))
1947 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
1948
1949 if (Op == G_MERGE_VALUES) {
1950 Builder.widenScalarIf(
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1952 Predicate: [=](const LegalityQuery &Query) {
1953 const LLT Ty = Query.Types[LitTyIdx];
1954 return Ty.getSizeInBits() < 32;
1955 },
1956 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
1957 }
1958
1959 Builder.widenScalarIf(
1960 Predicate: [=](const LegalityQuery &Query) {
1961 const LLT Ty = Query.Types[BigTyIdx];
1962 return Ty.getSizeInBits() % 16 != 0;
1963 },
1964 Mutation: [=](const LegalityQuery &Query) {
1965                  // Pick the next power of 2, or a multiple of 64 over 128,
1966                  // whichever is smaller.
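                      // e.g. an 88-bit type widens to the next power of 2 (128),
                      // while a 264-bit type widens to 320 (the next multiple of
                      // 64) rather than 512.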
1967 const LLT &Ty = Query.Types[BigTyIdx];
1968 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
1969 if (NewSizeInBits >= 256) {
1970 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
1971 if (RoundedTo < NewSizeInBits)
1972 NewSizeInBits = RoundedTo;
1973 }
1974 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
1975 })
1976 // Any vectors left are the wrong size. Scalarize them.
1977 .scalarize(TypeIdx: 0)
1978 .scalarize(TypeIdx: 1);
1979 }
1980
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1982 // RegBankSelect.
1983 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
1984 .legalFor(Types: {{S32}, {S64}});
1985
1986 if (ST.hasVOP3PInsts()) {
1987 SextInReg.lowerFor(Types: {{V2S16}})
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1989 // get more vector shift opportunities, since we'll get those when
1990 // expanded.
1991 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1992 } else if (ST.has16BitInsts()) {
1993 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
1994 } else {
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
1996 // shifts. This avoid a lot of intermediate truncate and extend operations.
1997 SextInReg.lowerFor(Types: {{S32}, {S64}});
1998 }
1999
2000 SextInReg
2001 .scalarize(TypeIdx: 0)
2002 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2003 .lower();
2004
2005 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2006 .scalarize(TypeIdx: 0)
2007 .lower();
2008
2009  // TODO: Only try to form v2s16 with legal packed instructions.
2010 getActionDefinitionsBuilder(Opcode: G_FSHR)
2011 .legalFor(Types: {{S32, S32}})
2012 .lowerFor(Types: {{V2S16, V2S16}})
2013 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2014 .scalarize(TypeIdx: 0)
2015 .lower();
2016
2017 if (ST.hasVOP3PInsts()) {
2018 getActionDefinitionsBuilder(Opcode: G_FSHL)
2019 .lowerFor(Types: {{V2S16, V2S16}})
2020 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2021 .scalarize(TypeIdx: 0)
2022 .lower();
2023 } else {
2024 getActionDefinitionsBuilder(Opcode: G_FSHL)
2025 .scalarize(TypeIdx: 0)
2026 .lower();
2027 }
2028
2029 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2030 .legalFor(Types: {S64});
2031
2032 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2033
2034 getActionDefinitionsBuilder(Opcode: G_FENCE)
2035 .alwaysLegal();
2036
2037 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2038 .scalarize(TypeIdx: 0)
2039 .minScalar(TypeIdx: 0, Ty: S32)
2040 .lower();
2041
2042 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2043 .legalFor(Types: {{S32, S32}, {S64, S32}})
2044 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2045 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2046 .widenScalarToNextPow2(TypeIdx: 0)
2047 .scalarize(TypeIdx: 0);
2048
2049 getActionDefinitionsBuilder(
2050 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2051 G_FCOPYSIGN,
2052
2053 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2054 G_READ_REGISTER, G_WRITE_REGISTER,
2055
2056 G_SADDO, G_SSUBO})
2057 .lower();
2058
2059 if (ST.hasIEEEMinMax()) {
2060 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2061 .legalFor(Types: FPTypesPK16)
2062 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2063 .scalarize(TypeIdx: 0);
2064 } else {
2065 // TODO: Implement
2066 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM}).lower();
2067 }
2068
2069 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2070 .lower();
2071
2072 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2073
2074 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2075 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2076 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2077 .unsupported();
2078
2079 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2080
2081 getLegacyLegalizerInfo().computeTables();
2082 verify(MII: *ST.getInstrInfo());
2083}
2084
2085bool AMDGPULegalizerInfo::legalizeCustom(
2086 LegalizerHelper &Helper, MachineInstr &MI,
2087 LostDebugLocObserver &LocObserver) const {
2088 MachineIRBuilder &B = Helper.MIRBuilder;
2089 MachineRegisterInfo &MRI = *B.getMRI();
2090
2091 switch (MI.getOpcode()) {
2092 case TargetOpcode::G_ADDRSPACE_CAST:
2093 return legalizeAddrSpaceCast(MI, MRI, B);
2094 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2095 return legalizeFroundeven(MI, MRI, B);
2096 case TargetOpcode::G_FCEIL:
2097 return legalizeFceil(MI, MRI, B);
2098 case TargetOpcode::G_FREM:
2099 return legalizeFrem(MI, MRI, B);
2100 case TargetOpcode::G_INTRINSIC_TRUNC:
2101 return legalizeIntrinsicTrunc(MI, MRI, B);
2102 case TargetOpcode::G_SITOFP:
2103 return legalizeITOFP(MI, MRI, B, Signed: true);
2104 case TargetOpcode::G_UITOFP:
2105 return legalizeITOFP(MI, MRI, B, Signed: false);
2106 case TargetOpcode::G_FPTOSI:
2107 return legalizeFPTOI(MI, MRI, B, Signed: true);
2108 case TargetOpcode::G_FPTOUI:
2109 return legalizeFPTOI(MI, MRI, B, Signed: false);
2110 case TargetOpcode::G_FMINNUM:
2111 case TargetOpcode::G_FMAXNUM:
2112 case TargetOpcode::G_FMINNUM_IEEE:
2113 case TargetOpcode::G_FMAXNUM_IEEE:
2114 return legalizeMinNumMaxNum(Helper, MI);
2115 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2116 return legalizeExtractVectorElt(MI, MRI, B);
2117 case TargetOpcode::G_INSERT_VECTOR_ELT:
2118 return legalizeInsertVectorElt(MI, MRI, B);
2119 case TargetOpcode::G_FSIN:
2120 case TargetOpcode::G_FCOS:
2121 return legalizeSinCos(MI, MRI, B);
2122 case TargetOpcode::G_GLOBAL_VALUE:
2123 return legalizeGlobalValue(MI, MRI, B);
2124 case TargetOpcode::G_LOAD:
2125 case TargetOpcode::G_SEXTLOAD:
2126 case TargetOpcode::G_ZEXTLOAD:
2127 return legalizeLoad(Helper, MI);
2128 case TargetOpcode::G_STORE:
2129 return legalizeStore(Helper, MI);
2130 case TargetOpcode::G_FMAD:
2131 return legalizeFMad(MI, MRI, B);
2132 case TargetOpcode::G_FDIV:
2133 return legalizeFDIV(MI, MRI, B);
2134 case TargetOpcode::G_FFREXP:
2135 return legalizeFFREXP(MI, MRI, B);
2136 case TargetOpcode::G_FSQRT:
2137 return legalizeFSQRT(MI, MRI, B);
2138 case TargetOpcode::G_UDIV:
2139 case TargetOpcode::G_UREM:
2140 case TargetOpcode::G_UDIVREM:
2141 return legalizeUnsignedDIV_REM(MI, MRI, B);
2142 case TargetOpcode::G_SDIV:
2143 case TargetOpcode::G_SREM:
2144 case TargetOpcode::G_SDIVREM:
2145 return legalizeSignedDIV_REM(MI, MRI, B);
2146 case TargetOpcode::G_ATOMIC_CMPXCHG:
2147 return legalizeAtomicCmpXChg(MI, MRI, B);
2148 case TargetOpcode::G_FLOG2:
2149 return legalizeFlog2(MI, B);
2150 case TargetOpcode::G_FLOG:
2151 case TargetOpcode::G_FLOG10:
2152 return legalizeFlogCommon(MI, B);
2153 case TargetOpcode::G_FEXP2:
2154 return legalizeFExp2(MI, B);
2155 case TargetOpcode::G_FEXP:
2156 case TargetOpcode::G_FEXP10:
2157 return legalizeFExp(MI, B);
2158 case TargetOpcode::G_FPOW:
2159 return legalizeFPow(MI, B);
2160 case TargetOpcode::G_FFLOOR:
2161 return legalizeFFloor(MI, MRI, B);
2162 case TargetOpcode::G_BUILD_VECTOR:
2163 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2164 return legalizeBuildVector(MI, MRI, B);
2165 case TargetOpcode::G_MUL:
2166 return legalizeMul(Helper, MI);
2167 case TargetOpcode::G_CTLZ:
2168 case TargetOpcode::G_CTTZ:
2169 return legalizeCTLZ_CTTZ(MI, MRI, B);
2170 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2171 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2172 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2173 return legalizeFPTruncRound(MI, B);
2174 case TargetOpcode::G_STACKSAVE:
2175 return legalizeStackSave(MI, B);
2176 case TargetOpcode::G_GET_FPENV:
2177 return legalizeGetFPEnv(MI, MRI, B);
2178 case TargetOpcode::G_SET_FPENV:
2179 return legalizeSetFPEnv(MI, MRI, B);
2180 case TargetOpcode::G_TRAP:
2181 return legalizeTrap(MI, MRI, B);
2182 case TargetOpcode::G_DEBUGTRAP:
2183 return legalizeDebugTrap(MI, MRI, B);
2184 default:
2185 return false;
2186 }
2187
2188 llvm_unreachable("expected switch to return");
2189}
2190
2191Register AMDGPULegalizerInfo::getSegmentAperture(
2192 unsigned AS,
2193 MachineRegisterInfo &MRI,
2194 MachineIRBuilder &B) const {
2195 MachineFunction &MF = B.getMF();
2196 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2197 const LLT S32 = LLT::scalar(SizeInBits: 32);
2198 const LLT S64 = LLT::scalar(SizeInBits: 64);
2199
2200 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2201
2202 if (ST.hasApertureRegs()) {
2203 // Note: this register is somewhat broken. When used as a 32-bit operand,
2204 // it only returns zeroes. The real value is in the upper 32 bits.
2205    // Thus, we must extract the high 32 bits.
2206 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2207 ? AMDGPU::SRC_SHARED_BASE
2208 : AMDGPU::SRC_PRIVATE_BASE;
2209 // FIXME: It would be more natural to emit a COPY here, but then copy
2210 // coalescing would kick in and it would think it's okay to use the "HI"
2211 // subregister (instead of extracting the HI 32 bits) which is an artificial
2212 // (unusable) register.
2213 // Register TableGen definitions would need an overhaul to get rid of the
2214 // artificial "HI" aperture registers and prevent this kind of issue from
2215 // happening.
2216 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2217 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2218 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {Dst}, SrcOps: {Register(ApertureRegNo)});
2219 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2220 }
2221
2222 // TODO: can we be smarter about machine pointer info?
2223 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2224 Register LoadAddr = MRI.createGenericVirtualRegister(
2225 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2226 // For code object version 5, private_base and shared_base are passed through
2227 // implicit kernargs.
2228 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2229 AMDGPU::AMDHSA_COV5) {
2230 AMDGPUTargetLowering::ImplicitParameter Param =
2231 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2232 : AMDGPUTargetLowering::PRIVATE_BASE;
2233 uint64_t Offset =
2234 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2235
2236 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2237 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2238
2239 if (!loadInputValue(DstReg: KernargPtrReg, B,
2240 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2241 return Register();
2242
2243 MachineMemOperand *MMO = MF.getMachineMemOperand(
2244 PtrInfo,
2245 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2246 MachineMemOperand::MOInvariant,
2247 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2248
2249 // Pointer address
2250 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
2251 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2252 // Load address
2253 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2254 }
2255
2256 Register QueuePtr = MRI.createGenericVirtualRegister(
2257 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2258
2259 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2260 return Register();
2261
2262 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2263 // private_segment_aperture_base_hi.
2264 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2265
2266 MachineMemOperand *MMO = MF.getMachineMemOperand(
2267 PtrInfo,
2268 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2269 MachineMemOperand::MOInvariant,
2270 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2271
2272 B.buildPtrAdd(Res: LoadAddr, Op0: QueuePtr,
2273 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2274 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2275}
2276
2277/// Return true if the value is a known valid address, such that a null check is
2278/// not necessary.
2279static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2280 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2281 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2282 switch (Def->getOpcode()) {
2283 case AMDGPU::G_FRAME_INDEX:
2284 case AMDGPU::G_GLOBAL_VALUE:
2285 case AMDGPU::G_BLOCK_ADDR:
2286 return true;
2287 case AMDGPU::G_CONSTANT: {
2288 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2290 }
2291 default:
2292 return false;
2293 }
2294
2295 return false;
2296}
2297
2298bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2299 MachineInstr &MI, MachineRegisterInfo &MRI,
2300 MachineIRBuilder &B) const {
2301 MachineFunction &MF = B.getMF();
2302
2303 // MI can either be a G_ADDRSPACE_CAST or a
2304 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2305 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2306 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2307 Intrinsic::amdgcn_addrspacecast_nonnull));
2308
2309 const LLT S32 = LLT::scalar(SizeInBits: 32);
2310 Register Dst = MI.getOperand(i: 0).getReg();
2311 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2312 : MI.getOperand(i: 1).getReg();
2313 LLT DstTy = MRI.getType(Reg: Dst);
2314 LLT SrcTy = MRI.getType(Reg: Src);
2315 unsigned DestAS = DstTy.getAddressSpace();
2316 unsigned SrcAS = SrcTy.getAddressSpace();
2317
2318 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2319 // vector element.
2320 assert(!DstTy.isVector());
2321
2322 const AMDGPUTargetMachine &TM
2323 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2324
2325 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2326 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2327 return true;
2328 }
2329
2330 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2331 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2332 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2333    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2334 // G_ADDRSPACE_CAST we need to guess.
2335 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2336 // Extract low 32-bits of the pointer.
2337 B.buildExtract(Res: Dst, Src, Index: 0);
2338 MI.eraseFromParent();
2339 return true;
2340 }
2341
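      // Otherwise guard the truncation with a null check, roughly:
      //   dst = (src != flat_null) ? extract_lo32(src) : segment_null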
2342 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2343
2344 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2345 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2346
2347 // Extract low 32-bits of the pointer.
2348 auto PtrLo32 = B.buildExtract(Res: DstTy, Src, Index: 0);
2349
2350 auto CmpRes =
2351 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2352 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2353
2354 MI.eraseFromParent();
2355 return true;
2356 }
2357
2358 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2359 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2360 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2361 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2362 if (!ApertureReg.isValid())
2363 return false;
2364
2365 // Coerce the type of the low half of the result so we can use merge_values.
2366 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2367
2368 // TODO: Should we allow mismatched types but matching sizes in merges to
2369 // avoid the ptrtoint?
2370 auto BuildPtr = B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, ApertureReg});
2371
2372    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2373 // G_ADDRSPACE_CAST we need to guess.
2374 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2375 B.buildCopy(Res: Dst, Op: BuildPtr);
2376 MI.eraseFromParent();
2377 return true;
2378 }
2379
2380 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2381 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2382
2383 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2384 Op1: SegmentNull.getReg(Idx: 0));
2385
2386 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2387
2388 MI.eraseFromParent();
2389 return true;
2390 }
2391
2392 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2393 SrcTy.getSizeInBits() == 64) {
2394 // Truncate.
2395 B.buildExtract(Res: Dst, Src, Index: 0);
2396 MI.eraseFromParent();
2397 return true;
2398 }
2399
2400 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2401 DstTy.getSizeInBits() == 64) {
2402 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2404 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2405 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2406 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2407 MI.eraseFromParent();
2408 return true;
2409 }
2410
2411 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2412 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2413
2414 LLVMContext &Ctx = MF.getFunction().getContext();
2415 Ctx.diagnose(DI: InvalidAddrSpaceCast);
2416 B.buildUndef(Res: Dst);
2417 MI.eraseFromParent();
2418 return true;
2419}
2420
2421bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2422 MachineRegisterInfo &MRI,
2423 MachineIRBuilder &B) const {
2424 Register Src = MI.getOperand(i: 1).getReg();
2425 LLT Ty = MRI.getType(Reg: Src);
2426 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2427
2428 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2429 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2430
2431 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2432 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2433
2434 // TODO: Should this propagate fast-math-flags?
2435 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2436 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2437
2438 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2439 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2440
2441 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2442 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2443 MI.eraseFromParent();
2444 return true;
2445}
2446
2447bool AMDGPULegalizerInfo::legalizeFceil(
2448 MachineInstr &MI, MachineRegisterInfo &MRI,
2449 MachineIRBuilder &B) const {
2450
2451 const LLT S1 = LLT::scalar(SizeInBits: 1);
2452 const LLT S64 = LLT::scalar(SizeInBits: 64);
2453
2454 Register Src = MI.getOperand(i: 1).getReg();
2455 assert(MRI.getType(Src) == S64);
2456
2457 // result = trunc(src)
2458 // if (src > 0.0 && src != result)
2459 // result += 1.0
2460
2461 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2462
2463 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2464 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2465 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2466 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2467 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2468 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2469
2470 // TODO: Should this propagate fast-math-flags?
2471 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2472 MI.eraseFromParent();
2473 return true;
2474}
2475
2476bool AMDGPULegalizerInfo::legalizeFrem(
2477 MachineInstr &MI, MachineRegisterInfo &MRI,
2478 MachineIRBuilder &B) const {
2479 Register DstReg = MI.getOperand(i: 0).getReg();
2480 Register Src0Reg = MI.getOperand(i: 1).getReg();
2481 Register Src1Reg = MI.getOperand(i: 2).getReg();
2482 auto Flags = MI.getFlags();
2483 LLT Ty = MRI.getType(Reg: DstReg);
2484
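      // Expand as fmod(x, y) == x - trunc(x / y) * y, i.e. fma(-trunc(x / y), y, x).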
2485 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2486 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2487 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2488 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2489 MI.eraseFromParent();
2490 return true;
2491}
2492
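    // Extracts the unbiased exponent of an f64 from its high 32 bits: the 11-bit
    // biased exponent is pulled out with amdgcn.ubfe and the bias (1023) is
    // subtracted.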
2493static MachineInstrBuilder extractF64Exponent(Register Hi,
2494 MachineIRBuilder &B) {
2495 const unsigned FractBits = 52;
2496 const unsigned ExpBits = 11;
2497 LLT S32 = LLT::scalar(SizeInBits: 32);
2498
2499 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2500 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2501
2502 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2503 .addUse(RegNo: Hi)
2504 .addUse(RegNo: Const0.getReg(Idx: 0))
2505 .addUse(RegNo: Const1.getReg(Idx: 0));
2506
2507 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2508}
2509
2510bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2511 MachineInstr &MI, MachineRegisterInfo &MRI,
2512 MachineIRBuilder &B) const {
2513 const LLT S1 = LLT::scalar(SizeInBits: 1);
2514 const LLT S32 = LLT::scalar(SizeInBits: 32);
2515 const LLT S64 = LLT::scalar(SizeInBits: 64);
2516
2517 Register Src = MI.getOperand(i: 1).getReg();
2518 assert(MRI.getType(Src) == S64);
2519
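      // Roughly: clear the fractional bits of the mantissa based on the unbiased
      // exponent. Exponents below 0 produce a signed zero; exponents above 51 mean
      // there are no fractional bits, so the source is returned unchanged.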
2520 // TODO: Should this use extract since the low half is unused?
2521 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2522 Register Hi = Unmerge.getReg(Idx: 1);
2523
2524 // Extract the upper half, since this is where we will find the sign and
2525 // exponent.
2526 auto Exp = extractF64Exponent(Hi, B);
2527
2528 const unsigned FractBits = 52;
2529
2530 // Extract the sign bit.
2531 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2532 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2533
2534 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2535
2536 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2537
2538 // Extend back to 64-bits.
2539 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2540
2541 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2542 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2543 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2544 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2545
2546 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2547 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2548
2549 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2550 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2551 MI.eraseFromParent();
2552 return true;
2553}
2554
2555bool AMDGPULegalizerInfo::legalizeITOFP(
2556 MachineInstr &MI, MachineRegisterInfo &MRI,
2557 MachineIRBuilder &B, bool Signed) const {
2558
2559 Register Dst = MI.getOperand(i: 0).getReg();
2560 Register Src = MI.getOperand(i: 1).getReg();
2561
2562 const LLT S64 = LLT::scalar(SizeInBits: 64);
2563 const LLT S32 = LLT::scalar(SizeInBits: 32);
2564
2565 assert(MRI.getType(Src) == S64);
2566
2567 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2568 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2569
2570 if (MRI.getType(Reg: Dst) == S64) {
2571 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2572 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2573
2574 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2575 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2576
2577 // TODO: Should this propagate fast-math-flags?
2578 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2579 MI.eraseFromParent();
2580 return true;
2581 }
2582
2583 assert(MRI.getType(Dst) == S32);
2584
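      // Roughly: shift the 64-bit input left to drop leading zero (or redundant
      // sign) bits, fold any nonzero discarded low bits into a sticky bit, convert
      // the remaining high 32 bits, then rescale with ldexp by (32 - shift amount).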
2585 auto One = B.buildConstant(Res: S32, Val: 1);
2586
2587 MachineInstrBuilder ShAmt;
2588 if (Signed) {
2589 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2590 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2591 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2592 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2593 auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
2594 .addUse(RegNo: Unmerge.getReg(Idx: 1));
2595 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2596 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2597 } else
2598 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2599 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2600 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2601 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2602 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2603 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2604 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2605 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2606 MI.eraseFromParent();
2607 return true;
2608}
2609
2610// TODO: Copied from DAG implementation. Verify logic and document how this
2611// actually works.
2612bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2613 MachineRegisterInfo &MRI,
2614 MachineIRBuilder &B,
2615 bool Signed) const {
2616
2617 Register Dst = MI.getOperand(i: 0).getReg();
2618 Register Src = MI.getOperand(i: 1).getReg();
2619
2620 const LLT S64 = LLT::scalar(SizeInBits: 64);
2621 const LLT S32 = LLT::scalar(SizeInBits: 32);
2622
2623 const LLT SrcLT = MRI.getType(Reg: Src);
2624 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2625
2626 unsigned Flags = MI.getFlags();
2627
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2629 // integers is illustrated as follows:
2630 //
2631 // tf := trunc(val);
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2634 // hi := fptoi(hif);
2635 // lo := fptoi(lof);
2636 //
2637 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2638 MachineInstrBuilder Sign;
2639 if (Signed && SrcLT == S32) {
2640    // However, a 32-bit floating point number has only a 23-bit mantissa, which
2641    // is not enough to hold all the significant bits of `lof` if val is
2642    // negative. To avoid the loss of precision, we need to take the absolute
2643 // value after truncating and flip the result back based on the original
2644 // signedness.
2645 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2646 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2647 }
2648 MachineInstrBuilder K0, K1;
2649 if (SrcLT == S64) {
2650 K0 = B.buildFConstant(
2651 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2652 K1 = B.buildFConstant(
2653 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2654 } else {
2655 K0 = B.buildFConstant(
2656 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2657 K1 = B.buildFConstant(
2658 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2659 }
2660
2661 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2662 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2663 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2664
2665 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2666 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2667 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2668
2669 if (Signed && SrcLT == S32) {
2670 // Flip the result based on the signedness, which is either all 0s or 1s.
2671 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2672 // r := xor({lo, hi}, sign) - sign;
2673 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2674 Src1: Sign);
2675 } else
2676 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2677 MI.eraseFromParent();
2678
2679 return true;
2680}
2681
2682bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2683 MachineInstr &MI) const {
2684 MachineFunction &MF = Helper.MIRBuilder.getMF();
2685 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2686
2687 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2688 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2689
2690  // With ieee_mode disabled, the instructions already have the correct
2691  // behavior for G_FMINNUM/G_FMAXNUM.
2692 if (!MFI->getMode().IEEE)
2693 return !IsIEEEOp;
2694
2695 if (IsIEEEOp)
2696 return true;
2697
2698 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2699}
2700
2701bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2702 MachineInstr &MI, MachineRegisterInfo &MRI,
2703 MachineIRBuilder &B) const {
2704 // TODO: Should move some of this into LegalizerHelper.
2705
2706 // TODO: Promote dynamic indexing of s16 to s32
2707
2708 Register Dst = MI.getOperand(i: 0).getReg();
2709 Register Vec = MI.getOperand(i: 1).getReg();
2710
2711 LLT VecTy = MRI.getType(Reg: Vec);
2712 LLT EltTy = VecTy.getElementType();
2713 assert(EltTy == MRI.getType(Dst));
2714
2715 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2716  // but we can't go directly to that logic because you can't bitcast a vector
2717 // of pointers to a vector of integers. Therefore, introduce an intermediate
2718 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2719 // drive the legalization forward.
2720 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2721 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2722 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2723
2724 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2725 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2726 B.buildIntToPtr(Dst, Src: IntElt);
2727
2728 MI.eraseFromParent();
2729 return true;
2730 }
2731
2732 // FIXME: Artifact combiner probably should have replaced the truncated
2733 // constant before this, so we shouldn't need
2734 // getIConstantVRegValWithLookThrough.
2735 std::optional<ValueAndVReg> MaybeIdxVal =
2736 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2737 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2738 return true;
2739 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2740
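      // For an in-range constant index, expand to an unmerge and copy out the
      // selected element; an out-of-range constant index produces undef.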
2741 if (IdxVal < VecTy.getNumElements()) {
2742 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2743 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2744 } else {
2745 B.buildUndef(Res: Dst);
2746 }
2747
2748 MI.eraseFromParent();
2749 return true;
2750}
2751
2752bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2753 MachineInstr &MI, MachineRegisterInfo &MRI,
2754 MachineIRBuilder &B) const {
2755 // TODO: Should move some of this into LegalizerHelper.
2756
2757 // TODO: Promote dynamic indexing of s16 to s32
2758
2759 Register Dst = MI.getOperand(i: 0).getReg();
2760 Register Vec = MI.getOperand(i: 1).getReg();
2761 Register Ins = MI.getOperand(i: 2).getReg();
2762
2763 LLT VecTy = MRI.getType(Reg: Vec);
2764 LLT EltTy = VecTy.getElementType();
2765 assert(EltTy == MRI.getType(Ins));
2766
2767 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2768  // but we can't go directly to that logic because you can't bitcast a vector
2769 // of pointers to a vector of integers. Therefore, make the pointer vector
2770 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2771 // new value, and then inttoptr the result vector back. This will then allow
2772 // the rest of legalization to take over.
2773 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2774 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2775 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2776
2777 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2778 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2779 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2780 Idx: MI.getOperand(i: 3));
2781 B.buildIntToPtr(Dst, Src: IntVecDest);
2782 MI.eraseFromParent();
2783 return true;
2784 }
2785
2786 // FIXME: Artifact combiner probably should have replaced the truncated
2787 // constant before this, so we shouldn't need
2788 // getIConstantVRegValWithLookThrough.
2789 std::optional<ValueAndVReg> MaybeIdxVal =
2790 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2791 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2792 return true;
2793
2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2795
2796 unsigned NumElts = VecTy.getNumElements();
2797 if (IdxVal < NumElts) {
2798 SmallVector<Register, 8> SrcRegs;
2799 for (unsigned i = 0; i < NumElts; ++i)
2800 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2801 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2802
2803 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2804 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2805 } else {
2806 B.buildUndef(Res: Dst);
2807 }
2808
2809 MI.eraseFromParent();
2810 return true;
2811}
2812
2813bool AMDGPULegalizerInfo::legalizeSinCos(
2814 MachineInstr &MI, MachineRegisterInfo &MRI,
2815 MachineIRBuilder &B) const {
2816
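// The hardware sin/cos instructions expect an input already scaled by
// 1/(2*pi). A sketch of the expansion built here:
//   sin(x) ~> amdgcn.sin(fract(x * 1/(2*pi)))   on subtargets with a reduced
//                                               trig input range,
//   sin(x) ~> amdgcn.sin(x * 1/(2*pi))          otherwise (likewise for cos).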
2817 Register DstReg = MI.getOperand(i: 0).getReg();
2818 Register SrcReg = MI.getOperand(i: 1).getReg();
2819 LLT Ty = MRI.getType(Reg: DstReg);
2820 unsigned Flags = MI.getFlags();
2821
2822 Register TrigVal;
2823 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
2824 if (ST.hasTrigReducedRange()) {
2825 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
2826 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
2827 .addUse(RegNo: MulVal.getReg(Idx: 0))
2828 .setMIFlags(Flags)
2829 .getReg(Idx: 0);
2830 } else
2831 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
2832
2833 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2834 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2835 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
2836 .addUse(RegNo: TrigVal)
2837 .setMIFlags(Flags);
2838 MI.eraseFromParent();
2839 return true;
2840}
2841
2842bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2843 MachineIRBuilder &B,
2844 const GlobalValue *GV,
2845 int64_t Offset,
2846 unsigned GAFlags) const {
2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2849 // to the following code sequence:
2850 //
2851 // For constant address space:
2852 // s_getpc_b64 s[0:1]
2853 // s_add_u32 s0, s0, $symbol
2854 // s_addc_u32 s1, s1, 0
2855 //
2856 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2857 // a fixup or relocation is emitted to replace $symbol with a literal
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2859 // operand to the global variable.
2860 //
2861 // For global address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2864 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2865 //
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // fixups or relocations are emitted to replace $symbol@*@lo and
2868 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2870 // operand to the global variable.
2871
2872 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2873
2874 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2875 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
2876
2877 MachineInstrBuilder MIB = B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET)
2878 .addDef(RegNo: PCReg);
2879
2880 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
2881 if (GAFlags == SIInstrInfo::MO_NONE)
2882 MIB.addImm(Val: 0);
2883 else
2884 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
2885
2886 if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
2887 B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);
2888
2889 if (PtrTy.getSizeInBits() == 32)
2890 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
2891 return true;
2892}
2893
2894// Emit an ABS32_LO / ABS32_HI relocation stub.
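// A sketch of the sequence built below for a 64-bit pointer (the exact
// relocation spelling is determined by the MC layer):
//   s_mov_b32 s[lo], gv@abs32@lo
//   s_mov_b32 s[hi], gv@abs32@hi
// with the two halves then merged into the 64-bit result.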
2895void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2896 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2897 MachineRegisterInfo &MRI) const {
2898 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2899
2900 LLT S32 = LLT::scalar(SizeInBits: 32);
2901
2902 // Use the destination directly if and only if we only store the lower
2903 // address part and no register class has been set.
2904 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
2905 ? DstReg
2906 : MRI.createGenericVirtualRegister(Ty: S32);
2907
2908 if (!MRI.getRegClassOrNull(Reg: AddrLo))
2909 MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);
2910
2911 // Write the lower half.
2912 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2913 .addDef(RegNo: AddrLo)
2914 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
2915
2916 // If required, write the upper half as well.
2917 if (RequiresHighHalf) {
2918 assert(PtrTy.getSizeInBits() == 64 &&
2919 "Must provide a 64-bit pointer type!");
2920
2921 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
2922 MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);
2923
2924 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2925 .addDef(RegNo: AddrHi)
2926 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);
2927
2928 // Use the destination directly if and only if no register class has been
2929 // set.
2930 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
2931 ? DstReg
2932 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2933
2934 if (!MRI.getRegClassOrNull(Reg: AddrDst))
2935 MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);
2936
2937 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
2938
2939 // If we created a new register for the destination, cast the result into
2940 // the final output.
2941 if (AddrDst != DstReg)
2942 B.buildCast(Dst: DstReg, Src: AddrDst);
2943 } else if (AddrLo != DstReg) {
2944 // If we created a new register for the destination, cast the result into
2945 // the final output.
2946 B.buildCast(Dst: DstReg, Src: AddrLo);
2947 }
2948}
2949
2950bool AMDGPULegalizerInfo::legalizeGlobalValue(
2951 MachineInstr &MI, MachineRegisterInfo &MRI,
2952 MachineIRBuilder &B) const {
2953 Register DstReg = MI.getOperand(i: 0).getReg();
2954 LLT Ty = MRI.getType(Reg: DstReg);
2955 unsigned AS = Ty.getAddressSpace();
2956
2957 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
2958 MachineFunction &MF = B.getMF();
2959 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2960
2961 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2962 if (!MFI->isModuleEntryFunction() &&
2963 GV->getName() != "llvm.amdgcn.module.lds") {
2964 const Function &Fn = MF.getFunction();
2965 DiagnosticInfoUnsupported BadLDSDecl(
2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2967 DS_Warning);
2968 Fn.getContext().diagnose(DI: BadLDSDecl);
2969
2970 // We currently don't have a way to correctly allocate LDS objects that
2971 // aren't directly associated with a kernel. We do force inlining of
2972 // functions that use local objects. However, if these dead functions are
2973 // not eliminated, we don't want a compile time error. Just emit a warning
2974 // and a trap, since there should be no callable path here.
2975 B.buildTrap();
2976 B.buildUndef(Res: DstReg);
2977 MI.eraseFromParent();
2978 return true;
2979 }
2980
2981 // TODO: We could emit code to handle the initialization somewhere.
2982 // We ignore the initializer for now and legalize it to allow selection.
2983 // Any initializer will be diagnosed as an error during assembly emission anyway.
2984 const SITargetLowering *TLI = ST.getTargetLowering();
2985 if (!TLI->shouldUseLDSConstAddress(GV)) {
2986 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2987 return true; // Leave in place;
2988 }
2989
2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991 Type *Ty = GV->getValueType();
2992 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2993 // zero-sized type in other languages, to declare dynamic shared memory
2994 // whose size is not known at compile time. It is allocated by the
2995 // runtime and placed directly after the statically allocated LDS, so
2996 // all such variables share the same offset.
2997 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2998 // Adjust alignment for that dynamic shared memory array.
2999 MFI->setDynLDSAlign(F: MF.getFunction(), GV: *cast<GlobalVariable>(Val: GV));
3000 LLT S32 = LLT::scalar(SizeInBits: 32);
3001 auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
3002 B.buildIntToPtr(Dst: DstReg, Src: Sz);
3003 MI.eraseFromParent();
3004 return true;
3005 }
3006 }
3007
3008 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(),
3009 GV: *cast<GlobalVariable>(Val: GV)));
3010 MI.eraseFromParent();
3011 return true;
3012 }
3013
3014 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3015 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
3016 MI.eraseFromParent();
3017 return true;
3018 }
3019
3020 const SITargetLowering *TLI = ST.getTargetLowering();
3021
3022 if (TLI->shouldEmitFixup(GV)) {
3023 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
3024 MI.eraseFromParent();
3025 return true;
3026 }
3027
3028 if (TLI->shouldEmitPCReloc(GV)) {
3029 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
3030 MI.eraseFromParent();
3031 return true;
3032 }
3033
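// Otherwise go through the GOT: materialize the address of the GOT entry
// pc-relatively, then load the real 64-bit address from it. A rough sketch of
// the final code (the load itself is only selected later):
//   s_getpc_b64    s[0:1]
//   s_add_u32      s0, s0, gv@gotpcrel32@lo
//   s_addc_u32     s1, s1, gv@gotpcrel32@hi
//   s_load_dwordx2 s[0:1], s[0:1], 0x0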
3034 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3035 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
3036
3037 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3038 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3039 PtrInfo: MachinePointerInfo::getGOT(MF),
3040 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3041 MachineMemOperand::MOInvariant,
3042 MemTy: LoadTy, base_alignment: Align(8));
3043
3044 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3045
3046 if (Ty.getSizeInBits() == 32) {
3047 // Truncate if this is a 32-bit constant address.
3048 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3049 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3050 } else
3051 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3052
3053 MI.eraseFromParent();
3054 return true;
3055}
3056
3057static LLT widenToNextPowerOf2(LLT Ty) {
3058 if (Ty.isVector())
3059 return Ty.changeElementCount(
3060 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3061 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3062}
3063
3064bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3065 MachineInstr &MI) const {
3066 MachineIRBuilder &B = Helper.MIRBuilder;
3067 MachineRegisterInfo &MRI = *B.getMRI();
3068 GISelChangeObserver &Observer = Helper.Observer;
3069
3070 Register PtrReg = MI.getOperand(i: 1).getReg();
3071 LLT PtrTy = MRI.getType(Reg: PtrReg);
3072 unsigned AddrSpace = PtrTy.getAddressSpace();
3073
3074 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3075 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3076 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3077 Observer.changingInstr(MI);
3078 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3079 Observer.changedInstr(MI);
3080 return true;
3081 }
3082
3083 if (MI.getOpcode() != AMDGPU::G_LOAD)
3084 return false;
3085
3086 Register ValReg = MI.getOperand(i: 0).getReg();
3087 LLT ValTy = MRI.getType(Reg: ValReg);
3088
3089 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3090 Observer.changingInstr(MI);
3091 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3092 Observer.changedInstr(MI);
3093 return true;
3094 }
3095
3096 MachineMemOperand *MMO = *MI.memoperands_begin();
3097 const unsigned ValSize = ValTy.getSizeInBits();
3098 const LLT MemTy = MMO->getMemoryType();
3099 const Align MemAlign = MMO->getAlign();
3100 const unsigned MemSize = MemTy.getSizeInBits();
3101 const uint64_t AlignInBits = 8 * MemAlign.value();
3102
3103 // Widen non-power-of-2 loads to the alignment if needed
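// e.g. (a sketch) a sufficiently aligned s96 load is widened to an s128 load,
// and the extra bits are dropped again below with G_TRUNC / G_EXTRACT /
// trailing-element deletion depending on the type.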
3104 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3105 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3106
3107 // This was already the correct extending load result type, so just adjust
3108 // the memory type.
3109 if (WideMemSize == ValSize) {
3110 MachineFunction &MF = B.getMF();
3111
3112 MachineMemOperand *WideMMO =
3113 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3114 Observer.changingInstr(MI);
3115 MI.setMemRefs(MF, MemRefs: {WideMMO});
3116 Observer.changedInstr(MI);
3117 return true;
3118 }
3119
3120 // Don't bother handling an edge case that should probably never be produced.
3121 if (ValSize > WideMemSize)
3122 return false;
3123
3124 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3125
3126 Register WideLoad;
3127 if (!WideTy.isVector()) {
3128 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3129 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3130 } else {
3131 // Extract the subvector.
3132
3133 if (isRegisterType(Ty: ValTy)) {
3134 // If this is a case where G_EXTRACT is legal, use it.
3135 // (e.g. <3 x s32> -> <4 x s32>)
3136 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3137 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3138 } else {
3139 // For cases where the widened type isn't a nice register value, unmerge
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3141 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3142 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3143 }
3144 }
3145
3146 MI.eraseFromParent();
3147 return true;
3148 }
3149
3150 return false;
3151}
3152
3153bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3154 MachineInstr &MI) const {
3155 MachineIRBuilder &B = Helper.MIRBuilder;
3156 MachineRegisterInfo &MRI = *B.getMRI();
3157 GISelChangeObserver &Observer = Helper.Observer;
3158
3159 Register DataReg = MI.getOperand(i: 0).getReg();
3160 LLT DataTy = MRI.getType(Reg: DataReg);
3161
3162 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3163 Observer.changingInstr(MI);
3164 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3165 Observer.changedInstr(MI);
3166 return true;
3167 }
3168 return false;
3169}
3170
3171bool AMDGPULegalizerInfo::legalizeFMad(
3172 MachineInstr &MI, MachineRegisterInfo &MRI,
3173 MachineIRBuilder &B) const {
3174 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3175 assert(Ty.isScalar());
3176
3177 MachineFunction &MF = B.getMF();
3178 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3179
3180 // TODO: Always legal with future ftz flag.
3181 // FIXME: Do we need just output?
3182 if (Ty == LLT::float32() &&
3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3184 return true;
3185 if (Ty == LLT::float16() &&
3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3187 return true;
3188
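// Otherwise denormals must be honored and the mad instructions flush them, so
// use the generic expansion (a sketch):
//   fmad x, y, z  ->  fadd (fmul x, y), z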
3189 MachineIRBuilder HelperBuilder(MI);
3190 GISelObserverWrapper DummyObserver;
3191 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3192 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3193}
3194
3195bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3197 Register DstReg = MI.getOperand(i: 0).getReg();
3198 Register PtrReg = MI.getOperand(i: 1).getReg();
3199 Register CmpVal = MI.getOperand(i: 2).getReg();
3200 Register NewVal = MI.getOperand(i: 3).getReg();
3201
3202 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3203 "this should not have been custom lowered");
3204
3205 LLT ValTy = MRI.getType(Reg: CmpVal);
3206 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3207
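// The target cmpxchg pseudo takes its two data operands packed into a single
// vector register, with the new value in element 0 and the compare value in
// element 1, e.g. for 32-bit operands (a sketch):
//   %packed:_(<2 x s32>) = G_BUILD_VECTOR %newval, %cmpval
//   %dst:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %packed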
3208 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3209
3210 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3211 .addDef(RegNo: DstReg)
3212 .addUse(RegNo: PtrReg)
3213 .addUse(RegNo: PackedVal)
3214 .setMemRefs(MI.memoperands());
3215
3216 MI.eraseFromParent();
3217 return true;
3218}
3219
3220/// Return true if it's known that \p Src can never be an f32 denormal value.
3221static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3222 Register Src) {
3223 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3224 switch (DefMI->getOpcode()) {
3225 case TargetOpcode::G_INTRINSIC: {
3226 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3227 case Intrinsic::amdgcn_frexp_mant:
3228 return true;
3229 default:
3230 break;
3231 }
3232
3233 break;
3234 }
3235 case TargetOpcode::G_FFREXP: {
3236 if (DefMI->getOperand(i: 0).getReg() == Src)
3237 return true;
3238 break;
3239 }
3240 case TargetOpcode::G_FPEXT: {
3241 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3242 }
3243 default:
3244 return false;
3245 }
3246
3247 return false;
3248}
3249
3250static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3251 if (Flags & MachineInstr::FmAfn)
3252 return true;
3253 const auto &Options = MF.getTarget().Options;
3254 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3255}
3256
3257static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3258 unsigned Flags) {
3259 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3260 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3261 DenormalMode::PreserveSign;
3262}
3263
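// If the f32 input may be a denormal, pre-scale it into the normal range and
// also return the comparison so the caller can undo the scale on the result
// (a sketch):
//   scaled = x * (x < 0x1.0p-126 ? 0x1.0p+32 : 1.0)
// Returns an empty pair if no scaling is needed.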
3264std::pair<Register, Register>
3265AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3266 unsigned Flags) const {
3267 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3268 return {};
3269
3270 const LLT F32 = LLT::scalar(SizeInBits: 32);
3271 auto SmallestNormal = B.buildFConstant(
3272 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3273 auto IsLtSmallestNormal =
3274 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3275
3276 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3277 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3278 auto ScaleFactor =
3279 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3280 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3281
3282 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3283}
3284
3285bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3286 MachineIRBuilder &B) const {
3287 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3288 // If we have to handle denormals, scale up the input and adjust the result.
3289
3290 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3292
3293 Register Dst = MI.getOperand(i: 0).getReg();
3294 Register Src = MI.getOperand(i: 1).getReg();
3295 LLT Ty = B.getMRI()->getType(Reg: Dst);
3296 unsigned Flags = MI.getFlags();
3297
3298 if (Ty == LLT::scalar(SizeInBits: 16)) {
3299 const LLT F32 = LLT::scalar(SizeInBits: 32);
3300 // Nothing in half is a denormal when promoted to f32.
3301 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3302 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
3303 .addUse(RegNo: Ext.getReg(Idx: 0))
3304 .setMIFlags(Flags);
3305 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3306 MI.eraseFromParent();
3307 return true;
3308 }
3309
3310 assert(Ty == LLT::scalar(32));
3311
3312 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3313 if (!ScaledInput) {
3314 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
3315 .addUse(RegNo: Src)
3316 .setMIFlags(Flags);
3317 MI.eraseFromParent();
3318 return true;
3319 }
3320
3321 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3322 .addUse(RegNo: ScaledInput)
3323 .setMIFlags(Flags);
3324
3325 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3326 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3327 auto ResultOffset =
3328 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3329 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3330
3331 MI.eraseFromParent();
3332 return true;
3333}
3334
3335static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3336 Register Z, unsigned Flags) {
3337 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3338 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3339}
3340
3341bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3342 MachineIRBuilder &B) const {
3343 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3344 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3345
3346 MachineRegisterInfo &MRI = *B.getMRI();
3347 Register Dst = MI.getOperand(i: 0).getReg();
3348 Register X = MI.getOperand(i: 1).getReg();
3349 unsigned Flags = MI.getFlags();
3350 const LLT Ty = MRI.getType(Reg: X);
3351 MachineFunction &MF = B.getMF();
3352
3353 const LLT F32 = LLT::scalar(SizeInBits: 32);
3354 const LLT F16 = LLT::scalar(SizeInBits: 16);
3355
3356 const AMDGPUTargetMachine &TM =
3357 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3358
3359 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn) ||
3360 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3361 if (Ty == F16 && !ST.has16BitInsts()) {
3362 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3363 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3364 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3365 B.buildFPTrunc(Res: Dst, Op: LogVal);
3366 } else {
3367 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3368 }
3369
3370 MI.eraseFromParent();
3371 return true;
3372 }
3373
3374 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3375 if (ScaledInput)
3376 X = ScaledInput;
3377
3378 auto Y =
3379 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);
3380
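// R approximates Y * ln(2) (for log) or Y * log10(2) (for log10), where Y is
// the hardware log2 of the possibly scaled input, computed in a split
// high/low form so the f32 product keeps extra precision.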
3381 Register R;
3382 if (ST.hasFastFMAF32()) {
3383 // c+cc are ln(2)/ln(10) to more than 49 bits
3384 const float c_log10 = 0x1.344134p-2f;
3385 const float cc_log10 = 0x1.09f79ep-26f;
3386
3387 // c + cc is ln(2) to more than 49 bits
3388 const float c_log = 0x1.62e42ep-1f;
3389 const float cc_log = 0x1.efa39ep-25f;
3390
3391 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3392 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3393
3394 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags).getReg(Idx: 0);
3395 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags);
3396 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags);
3397 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags);
3398 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags).getReg(Idx: 0);
3399 } else {
3400 // ch+ct is ln(2)/ln(10) to more than 36 bits
3401 const float ch_log10 = 0x1.344000p-2f;
3402 const float ct_log10 = 0x1.3509f6p-18f;
3403
3404 // ch + ct is ln(2) to more than 36 bits
3405 const float ch_log = 0x1.62e000p-1f;
3406 const float ct_log = 0x1.0bfbe8p-15f;
3407
3408 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3409 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3410
3411 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3412 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3413 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3414 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags);
3415
3416 Register Mad0 =
3417 getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags);
3418 Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags);
3419 R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags);
3420 }
3421
3422 const bool IsFiniteOnly =
3423 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3424 (MI.getFlag(Flag: MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3425
3426 if (!IsFiniteOnly) {
3427 // Expand isfinite(x) => fabs(x) < inf
3428 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3429 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3430 auto IsFinite =
3431 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3432 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
3433 }
3434
3435 if (ScaledInput) {
3436 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3437 auto ShiftK =
3438 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3439 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3440 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3441 } else {
3442 B.buildCopy(Res: Dst, Op: R);
3443 }
3444
3445 MI.eraseFromParent();
3446 return true;
3447}
3448
3449bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3450 Register Src, bool IsLog10,
3451 unsigned Flags) const {
3452 const double Log2BaseInverted =
3453 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3454
3455 LLT Ty = B.getMRI()->getType(Reg: Dst);
3456
3457 if (Ty == LLT::scalar(SizeInBits: 32)) {
3458 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3459 if (ScaledInput) {
3460 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3461 .addUse(RegNo: Src)
3462 .setMIFlags(Flags);
3463 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3464 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3465 auto ResultOffset =
3466 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3467 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3468
3469 if (ST.hasFastFMAF32())
3470 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3471 else {
3472 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3473 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3474 }
3475
3476 return true;
3477 }
3478 }
3479
3480 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3481 ? B.buildFLog2(Dst: Ty, Src, Flags)
3482 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3483 .addUse(RegNo: Src)
3484 .setMIFlags(Flags);
3485 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3486 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3487 return true;
3488}
3489
3490bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3491 MachineIRBuilder &B) const {
3492 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3493 // If we have to handle denormals, scale up the input and adjust the result.
3494
3495 Register Dst = MI.getOperand(i: 0).getReg();
3496 Register Src = MI.getOperand(i: 1).getReg();
3497 unsigned Flags = MI.getFlags();
3498 LLT Ty = B.getMRI()->getType(Reg: Dst);
3499 const LLT F16 = LLT::scalar(SizeInBits: 16);
3500 const LLT F32 = LLT::scalar(SizeInBits: 32);
3501
3502 if (Ty == F16) {
3503 // Nothing in half is a denormal when promoted to f32.
3504 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3505 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
3506 .addUse(RegNo: Ext.getReg(Idx: 0))
3507 .setMIFlags(Flags);
3508 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3509 MI.eraseFromParent();
3510 return true;
3511 }
3512
3513 assert(Ty == F32);
3514
3515 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3516 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3517 .addUse(RegNo: Src)
3518 .setMIFlags(Flags);
3519 MI.eraseFromParent();
3520 return true;
3521 }
3522
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3525
3526 // -nextafter(128.0, -1)
3527 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3528 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3529 Op1: RangeCheckConst, Flags);
3530
3531 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3532 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3533 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3534 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3535
3536 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3537 .addUse(RegNo: AddInput.getReg(Idx: 0))
3538 .setMIFlags(Flags);
3539
3540 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3541 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3542 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3543 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3544 MI.eraseFromParent();
3545 return true;
3546}
3547
3548bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3549 Register X, unsigned Flags) const {
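// Fast expansion: exp(x) ~= exp2(x * log2(e)). When f32 denormal results must
// be preserved, inputs whose result would be denormal (roughly x < ln(2^-126))
// are shifted up by 64 first and the result is rescaled afterwards (a sketch):
//   exp(x) = exp2((x + 64) * log2(e)) * 0x1.969d48p-93f   // ~= exp(-64)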
3550 LLT Ty = B.getMRI()->getType(Reg: Dst);
3551 LLT F32 = LLT::scalar(SizeInBits: 32);
3552
3553 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3554 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3555 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Log2E, Flags);
3556
3557 if (Ty == F32) {
3558 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3559 .addUse(RegNo: Mul.getReg(Idx: 0))
3560 .setMIFlags(Flags);
3561 } else {
3562 B.buildFExp2(Dst, Src: Mul.getReg(Idx: 0), Flags);
3563 }
3564
3565 return true;
3566 }
3567
3568 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3569 auto NeedsScaling =
3570 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3571 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3572 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3573 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3574
3575 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3576 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3577
3578 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3579 .addUse(RegNo: ExpInput.getReg(Idx: 0))
3580 .setMIFlags(Flags);
3581
3582 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3583 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3584 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3585 return true;
3586}
3587
3588bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3589 MachineIRBuilder &B) const {
3590 Register Dst = MI.getOperand(i: 0).getReg();
3591 Register X = MI.getOperand(i: 1).getReg();
3592 const unsigned Flags = MI.getFlags();
3593 MachineFunction &MF = B.getMF();
3594 MachineRegisterInfo &MRI = *B.getMRI();
3595 LLT Ty = MRI.getType(Reg: Dst);
3596 const LLT F16 = LLT::scalar(SizeInBits: 16);
3597 const LLT F32 = LLT::scalar(SizeInBits: 32);
3598 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3599
3600 if (Ty == F16) {
3601 // v_exp_f16 (fmul x, log2e)
3602 if (allowApproxFunc(MF, Flags)) {
3603 // TODO: Does this really require fast?
3604 legalizeFExpUnsafe(B, Dst, X, Flags);
3605 MI.eraseFromParent();
3606 return true;
3607 }
3608
3609 // exp(f16 x) ->
3610 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3611
3612 // Nothing in half is a denormal when promoted to f32.
3613 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3614 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3615 legalizeFExpUnsafe(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags);
3616 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3617 MI.eraseFromParent();
3618 return true;
3619 }
3620
3621 assert(Ty == F32);
3622
3623 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3624 // library behavior. Also, is known-not-daz source sufficient?
3625 if (allowApproxFunc(MF, Flags)) {
3626 legalizeFExpUnsafe(B, Dst, X, Flags);
3627 MI.eraseFromParent();
3628 return true;
3629 }
3630
3631 // Algorithm:
3632 //
3633 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3634 //
3635 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3636 // n = 64*m + j, 0 <= j < 64
3637 //
3638 // e^x = 2^((64*m + j + f)/64)
3639 // = (2^m) * (2^(j/64)) * 2^(f/64)
3640 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3641 //
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3644 //
3645 // e^x = (2^m) * (2^(j/64)) * e^r
3646 //
3647 // (2^(j/64)) is precomputed
3648 //
3649 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3650 // e^r = 1 + q
3651 //
3652 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3653 //
3654 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3655 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3656 Register PH, PL;
3657
3658 if (ST.hasFastFMAF32()) {
3659 const float c_exp = numbers::log2ef;
3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3661 const float c_exp10 = 0x1.a934f0p+1f;
3662 const float cc_exp10 = 0x1.2f346ep-24f;
3663
3664 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3665 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3666 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3667 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3668
3669 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3670 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3671 } else {
3672 const float ch_exp = 0x1.714000p+0f;
3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3674
3675 const float ch_exp10 = 0x1.a92000p+1f;
3676 const float cl_exp10 = 0x1.4f0978p-11f;
3677
3678 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3679 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3680 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3681
3682 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3683 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3684
3685 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3686 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3687
3688 Register Mad0 =
3689 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3690 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3691 }
3692
3693 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3694
3695 // It is unsafe to contract this fsub into the PH multiply.
3696 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3697 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3698 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3699
3700 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3701 .addUse(RegNo: A.getReg(Idx: 0))
3702 .setMIFlags(Flags);
3703 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3704
3705 auto UnderflowCheckConst =
3706 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3707 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3708 auto Underflow =
3709 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3710
3711 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3712
3713 const auto &Options = MF.getTarget().Options;
3714
3715 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3716 auto OverflowCheckConst =
3717 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3718
3719 auto Overflow =
3720 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3721 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3722 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3723 }
3724
3725 B.buildCopy(Res: Dst, Op: R);
3726 MI.eraseFromParent();
3727 return true;
3728}
3729
3730bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3731 MachineIRBuilder &B) const {
3732 Register Dst = MI.getOperand(i: 0).getReg();
3733 Register Src0 = MI.getOperand(i: 1).getReg();
3734 Register Src1 = MI.getOperand(i: 2).getReg();
3735 unsigned Flags = MI.getFlags();
3736 LLT Ty = B.getMRI()->getType(Reg: Dst);
3737 const LLT F16 = LLT::float16();
3738 const LLT F32 = LLT::float32();
3739
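// pow is expanded as exp2(y * log2(x)). The legacy multiply, which defines
// 0 * anything = 0, keeps pow(x, 0) == 1 even when log2(x) is an infinity or
// NaN (a sketch):
//   pow(x, y) = exp2(fmul_legacy(log2(x), y))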
3740 if (Ty == F32) {
3741 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
3742 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3743 .addUse(RegNo: Log.getReg(Idx: 0))
3744 .addUse(RegNo: Src1)
3745 .setMIFlags(Flags);
3746 B.buildFExp2(Dst, Src: Mul, Flags);
3747 } else if (Ty == F16) {
3748 // There's no f16 fmul_legacy, so we need to convert for it.
3749 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
3750 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
3751 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
3752 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3753 .addUse(RegNo: Ext0.getReg(Idx: 0))
3754 .addUse(RegNo: Ext1.getReg(Idx: 0))
3755 .setMIFlags(Flags);
3756 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
3757 } else
3758 return false;
3759
3760 MI.eraseFromParent();
3761 return true;
3762}
3763
3764// Find a source register, ignoring any possible source modifiers.
3765static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3766 Register ModSrc = OrigSrc;
3767 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
3768 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
3769 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3770 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3771 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3772 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3773 return ModSrc;
3774}
3775
3776bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3777 MachineRegisterInfo &MRI,
3778 MachineIRBuilder &B) const {
3779
3780 const LLT S1 = LLT::scalar(SizeInBits: 1);
3781 const LLT F64 = LLT::float64();
3782 Register Dst = MI.getOperand(i: 0).getReg();
3783 Register OrigSrc = MI.getOperand(i: 1).getReg();
3784 unsigned Flags = MI.getFlags();
3785 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3786 "this should not have been custom lowered");
3787
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3789 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3790 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3791 // V_FRACT bug is:
3792 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3793 //
3794 // Convert floor(x) to (x - fract(x))
3795
3796 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
3797 .addUse(RegNo: OrigSrc)
3798 .setMIFlags(Flags);
3799
3800 // Give source modifier matching some assistance before obscuring a foldable
3801 // pattern.
3802
3803 // TODO: We can avoid the neg on the fract? The input sign to fract
3804 // shouldn't matter?
3805 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3806
3807 auto Const =
3808 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
3809
3810 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
3811
3812 // We don't need to concern ourselves with the snan handling difference, so
3813 // use the one which will directly select.
3814 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3815 if (MFI->getMode().IEEE)
3816 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
3817 else
3818 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
3819
3820 Register CorrectedFract = Min;
3821 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
3822 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
3823 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
3824 }
3825
3826 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
3827 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
3828
3829 MI.eraseFromParent();
3830 return true;
3831}
3832
3833// Turn an illegal packed v2s16 build vector into bit operations.
3834// TODO: This should probably be a bitcast action in LegalizerHelper.
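// e.g. (a sketch)
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// is rewritten as
//   %m:_(s32)       = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)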
3835bool AMDGPULegalizerInfo::legalizeBuildVector(
3836 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3837 Register Dst = MI.getOperand(i: 0).getReg();
3838 const LLT S32 = LLT::scalar(SizeInBits: 32);
3839 const LLT S16 = LLT::scalar(SizeInBits: 16);
3840 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3841
3842 Register Src0 = MI.getOperand(i: 1).getReg();
3843 Register Src1 = MI.getOperand(i: 2).getReg();
3844
3845 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3846 assert(MRI.getType(Src0) == S32);
3847 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
3848 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
3849 }
3850
3851 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
3852 B.buildBitcast(Dst, Src: Merge);
3853
3854 MI.eraseFromParent();
3855 return true;
3856}
3857
3858// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3859//
3860// Source and accumulation registers must all be 32 bits wide.
3861//
3862// TODO: When the multiply is uniform, we should produce a code sequence
3863// that is better suited to instruction selection on the SALU. Instead of
3864// the outer loop going over parts of the result, the outer loop should go
3865// over parts of one of the factors. This should result in instruction
3866// selection that makes full use of S_ADDC_U32 instructions.
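// For example, a 64 x 64 -> 64 bit multiply (two 32-bit parts per operand)
// accumulates its partial products roughly as follows, with carries beyond the
// truncated result discarded (a sketch):
//   Accum[0] = lo(Src0[0]*Src1[0])
//   Accum[1] = hi(Src0[0]*Src1[0]) + lo(Src0[0]*Src1[1]) + lo(Src0[1]*Src1[0])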
3867void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3868 MutableArrayRef<Register> Accum,
3869 ArrayRef<Register> Src0,
3870 ArrayRef<Register> Src1,
3871 bool UsePartialMad64_32,
3872 bool SeparateOddAlignedProducts) const {
3873 // Use (possibly empty) vectors of S1 registers to represent the set of
3874 // carries from one pair of positions to the next.
3875 using Carry = SmallVector<Register, 2>;
3876
3877 MachineIRBuilder &B = Helper.MIRBuilder;
3878 GISelKnownBits &KB = *Helper.getKnownBits();
3879
3880 const LLT S1 = LLT::scalar(SizeInBits: 1);
3881 const LLT S32 = LLT::scalar(SizeInBits: 32);
3882 const LLT S64 = LLT::scalar(SizeInBits: 64);
3883
3884 Register Zero32;
3885 Register Zero64;
3886
3887 auto getZero32 = [&]() -> Register {
3888 if (!Zero32)
3889 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
3890 return Zero32;
3891 };
3892 auto getZero64 = [&]() -> Register {
3893 if (!Zero64)
3894 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
3895 return Zero64;
3896 };
3897
3898 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3899 for (unsigned i = 0; i < Src0.size(); ++i) {
3900 Src0KnownZeros.push_back(Elt: KB.getKnownBits(R: Src0[i]).isZero());
3901 Src1KnownZeros.push_back(Elt: KB.getKnownBits(R: Src1[i]).isZero());
3902 }
3903
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3905 // in-place.
3906 //
3907 // Returns the carry-out, which is a single S1 register or null.
3908 auto mergeCarry =
3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3910 if (CarryIn.empty())
3911 return Register();
3912
3913 bool HaveCarryOut = true;
3914 Register CarryAccum;
3915 if (CarryIn.size() == 1) {
3916 if (!LocalAccum) {
3917 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3918 return Register();
3919 }
3920
3921 CarryAccum = getZero32();
3922 } else {
3923 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3924 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3925 CarryAccum =
3926 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
3927 .getReg(Idx: 0);
3928 }
3929
3930 if (!LocalAccum) {
3931 LocalAccum = getZero32();
3932 HaveCarryOut = false;
3933 }
3934 }
3935
3936 auto Add =
3937 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
3938 LocalAccum = Add.getReg(Idx: 0);
3939 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
3940 };
3941
3942 // Build a multiply-add chain to compute
3943 //
3944 // LocalAccum + (partial products at DstIndex)
3945 // + (opportunistic subset of CarryIn)
3946 //
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3949 //
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3952 auto buildMadChain =
3953 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3954 -> Carry {
3955 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3956 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3957
3958 Carry CarryOut;
3959 unsigned j0 = 0;
3960
3961 // Use plain 32-bit multiplication for the most significant part of the
3962 // result by default.
3963 if (LocalAccum.size() == 1 &&
3964 (!UsePartialMad64_32 || !CarryIn.empty())) {
3965 do {
3966 // Skip multiplication if one of the operands is 0
3967 unsigned j1 = DstIndex - j0;
3968 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3969 ++j0;
3970 continue;
3971 }
3972 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
3973 if (!LocalAccum[0] || KB.getKnownBits(R: LocalAccum[0]).isZero()) {
3974 LocalAccum[0] = Mul.getReg(Idx: 0);
3975 } else {
3976 if (CarryIn.empty()) {
3977 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
3978 } else {
3979 LocalAccum[0] =
3980 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
3981 .getReg(Idx: 0);
3982 CarryIn.pop_back();
3983 }
3984 }
3985 ++j0;
3986 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3987 }
3988
3989 // Build full 64-bit multiplies.
3990 if (j0 <= DstIndex) {
3991 bool HaveSmallAccum = false;
3992 Register Tmp;
3993
3994 if (LocalAccum[0]) {
3995 if (LocalAccum.size() == 1) {
3996 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
3997 HaveSmallAccum = true;
3998 } else if (LocalAccum[1]) {
3999 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
4000 HaveSmallAccum = false;
4001 } else {
4002 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4003 HaveSmallAccum = true;
4004 }
4005 } else {
4006 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4007 Tmp = getZero64();
4008 HaveSmallAccum = true;
4009 }
4010
4011 do {
4012 unsigned j1 = DstIndex - j0;
4013 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4014 ++j0;
4015 continue;
4016 }
4017 auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
4018 SrcOps: {Src0[j0], Src1[j1], Tmp});
4019 Tmp = Mad.getReg(Idx: 0);
4020 if (!HaveSmallAccum)
4021 CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
4022 HaveSmallAccum = false;
4023
4024 ++j0;
4025 } while (j0 <= DstIndex);
4026
4027 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
4028 LocalAccum[0] = Unmerge.getReg(Idx: 0);
4029 if (LocalAccum.size() > 1)
4030 LocalAccum[1] = Unmerge.getReg(Idx: 1);
4031 }
4032
4033 return CarryOut;
4034 };
4035
4036 // Outer multiply loop, iterating over destination parts from least
4037 // significant to most significant parts.
4038 //
4039 // The columns of the following diagram correspond to the destination parts
4040 // affected by one iteration of the outer loop (ignoring boundary
4041 // conditions).
4042 //
4043 // Dest index relative to 2 * i: 1 0 -1
4044 // ------
4045 // Carries from previous iteration: e o
4046 // Even-aligned partial product sum: E E .
4047 // Odd-aligned partial product sum: O O
4048 //
4049 // 'o' is OddCarry, 'e' is EvenCarry.
4050 // EE and OO are computed from partial products via buildMadChain and use
4051 // accumulation where possible and appropriate.
4052 //
4053 Register SeparateOddCarry;
4054 Carry EvenCarry;
4055 Carry OddCarry;
4056
4057 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4058 Carry OddCarryIn = std::move(OddCarry);
4059 Carry EvenCarryIn = std::move(EvenCarry);
4060 OddCarry.clear();
4061 EvenCarry.clear();
4062
4063 // Partial products at offset 2 * i.
4064 if (2 * i < Accum.size()) {
4065 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4066 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4067 }
4068
4069 // Partial products at offset 2 * i - 1.
4070 if (i > 0) {
4071 if (!SeparateOddAlignedProducts) {
4072 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4074 } else {
4075 bool IsHighest = 2 * i >= Accum.size();
4076 Register SeparateOddOut[2];
4077 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4078 .take_front(N: IsHighest ? 1 : 2);
4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4080
4081 MachineInstr *Lo;
4082
4083 if (i == 1) {
4084 if (!IsHighest)
4085 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4086 else
4087 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4088 } else {
4089 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4090 CarryIn: SeparateOddCarry);
4091 }
4092 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4093
4094 if (!IsHighest) {
4095 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4096 CarryIn: Lo->getOperand(i: 1).getReg());
4097 Accum[2 * i] = Hi.getReg(Idx: 0);
4098 SeparateOddCarry = Hi.getReg(Idx: 1);
4099 }
4100 }
4101 }
4102
4103 // Add in the carries from the previous iteration
4104 if (i > 0) {
4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4106 EvenCarryIn.push_back(Elt: CarryOut);
4107
4108 if (2 * i < Accum.size()) {
4109 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4110 OddCarry.push_back(Elt: CarryOut);
4111 }
4112 }
4113 }
4114}
4115
4116// Custom narrowing of wide multiplies using wide multiply-add instructions.
4117//
4118// TODO: If the multiply is followed by an addition, we should attempt to
4119// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4120bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4121 MachineInstr &MI) const {
4122 assert(ST.hasMad64_32());
4123 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4124
4125 MachineIRBuilder &B = Helper.MIRBuilder;
4126 MachineRegisterInfo &MRI = *B.getMRI();
4127
4128 Register DstReg = MI.getOperand(i: 0).getReg();
4129 Register Src0 = MI.getOperand(i: 1).getReg();
4130 Register Src1 = MI.getOperand(i: 2).getReg();
4131
4132 LLT Ty = MRI.getType(Reg: DstReg);
4133 assert(Ty.isScalar());
4134
4135 unsigned Size = Ty.getSizeInBits();
4136 unsigned NumParts = Size / 32;
4137 assert((Size % 32) == 0);
4138 assert(NumParts >= 2);
4139
4140 // Whether to use MAD_64_32 for partial products whose high half is
4141 // discarded. This avoids some ADD instructions but risks false dependency
4142 // stalls on some subtargets in some cases.
4143 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4144
4145 // Whether to compute odd-aligned partial products separately. This is
4146 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4147 // in an even-aligned VGPR.
4148 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4149
4150 LLT S32 = LLT::scalar(SizeInBits: 32);
4151 SmallVector<Register, 2> Src0Parts, Src1Parts;
4152 for (unsigned i = 0; i < NumParts; ++i) {
4153 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4154 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4155 }
4156 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4157 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4158
4159 SmallVector<Register, 2> AccumRegs(NumParts);
4160 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4161 SeparateOddAlignedProducts);
4162
4163 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4164 MI.eraseFromParent();
4165 return true;
4166}
4167
4168// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170// case with a single min instruction instead of a compare+select.
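// e.g. (a sketch) for a 32-bit G_CTLZ:
//   %t:_(s32) = G_AMDGPU_FFBH_U32 %src   ; yields -1 (all ones) for a zero input
//   %d:_(s32) = G_UMIN %t, 32            ; so the zero case becomes 32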
4171bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4172 MachineRegisterInfo &MRI,
4173 MachineIRBuilder &B) const {
4174 Register Dst = MI.getOperand(i: 0).getReg();
4175 Register Src = MI.getOperand(i: 1).getReg();
4176 LLT DstTy = MRI.getType(Reg: Dst);
4177 LLT SrcTy = MRI.getType(Reg: Src);
4178
4179 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4180 ? AMDGPU::G_AMDGPU_FFBH_U32
4181 : AMDGPU::G_AMDGPU_FFBL_B32;
4182 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4183 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4184
4185 MI.eraseFromParent();
4186 return true;
4187}
4188
4189bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4190 MachineRegisterInfo &MRI,
4191 MachineIRBuilder &B) const {
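// For sub-32-bit sources, count on a 32-bit register with the value shifted
// into the high bits so the anyext'd garbage cannot affect the count
// (a sketch, for s16):
//   ctlz_zero_undef(x) = trunc(ffbh_u32(anyext(x) << 16))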
4192 Register Dst = MI.getOperand(i: 0).getReg();
4193 Register Src = MI.getOperand(i: 1).getReg();
4194 LLT SrcTy = MRI.getType(Reg: Src);
4195 TypeSize NumBits = SrcTy.getSizeInBits();
4196
4197 assert(NumBits < 32u);
4198
4199 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4200 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4201 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4202 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4203 B.buildTrunc(Res: Dst, Op: Ctlz);
4204 MI.eraseFromParent();
4205 return true;
4206}
4207
4208// Check that this is a G_XOR x, -1
4209static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4210 if (MI.getOpcode() != TargetOpcode::G_XOR)
4211 return false;
4212 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4213 return ConstVal && *ConstVal == -1;
4214}
4215
4216// Return the use branch instruction, or null if the usage is invalid.
4217static MachineInstr *
4218verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4219 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4220 Register CondDef = MI.getOperand(i: 0).getReg();
4221 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4222 return nullptr;
4223
4224 MachineBasicBlock *Parent = MI.getParent();
4225 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4226
4227 if (isNot(MRI, MI: *UseMI)) {
4228 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4229 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4230 return nullptr;
4231
4232 // We're deleting the def of this value, so we need to remove it.
4233 eraseInstr(MI&: *UseMI, MRI);
4234
4235 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4236 Negated = true;
4237 }
4238
4239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4240 return nullptr;
4241
4242 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4244 if (Next == Parent->end()) {
4245 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4247 return nullptr;
4248 UncondBrTarget = &*NextMBB;
4249 } else {
4250 if (Next->getOpcode() != AMDGPU::G_BR)
4251 return nullptr;
4252 Br = &*Next;
4253 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4254 }
4255
4256 return UseMI;
4257}
4258
4259bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4260 const ArgDescriptor *Arg,
4261 const TargetRegisterClass *ArgRC,
4262 LLT ArgTy) const {
4263 MCRegister SrcReg = Arg->getRegister();
4264 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4265 assert(DstReg.isVirtual() && "Virtual register expected");
4266
4267 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4268 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4269 if (Arg->isMasked()) {
4270 // TODO: Should we try to emit this once in the entry block?
4271 const LLT S32 = LLT::scalar(SizeInBits: 32);
4272 const unsigned Mask = Arg->getMask();
4273 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4274
4275 Register AndMaskSrc = LiveIn;
4276
4277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4278 // 0.
4279 if (Shift != 0) {
4280 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4281 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4282 }
4283
4284 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4285 } else {
4286 B.buildCopy(Res: DstReg, Op: LiveIn);
4287 }
4288
4289 return true;
4290}
4291
4292bool AMDGPULegalizerInfo::loadInputValue(
4293 Register DstReg, MachineIRBuilder &B,
4294 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4295 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4296 const ArgDescriptor *Arg = nullptr;
4297 const TargetRegisterClass *ArgRC;
4298 LLT ArgTy;
4299
4300 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4301 const ArgDescriptor WorkGroupIDX =
4302 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
4303 // If GridZ is not programmed in an entry function then the hardware will set
4304 // it to all zeros, so there is no need to mask the GridY value in the low
4305 // order bits.
4306 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4307 Reg: AMDGPU::TTMP7,
4308 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4309 const ArgDescriptor WorkGroupIDZ =
4310 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
4311 if (ST.hasArchitectedSGPRs() &&
4312 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4313 switch (ArgType) {
4314 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4315 Arg = &WorkGroupIDX;
4316 ArgRC = &AMDGPU::SReg_32RegClass;
4317 ArgTy = LLT::scalar(SizeInBits: 32);
4318 break;
4319 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4320 Arg = &WorkGroupIDY;
4321 ArgRC = &AMDGPU::SReg_32RegClass;
4322 ArgTy = LLT::scalar(SizeInBits: 32);
4323 break;
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4325 Arg = &WorkGroupIDZ;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4327 ArgTy = LLT::scalar(SizeInBits: 32);
4328 break;
4329 default:
4330 break;
4331 }
4332 }
4333
4334 if (!Arg)
4335 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4336
4337 if (!Arg) {
4338 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4339 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4340 // case the pointer argument may be missing and we use null.
4341 B.buildConstant(Res: DstReg, Val: 0);
4342 return true;
4343 }
4344
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4346 // attributes uses the corresponding intrinsic.
4347 B.buildUndef(Res: DstReg);
4348 return true;
4349 }
4350
4351 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4352 return false; // TODO: Handle these
4353 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4354}
4355
4356bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4357 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4358 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4359 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4360 return false;
4361
4362 MI.eraseFromParent();
4363 return true;
4364}
4365
4366static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4367 int64_t C) {
4368 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4369 MI.eraseFromParent();
4370 return true;
4371}
4372
4373bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4374 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4375 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4376 unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
4377 if (MaxID == 0)
4378 return replaceWithConstant(B, MI, C: 0);
4379
4380 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4381 const ArgDescriptor *Arg;
4382 const TargetRegisterClass *ArgRC;
4383 LLT ArgTy;
4384 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4385
4386 Register DstReg = MI.getOperand(i: 0).getReg();
4387 if (!Arg) {
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4389 // attributes uses the corresponding intrinsic.
4390 B.buildUndef(Res: DstReg);
4391 MI.eraseFromParent();
4392 return true;
4393 }
4394
4395 if (Arg->isMasked()) {
4396 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4397 // masking operations anyway.
4398 //
4399 // TODO: We could assert the top bit is 0 for the source copy.
4400 if (!loadInputValue(DstReg, B, ArgType))
4401 return false;
4402 } else {
4403 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4404 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4405 return false;
4406 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4407 }
4408
4409 MI.eraseFromParent();
4410 return true;
4411}
4412
4413Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4414 int64_t Offset) const {
4415 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4417
4418 // TODO: If we passed in the base kernel offset we could have a better
4419 // alignment than 4, but we don't really need it.
4420 if (!loadInputValue(DstReg: KernArgReg, B,
4421 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4422 llvm_unreachable("failed to find kernarg segment ptr");
4423
4424 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4425 // TODO: Should get nuw
4426 return B.buildPtrAdd(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4427}
4428
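// A rough sketch of what the helpers above and below combine to emit for a
// 32-bit parameter at byte offset `Offset` (register names are illustrative):
//   %kernarg:_(p4) = <live-in kernarg segment pointer>
//   %addr:_(p4)    = G_PTR_ADD %kernarg, Offset
//   %val:_(s32)    = G_LOAD %addr   ; dereferenceable, invariant, align 4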
4429/// Legalize a value that's loaded from kernel arguments. This is only used by
4430/// legacy intrinsics.
4431bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4432 MachineIRBuilder &B,
4433 uint64_t Offset,
4434 Align Alignment) const {
4435 Register DstReg = MI.getOperand(i: 0).getReg();
4436
4437 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4438 "unexpected kernarg parameter type");
4439
4440 Register Ptr = getKernargParameterPtr(B, Offset);
4441 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4442 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo, Alignment: Align(4),
4443 MMOFlags: MachineMemOperand::MODereferenceable |
4444 MachineMemOperand::MOInvariant);
4445 MI.eraseFromParent();
4446 return true;
4447}
4448
4449bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4450 MachineRegisterInfo &MRI,
4451 MachineIRBuilder &B) const {
4452 Register Dst = MI.getOperand(i: 0).getReg();
4453 LLT DstTy = MRI.getType(Reg: Dst);
4454 LLT S16 = LLT::scalar(SizeInBits: 16);
4455 LLT S32 = LLT::scalar(SizeInBits: 32);
4456 LLT S64 = LLT::scalar(SizeInBits: 64);
4457
4458 if (DstTy == S16)
4459 return legalizeFDIV16(MI, MRI, B);
4460 if (DstTy == S32)
4461 return legalizeFDIV32(MI, MRI, B);
4462 if (DstTy == S64)
4463 return legalizeFDIV64(MI, MRI, B);
4464
4465 return false;
4466}
4467
4468void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4469 Register DstDivReg,
4470 Register DstRemReg,
4471 Register X,
4472 Register Y) const {
4473 const LLT S1 = LLT::scalar(SizeInBits: 1);
4474 const LLT S32 = LLT::scalar(SizeInBits: 32);
4475
4476 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4477 // algorithm used here.
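  // In outline (a sketch of the math, not the exact instruction sequence):
  //   z ~= 2^32 / y                      ; from v_rcp_iflag_f32 plus scaling
  //   z += umulh(z, z * -y)              ; one Newton-Raphson style refinement
  //   q  = umulh(x, z); r = x - q * y    ; initial quotient/remainder estimate
  //   if (r >= y) { q += 1; r -= y; }    ; applied twice below, via selects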
4478
4479 // Initial estimate of inv(y).
4480 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4481 auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
4482 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4483 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4484 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4485
4486 // One round of UNR (unsigned Newton-Raphson) refinement.
4487 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4488 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4489 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4490
4491 // Quotient/remainder estimate.
4492 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4493 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4494
4495 // First quotient/remainder refinement.
4496 auto One = B.buildConstant(Res: S32, Val: 1);
4497 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4498 if (DstDivReg)
4499 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4500 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4501
4502 // Second quotient/remainder refinement.
4503 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4504 if (DstDivReg)
4505 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4506
4507 if (DstRemReg)
4508 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4509}
4510
4511// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4512//
4513// Return lo, hi of result
4514//
4515// %cvt.lo = G_UITOFP Val.lo
4516// %cvt.hi = G_UITOFP Val.hi
4517// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4518// %rcp = G_AMDGPU_RCP_IFLAG %mad
4519// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4520// %mul2 = G_FMUL %mul1, 2**(-32)
4521// %trunc = G_INTRINSIC_TRUNC %mul2
4522// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4523// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4524static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4525 Register Val) {
4526 const LLT S32 = LLT::scalar(SizeInBits: 32);
4527 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4528
4529 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4530 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4531
4532 auto Mad = B.buildFMAD(
4533 Dst: S32, Src0: CvtHi, // 2**32
4534 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4535
4536 auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
4537 auto Mul1 = B.buildFMul(
4538 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4539
4540 // 2**(-32)
4541 auto Mul2 = B.buildFMul(
4542 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4543 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4544
4545 // -(2**32)
4546 auto Mad2 = B.buildFMAD(
4547 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4548 Src2: Mul1);
4549
4550 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4551 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4552
4553 return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
4554}
4555
4556void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4557 Register DstDivReg,
4558 Register DstRemReg,
4559 Register Numer,
4560 Register Denom) const {
4561 const LLT S32 = LLT::scalar(SizeInBits: 32);
4562 const LLT S64 = LLT::scalar(SizeInBits: 64);
4563 const LLT S1 = LLT::scalar(SizeInBits: 1);
4564 Register RcpLo, RcpHi;
4565
4566 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4567
4568 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4569
4570 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4571 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4572
4573 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4574 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4575
4576 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4577 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4578 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4579
4580 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4581 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4582 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4583
4584 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4585 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
4586 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
4587 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
4588 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
4589
4590 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
4591 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
4592 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
4593 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
4594
4595 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
4596 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
4597 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
4598
4599 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
4600 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
4601 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
4602 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
4603 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
4604 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
4605 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4606 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
4607 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
4608
4609 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
4610 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
4611 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
4612
4613 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4614 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
4615
4616 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
4617 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
4618
4619 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4620 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
4621
4622 // TODO: Here and below, portions of the code could be enclosed in if/endif
4623 // blocks. Currently the control flow is unconditional and we use 4 selects
4624 // after the potential endif in place of PHIs.
4625
4626 // if C3 != 0 ...
4627 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
4628 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4629 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
4630 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
4631
4632 auto One64 = B.buildConstant(Res: S64, Val: 1);
4633 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
4634
4635 auto C4 =
4636 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
4637 auto C5 =
4638 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
4639 auto C6 = B.buildSelect(
4640 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
4641
4642 // if (C6 != 0)
4643 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
4644 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
4645
4646 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
4647 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
4648 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
4649
4650 // endif C6
4651 // endif C3
4652
4653 if (DstDivReg) {
4654 auto Sel1 = B.buildSelect(
4655 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
4656 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4657 Op0: Sel1, Op1: MulHi3);
4658 }
4659
4660 if (DstRemReg) {
4661 auto Sel2 = B.buildSelect(
4662 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
4663 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4664 Op0: Sel2, Op1: Sub1);
4665 }
4666}
4667
4668bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4669 MachineRegisterInfo &MRI,
4670 MachineIRBuilder &B) const {
4671 Register DstDivReg, DstRemReg;
4672 switch (MI.getOpcode()) {
4673 default:
4674 llvm_unreachable("Unexpected opcode!");
4675 case AMDGPU::G_UDIV: {
4676 DstDivReg = MI.getOperand(i: 0).getReg();
4677 break;
4678 }
4679 case AMDGPU::G_UREM: {
4680 DstRemReg = MI.getOperand(i: 0).getReg();
4681 break;
4682 }
4683 case AMDGPU::G_UDIVREM: {
4684 DstDivReg = MI.getOperand(i: 0).getReg();
4685 DstRemReg = MI.getOperand(i: 1).getReg();
4686 break;
4687 }
4688 }
4689
4690 const LLT S64 = LLT::scalar(SizeInBits: 64);
4691 const LLT S32 = LLT::scalar(SizeInBits: 32);
4692 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4693 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
4694 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4695 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4696
4697 if (Ty == S32)
4698 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
4699 else if (Ty == S64)
4700 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
4701 else
4702 return false;
4703
4704 MI.eraseFromParent();
4705 return true;
4706}
4707
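// Signed div/rem is reduced to the unsigned expansion above. A sketch of the
// identity used (illustrative, for two's complement values):
//   sign_x = x >> (bits - 1)             ; all-ones if negative, else zero
//   |x|    = (x + sign_x) ^ sign_x
//   q      = negate(|x| u/ |y|) iff sign_x != sign_y
//   r      = negate(|x| u% |y|) iff sign_x != 0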
4708bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4709 MachineRegisterInfo &MRI,
4710 MachineIRBuilder &B) const {
4711 const LLT S64 = LLT::scalar(SizeInBits: 64);
4712 const LLT S32 = LLT::scalar(SizeInBits: 32);
4713
4714 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4715 if (Ty != S32 && Ty != S64)
4716 return false;
4717
4718 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4719 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
4720 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4721
4722 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
4723 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
4724 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
4725
4726 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4727 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4728
4729 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4730 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4731
4732 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4733 switch (MI.getOpcode()) {
4734 default:
4735 llvm_unreachable("Unexpected opcode!");
4736 case AMDGPU::G_SDIV: {
4737 DstDivReg = MI.getOperand(i: 0).getReg();
4738 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4739 break;
4740 }
4741 case AMDGPU::G_SREM: {
4742 DstRemReg = MI.getOperand(i: 0).getReg();
4743 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4744 break;
4745 }
4746 case AMDGPU::G_SDIVREM: {
4747 DstDivReg = MI.getOperand(i: 0).getReg();
4748 DstRemReg = MI.getOperand(i: 1).getReg();
4749 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4750 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4751 break;
4752 }
4753 }
4754
4755 if (Ty == S32)
4756 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
4757 else
4758 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
4759
4760 if (DstDivReg) {
4761 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
4762 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
4763 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
4764 }
4765
4766 if (DstRemReg) {
4767 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
4768 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
4769 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
4770 }
4771
4772 MI.eraseFromParent();
4773 return true;
4774}
4775
4776bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4777 MachineRegisterInfo &MRI,
4778 MachineIRBuilder &B) const {
4779 Register Res = MI.getOperand(i: 0).getReg();
4780 Register LHS = MI.getOperand(i: 1).getReg();
4781 Register RHS = MI.getOperand(i: 2).getReg();
4782 uint16_t Flags = MI.getFlags();
4783 LLT ResTy = MRI.getType(Reg: Res);
4784
4785 const MachineFunction &MF = B.getMF();
4786 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn) ||
4787 MF.getTarget().Options.UnsafeFPMath;
4788
4789 if (auto CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
4790 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
4791 return false;
4792
4793 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4794 // the CI documentation, have a worst-case error of 1 ulp.
4795 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4796 // use it as long as we aren't trying to use denormals.
4797 //
4798 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
4799
4800 // 1 / x -> RCP(x)
4801 if (CLHS->isExactlyValue(V: 1.0)) {
4802 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4803 .addUse(RegNo: RHS)
4804 .setMIFlags(Flags);
4805
4806 MI.eraseFromParent();
4807 return true;
4808 }
4809
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS->isExactlyValue(V: -1.0)) {
4812 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
4813 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4814 .addUse(RegNo: FNeg.getReg(Idx: 0))
4815 .setMIFlags(Flags);
4816
4817 MI.eraseFromParent();
4818 return true;
4819 }
4820 }
4821
4822 // For f16 require afn or arcp.
4823 // For f32 require afn.
4824 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
4825 !MI.getFlag(Flag: MachineInstr::FmArcp)))
4826 return false;
4827
4828 // x / y -> x * (1.0 / y)
4829 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4830 .addUse(RegNo: RHS)
4831 .setMIFlags(Flags);
4832 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
4833
4834 MI.eraseFromParent();
4835 return true;
4836}
4837
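// Fast f64 path, used only when afn (or UnsafeFPMath) allows it: compute a
// reciprocal of y, refine it with two Newton-Raphson iterations, and apply a
// final residual correction. A sketch of the math (not the exact MIR):
//   r = rcp(y)
//   r = r + r * (1 - y*r)      ; iteration 1, in fma form
//   r = r + r * (1 - y*r)      ; iteration 2
//   q = x * r
//   result = q + r * (x - y*q)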
4838bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4839 MachineRegisterInfo &MRI,
4840 MachineIRBuilder &B) const {
4841 Register Res = MI.getOperand(i: 0).getReg();
4842 Register X = MI.getOperand(i: 1).getReg();
4843 Register Y = MI.getOperand(i: 2).getReg();
4844 uint16_t Flags = MI.getFlags();
4845 LLT ResTy = MRI.getType(Reg: Res);
4846
4847 const MachineFunction &MF = B.getMF();
4848 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4849 MI.getFlag(Flag: MachineInstr::FmAfn);
4850
4851 if (!AllowInaccurateRcp)
4852 return false;
4853
4854 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
4855 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
4856
4857 auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4858 .addUse(RegNo: Y)
4859 .setMIFlags(Flags);
4860
4861 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4862 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
4863
4864 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4865 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
4866
4867 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
4868 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
4869
4870 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
4871 MI.eraseFromParent();
4872 return true;
4873}
4874
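// f16 division (when the fast path above does not apply) is performed in f32:
// extend both operands, multiply the numerator by an f32 reciprocal of the
// denominator, truncate back to f16, and post-process with div_fixup. A sketch
// in intrinsic form (names are illustrative):
//   %q   = fptrunc(fpext(lhs) * llvm.amdgcn.rcp(fpext(rhs)))
//   %res = llvm.amdgcn.div.fixup(%q, rhs, lhs)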
4875bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4876 MachineRegisterInfo &MRI,
4877 MachineIRBuilder &B) const {
4878 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4879 return true;
4880
4881 Register Res = MI.getOperand(i: 0).getReg();
4882 Register LHS = MI.getOperand(i: 1).getReg();
4883 Register RHS = MI.getOperand(i: 2).getReg();
4884
4885 uint16_t Flags = MI.getFlags();
4886
4887 LLT S16 = LLT::scalar(SizeInBits: 16);
4888 LLT S32 = LLT::scalar(SizeInBits: 32);
4889
4890 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
4891 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
4892
4893 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
4894 .addUse(RegNo: RHSExt.getReg(Idx: 0))
4895 .setMIFlags(Flags);
4896
4897 auto QUOT = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: RCP, Flags);
4898 auto RDst = B.buildFPTrunc(Res: S16, Op: QUOT, Flags);
4899
4900 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
4901 .addUse(RegNo: RDst.getReg(Idx: 0))
4902 .addUse(RegNo: RHS)
4903 .addUse(RegNo: LHS)
4904 .setMIFlags(Flags);
4905
4906 MI.eraseFromParent();
4907 return true;
4908}
4909
4910static constexpr unsigned SPDenormModeBitField =
4911 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
4912
4913// Enable or disable FP32 denorm mode. When 'Enable' is true, emit
4914// instructions to enable denorm mode; otherwise emit instructions to disable it.
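// Note (an interpretation of the two paths below, not verified against every
// subtarget): with S_DENORM_MODE the FP32 field (bits [1:0] of the immediate)
// and the default FP64/FP16 field (bits [3:2]) are written together in one
// instruction, while older targets update only the 2-bit FP32 slice of the
// MODE register via S_SETREG using SPDenormModeBitField.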
4915static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4916 const GCNSubtarget &ST,
4917 SIModeRegisterDefaults Mode) {
4918 // Set SP denorm mode to this value.
4919 unsigned SPDenormMode =
4920 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4921
4922 if (ST.hasDenormModeInst()) {
4923 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4924 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4925
4926 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4927 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
4928 .addImm(Val: NewDenormModeValue);
4929
4930 } else {
4931 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
4932 .addImm(Val: SPDenormMode)
4933 .addImm(Val: SPDenormModeBitField);
4934 }
4935}
4936
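// Full-precision f32 division. A sketch of the overall shape (the exact
// operand order and flags are what the code below builds):
//   (num_s, den_s) = div_scale(lhs, rhs)   ; scale away extreme exponents
//   r   = rcp(den_s), refined by a short FMA chain
//   q   = div_fmas(...)                    ; combines the refined pieces
//   res = div_fixup(q, rhs, lhs)           ; handles the special cases
// FP32 denormals are temporarily enabled around the FMA chain when the
// function's float mode would otherwise flush them.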
4937bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4938 MachineRegisterInfo &MRI,
4939 MachineIRBuilder &B) const {
4940 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4941 return true;
4942
4943 Register Res = MI.getOperand(i: 0).getReg();
4944 Register LHS = MI.getOperand(i: 1).getReg();
4945 Register RHS = MI.getOperand(i: 2).getReg();
4946 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4947 SIModeRegisterDefaults Mode = MFI->getMode();
4948
4949 uint16_t Flags = MI.getFlags();
4950
4951 LLT S32 = LLT::scalar(SizeInBits: 32);
4952 LLT S1 = LLT::scalar(SizeInBits: 1);
4953
4954 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
4955
4956 auto DenominatorScaled =
4957 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
4958 .addUse(RegNo: LHS)
4959 .addUse(RegNo: RHS)
4960 .addImm(Val: 0)
4961 .setMIFlags(Flags);
4962 auto NumeratorScaled =
4963 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
4964 .addUse(RegNo: LHS)
4965 .addUse(RegNo: RHS)
4966 .addImm(Val: 1)
4967 .setMIFlags(Flags);
4968
4969 auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
4970 .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
4971 .setMIFlags(Flags);
4972 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
4973
4974 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4975 const bool HasDynamicDenormals =
4976 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4977 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4978
4979 Register SavedSPDenormMode;
4980 if (!PreservesDenormals) {
4981 if (HasDynamicDenormals) {
4982 SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4983 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
4984 .addDef(RegNo: SavedSPDenormMode)
4985 .addImm(Val: SPDenormModeBitField);
4986 }
4987 toggleSPDenormMode(Enable: true, B, ST, Mode);
4988 }
4989
4990 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
4991 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
4992 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
4993 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
4994 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
4995 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
4996
4997 if (!PreservesDenormals) {
4998 if (HasDynamicDenormals) {
4999 assert(SavedSPDenormMode);
5000 B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
5001 .addReg(RegNo: SavedSPDenormMode)
5002 .addImm(Val: SPDenormModeBitField);
5003 } else
5004 toggleSPDenormMode(Enable: false, B, ST, Mode);
5005 }
5006
5007 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
5008 .addUse(RegNo: Fma4.getReg(Idx: 0))
5009 .addUse(RegNo: Fma1.getReg(Idx: 0))
5010 .addUse(RegNo: Fma3.getReg(Idx: 0))
5011 .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
5012 .setMIFlags(Flags);
5013
5014 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5015 .addUse(RegNo: Fmas.getReg(Idx: 0))
5016 .addUse(RegNo: RHS)
5017 .addUse(RegNo: LHS)
5018 .setMIFlags(Flags);
5019
5020 MI.eraseFromParent();
5021 return true;
5022}
5023
5024bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5025 MachineRegisterInfo &MRI,
5026 MachineIRBuilder &B) const {
5027 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5028 return true;
5029
5030 Register Res = MI.getOperand(i: 0).getReg();
5031 Register LHS = MI.getOperand(i: 1).getReg();
5032 Register RHS = MI.getOperand(i: 2).getReg();
5033
5034 uint16_t Flags = MI.getFlags();
5035
5036 LLT S64 = LLT::scalar(SizeInBits: 64);
5037 LLT S1 = LLT::scalar(SizeInBits: 1);
5038
5039 auto One = B.buildFConstant(Res: S64, Val: 1.0);
5040
5041 auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5042 .addUse(RegNo: LHS)
5043 .addUse(RegNo: RHS)
5044 .addImm(Val: 0)
5045 .setMIFlags(Flags);
5046
5047 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);
5048
5049 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
5050 .addUse(RegNo: DivScale0.getReg(Idx: 0))
5051 .setMIFlags(Flags);
5052
5053 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
5054 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
5055 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
5056
5057 auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5058 .addUse(RegNo: LHS)
5059 .addUse(RegNo: RHS)
5060 .addImm(Val: 1)
5061 .setMIFlags(Flags);
5062
5063 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5064 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
5065 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);
5066
5067 Register Scale;
5068 if (!ST.hasUsableDivScaleConditionOutput()) {
5069 // Work around a hardware bug on SI where the condition output from div_scale
5070 // is not usable.
5071
5072 LLT S32 = LLT::scalar(SizeInBits: 32);
5073
5074 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5075 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5076 auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
5077 auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);
5078
5079 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5080 Op1: Scale1Unmerge.getReg(Idx: 1));
5081 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5082 Op1: Scale0Unmerge.getReg(Idx: 1));
5083 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
5084 } else {
5085 Scale = DivScale1.getReg(Idx: 1);
5086 }
5087
5088 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
5089 .addUse(RegNo: Fma4.getReg(Idx: 0))
5090 .addUse(RegNo: Fma3.getReg(Idx: 0))
5091 .addUse(RegNo: Mul.getReg(Idx: 0))
5092 .addUse(RegNo: Scale)
5093 .setMIFlags(Flags);
5094
5095 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
5096 .addUse(RegNo: Fmas.getReg(Idx: 0))
5097 .addUse(RegNo: RHS)
5098 .addUse(RegNo: LHS)
5099 .setMIFlags(Flags);
5100
5101 MI.eraseFromParent();
5102 return true;
5103}
5104
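// frexp is lowered to the frexp_mant / frexp_exp intrinsics. On subtargets
// with the fract bug, non-finite inputs are special-cased below: the exponent
// is forced to 0 and the mantissa to the original value (a description of the
// selects emitted below, not of any additional handling).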
5105bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5106 MachineRegisterInfo &MRI,
5107 MachineIRBuilder &B) const {
5108 Register Res0 = MI.getOperand(i: 0).getReg();
5109 Register Res1 = MI.getOperand(i: 1).getReg();
5110 Register Val = MI.getOperand(i: 2).getReg();
5111 uint16_t Flags = MI.getFlags();
5112
5113 LLT Ty = MRI.getType(Reg: Res0);
5114 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5115
5116 auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
5117 .addUse(RegNo: Val)
5118 .setMIFlags(Flags);
5119 auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
5120 .addUse(RegNo: Val)
5121 .setMIFlags(Flags);
5122
5123 if (ST.hasFractBug()) {
5124 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5125 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5126 auto IsFinite =
5127 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5128 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5129 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5130 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5131 }
5132
5133 B.buildCopy(Res: Res0, Op: Mant);
5134 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5135
5136 MI.eraseFromParent();
5137 return true;
5138}
5139
5140bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5141 MachineRegisterInfo &MRI,
5142 MachineIRBuilder &B) const {
5143 Register Res = MI.getOperand(i: 0).getReg();
5144 Register LHS = MI.getOperand(i: 2).getReg();
5145 Register RHS = MI.getOperand(i: 3).getReg();
5146 uint16_t Flags = MI.getFlags();
5147
5148 LLT S32 = LLT::scalar(SizeInBits: 32);
5149 LLT S1 = LLT::scalar(SizeInBits: 1);
5150
5151 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5152 const APFloat C0Val(1.0f);
5153
5154 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5155 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5156 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5157
5158 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5159 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5160
5161 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5162
5163 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5164 .addUse(RegNo: Mul0.getReg(Idx: 0))
5165 .setMIFlags(Flags);
5166
5167 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5168
5169 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5170
5171 MI.eraseFromParent();
5172 return true;
5173}
5174
5175bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5176 MachineRegisterInfo &MRI,
5177 MachineIRBuilder &B) const {
5178 // Bypass the correct expansion that a standard promotion through G_FSQRT
5179 // would get. The f32 op is accurate enough for the f16 case.
5180 unsigned Flags = MI.getFlags();
5181 assert(!ST.has16BitInsts());
5182 const LLT F32 = LLT::scalar(SizeInBits: 32);
5183 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5184 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5185 .addUse(RegNo: Ext.getReg(Idx: 0))
5186 .setMIFlags(Flags);
5187 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5188 MI.eraseFromParent();
5189 return true;
5190}
5191
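// f32 sqrt expansion, in outline (a sketch of the structure below, not extra
// behavior): very small inputs are pre-scaled by 2^32 and the result rescaled
// by 2^-16; when denormal handling is needed, the hardware sqrt result is
// nudged by +/-1 ulp based on FMA residuals, otherwise an rsq-based
// Goldschmidt-style refinement is used; finally +/-0 and +inf are passed
// through unchanged.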
5192bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5193 MachineRegisterInfo &MRI,
5194 MachineIRBuilder &B) const {
5195 MachineFunction &MF = B.getMF();
5196 Register Dst = MI.getOperand(i: 0).getReg();
5197 Register X = MI.getOperand(i: 1).getReg();
5198 const unsigned Flags = MI.getFlags();
5199 const LLT S1 = LLT::scalar(SizeInBits: 1);
5200 const LLT F32 = LLT::scalar(SizeInBits: 32);
5201 const LLT I32 = LLT::scalar(SizeInBits: 32);
5202
5203 if (allowApproxFunc(MF, Flags)) {
5204 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
5205 .addUse(RegNo: X)
5206 .setMIFlags(Flags);
5207 MI.eraseFromParent();
5208 return true;
5209 }
5210
5211 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5212 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5213 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5214 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5215 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5216
5217 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
5218 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5219 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
5220 .addUse(RegNo: SqrtX.getReg(Idx: 0))
5221 .setMIFlags(Flags);
5222
5223 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5224 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5225
5226 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5227 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5228
5229 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5230 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5231
5232 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5233 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5234
5235 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5236 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5237
5238 SqrtS =
5239 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5240
5241 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5242 SqrtS =
5243 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5244 } else {
5245 auto SqrtR =
5246 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5247 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5248
5249 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5250 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5251 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5252 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5253 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5254 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
5255 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5256 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5257 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
5258 }
5259
5260 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5261
5262 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5263
5264 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5265
5266 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5267 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5268
5269 MI.eraseFromParent();
5270 return true;
5271}
5272
5273bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5274 MachineRegisterInfo &MRI,
5275 MachineIRBuilder &B) const {
5276 // For double type, the SQRT and RSQ instructions don't have the required
5277 // precision, so we apply Goldschmidt's algorithm to improve the result:
5278 //
5279 // y0 = rsq(x)
5280 // g0 = x * y0
5281 // h0 = 0.5 * y0
5282 //
5283 // r0 = 0.5 - h0 * g0
5284 // g1 = g0 * r0 + g0
5285 // h1 = h0 * r0 + h0
5286 //
5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5288 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5289 // h2 = h1 * r1 + h1
5290 //
5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5292 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5293 //
5294 // sqrt(x) = g3
5295
5296 const LLT S1 = LLT::scalar(SizeInBits: 1);
5297 const LLT S32 = LLT::scalar(SizeInBits: 32);
5298 const LLT F64 = LLT::scalar(SizeInBits: 64);
5299
5300 Register Dst = MI.getOperand(i: 0).getReg();
5301 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5302
5303 Register X = MI.getOperand(i: 1).getReg();
5304 unsigned Flags = MI.getFlags();
5305
5306 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5307
5308 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5309 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5310
5311 // Scale up input if it is too small.
5312 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5313 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5314 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
5315
5316 auto SqrtY =
5317 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5318
5319 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5320 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5321 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5322
5323 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5324 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5325
5326 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5327 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5328
5329 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5330 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5331
5332 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5333
5334 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5335 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5336
5337 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5338
5339 // Scale down the result.
5340 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5341 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5342 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5343
5344 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5345 // with finite only or nsz because rsq(+/-0) = +/-inf
5346
5347 // TODO: Check for DAZ and expand to subnormals
5348 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5349
5350 // If x is +INF, +0, or -0, use its original value
5351 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5352
5353 MI.eraseFromParent();
5354 return true;
5355}
5356
5357bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5358 MachineRegisterInfo &MRI,
5359 MachineIRBuilder &B) const {
5360 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5361 if (Ty == LLT::scalar(SizeInBits: 32))
5362 return legalizeFSQRTF32(MI, MRI, B);
5363 if (Ty == LLT::scalar(SizeInBits: 64))
5364 return legalizeFSQRTF64(MI, MRI, B);
5365 if (Ty == LLT::scalar(SizeInBits: 16))
5366 return legalizeFSQRTF16(MI, MRI, B);
5367 return false;
5368}
5369
5370// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5371// FIXME: Why do we handle this one but not other removed instructions?
5372//
5373// Reciprocal square root. The clamp prevents infinite results, clamping
5374// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5375// +-max_float.
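// A sketch of the expansion built below (names are illustrative):
//   %rsq = llvm.amdgcn.rsq(%src)
//   %dst = fmaxnum(fminnum(%rsq, +max_float), -max_float)
// The IEEE variants of min/max are used when the function's IEEE mode bit is
// set.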
5376bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5377 MachineRegisterInfo &MRI,
5378 MachineIRBuilder &B) const {
5379 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5380 return true;
5381
5382 Register Dst = MI.getOperand(i: 0).getReg();
5383 Register Src = MI.getOperand(i: 2).getReg();
5384 auto Flags = MI.getFlags();
5385
5386 LLT Ty = MRI.getType(Reg: Dst);
5387
5388 const fltSemantics *FltSemantics;
5389 if (Ty == LLT::scalar(SizeInBits: 32))
5390 FltSemantics = &APFloat::IEEEsingle();
5391 else if (Ty == LLT::scalar(SizeInBits: 64))
5392 FltSemantics = &APFloat::IEEEdouble();
5393 else
5394 return false;
5395
5396 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
5397 .addUse(RegNo: Src)
5398 .setMIFlags(Flags);
5399
5400 // We don't need to concern ourselves with the snan handling difference:
5401 // whether or not the rsq quieted it, use the min/max form that selects directly.
5402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5403 const bool UseIEEE = MFI->getMode().IEEE;
5404
5405 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5406 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5407 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5408
5409 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5410
5411 if (UseIEEE)
5412 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5413 else
5414 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5415 MI.eraseFromParent();
5416 return true;
5417}
5418
5419// TODO: Fix pointer type handling
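// Lane intrinsics natively operate on 32-bit values. Narrower results are
// any-extended to 32 bits and truncated back; wider results whose size is a
// multiple of 32 bits are unmerged into 32-bit (or <2 x s16>) pieces, the
// intrinsic is applied per piece, and the pieces are re-merged. A sketch for a
// 64-bit readlane (names are illustrative):
//   %lo, %hi = G_UNMERGE_VALUES %src:_(s64)
//   %rlo = readlane(%lo, %lane)
//   %rhi = readlane(%hi, %lane)
//   %dst:_(s64) = G_MERGE_VALUES %rlo, %rhi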
5420bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421 MachineInstr &MI,
5422 Intrinsic::ID IID) const {
5423
5424 MachineIRBuilder &B = Helper.MIRBuilder;
5425 MachineRegisterInfo &MRI = *B.getMRI();
5426
5427 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428 IID == Intrinsic::amdgcn_permlanex16;
5429
5430 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431 Register Src2, LLT VT) -> Register {
5432 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
5433 switch (IID) {
5434 case Intrinsic::amdgcn_readfirstlane:
5435 case Intrinsic::amdgcn_permlane64:
5436 return LaneOp.getReg(Idx: 0);
5437 case Intrinsic::amdgcn_readlane:
5438 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
5439 case Intrinsic::amdgcn_writelane:
5440 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
5441 case Intrinsic::amdgcn_permlane16:
5442 case Intrinsic::amdgcn_permlanex16: {
5443 Register Src3 = MI.getOperand(i: 5).getReg();
5444 int64_t Src4 = MI.getOperand(i: 6).getImm();
5445 int64_t Src5 = MI.getOperand(i: 7).getImm();
5446 return LaneOp.addUse(RegNo: Src1)
5447 .addUse(RegNo: Src2)
5448 .addUse(RegNo: Src3)
5449 .addImm(Val: Src4)
5450 .addImm(Val: Src5)
5451 .getReg(Idx: 0);
5452 }
5453 default:
5454 llvm_unreachable("unhandled lane op");
5455 }
5456 };
5457
5458 Register DstReg = MI.getOperand(i: 0).getReg();
5459 Register Src0 = MI.getOperand(i: 2).getReg();
5460 Register Src1, Src2;
5461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462 IsPermLane16) {
5463 Src1 = MI.getOperand(i: 3).getReg();
5464 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465 Src2 = MI.getOperand(i: 4).getReg();
5466 }
5467 }
5468
5469 LLT Ty = MRI.getType(Reg: DstReg);
5470 unsigned Size = Ty.getSizeInBits();
5471
5472 if (Size == 32) {
5473 // Already legal
5474 return true;
5475 }
5476
5477 if (Size < 32) {
5478 Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);
5479
5480 if (IsPermLane16)
5481 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
5482
5483 if (IID == Intrinsic::amdgcn_writelane)
5484 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
5485
5486 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
5488 MI.eraseFromParent();
5489 return true;
5490 }
5491
5492 if (Size % 32 != 0)
5493 return false;
5494
5495 LLT PartialResTy = S32;
5496 if (Ty.isVector()) {
5497 LLT EltTy = Ty.getElementType();
5498 switch (EltTy.getSizeInBits()) {
5499 case 16:
5500 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: 2));
5501 break;
5502 case 32:
5503 PartialResTy = EltTy;
5504 break;
5505 default:
5506 // Handle all other cases via S32 pieces.
5507 break;
5508 }
5509 }
5510
5511 SmallVector<Register, 2> PartialRes;
5512 unsigned NumParts = Size / 32;
5513 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
5514 MachineInstrBuilder Src1Parts, Src2Parts;
5515
5516 if (IsPermLane16)
5517 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
5518
5519 if (IID == Intrinsic::amdgcn_writelane)
5520 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
5521
5522 for (unsigned i = 0; i < NumParts; ++i) {
5523 Src0 = Src0Parts.getReg(Idx: i);
5524
5525 if (IsPermLane16)
5526 Src1 = Src1Parts.getReg(Idx: i);
5527
5528 if (IID == Intrinsic::amdgcn_writelane)
5529 Src2 = Src2Parts.getReg(Idx: i);
5530
5531 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
5532 }
5533
5534 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
5535 MI.eraseFromParent();
5536 return true;
5537}
5538
5539bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5540 MachineRegisterInfo &MRI,
5541 MachineIRBuilder &B) const {
5542 uint64_t Offset =
5543 ST.getTargetLowering()->getImplicitParameterOffset(
5544 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
5545 LLT DstTy = MRI.getType(Reg: DstReg);
5546 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
5547
5548 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
5549 if (!loadInputValue(DstReg: KernargPtrReg, B,
5550 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5551 return false;
5552
5553 // FIXME: This should be nuw
5554 B.buildPtrAdd(Res: DstReg, Op0: KernargPtrReg, Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
5555 return true;
5556}
5557
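// A sketch of the descriptor words produced for make.buffer.rsrc by the
// function below (field layout as implied by the code, not a full rsrc spec):
//   word0 = pointer[31:0]
//   word1 = (pointer[63:32] & 0xffff) | (stride << 16)
//   word2 = numRecords
//   word3 = flags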
5558/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5559/// bits of the pointer and replace them with the stride argument, then
5560/// merge_values everything together. In the common case of a raw buffer (the
5561/// stride component is 0), we can just AND off the upper half.
5562bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5563 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5564 Register Result = MI.getOperand(i: 0).getReg();
5565 Register Pointer = MI.getOperand(i: 2).getReg();
5566 Register Stride = MI.getOperand(i: 3).getReg();
5567 Register NumRecords = MI.getOperand(i: 4).getReg();
5568 Register Flags = MI.getOperand(i: 5).getReg();
5569
5570 LLT S32 = LLT::scalar(SizeInBits: 32);
5571
5572 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5573 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
5574 Register LowHalf = Unmerge.getReg(Idx: 0);
5575 Register HighHalf = Unmerge.getReg(Idx: 1);
5576
5577 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
5578 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
5579
5580 MachineInstrBuilder NewHighHalf = Masked;
5581 std::optional<ValueAndVReg> StrideConst =
5582 getIConstantVRegValWithLookThrough(VReg: Stride, MRI);
5583 if (!StrideConst || !StrideConst->Value.isZero()) {
5584 MachineInstrBuilder ShiftedStride;
5585 if (StrideConst) {
5586 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5587 uint32_t ShiftedStrideVal = StrideVal << 16;
5588 ShiftedStride = B.buildConstant(Res: S32, Val: ShiftedStrideVal);
5589 } else {
5590 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
5591 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
5592 ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
5593 }
5594 NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
5595 }
5596 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
5597 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
5598 MI.eraseFromParent();
5599 return true;
5600}
5601
5602bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5603 MachineRegisterInfo &MRI,
5604 MachineIRBuilder &B) const {
5605 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5606 if (!MFI->isEntryFunction()) {
5607 return legalizePreloadedArgIntrin(MI, MRI, B,
5608 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5609 }
5610
5611 Register DstReg = MI.getOperand(i: 0).getReg();
5612 if (!getImplicitArgPtr(DstReg, MRI, B))
5613 return false;
5614
5615 MI.eraseFromParent();
5616 return true;
5617}
5618
5619bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5620 MachineRegisterInfo &MRI,
5621 MachineIRBuilder &B) const {
5622 Function &F = B.getMF().getFunction();
5623 std::optional<uint32_t> KnownSize =
5624 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5625 if (KnownSize.has_value())
5626 B.buildConstant(Res: DstReg, Val: *KnownSize);
5627 return false;
5628}
5629
5630bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631 MachineRegisterInfo &MRI,
5632 MachineIRBuilder &B) const {
5633
5634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635 if (!MFI->isEntryFunction()) {
5636 return legalizePreloadedArgIntrin(MI, MRI, B,
5637 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5638 }
5639
5640 Register DstReg = MI.getOperand(i: 0).getReg();
5641 if (!getLDSKernelId(DstReg, MRI, B))
5642 return false;
5643
5644 MI.eraseFromParent();
5645 return true;
5646}
5647
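// amdgcn.is.shared / amdgcn.is.private style queries are answered by comparing
// the high 32 bits of the flat pointer against the corresponding segment
// aperture (see getSegmentAperture), which is what the helper below builds.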
5648bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5649 MachineRegisterInfo &MRI,
5650 MachineIRBuilder &B,
5651 unsigned AddrSpace) const {
5652 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
5653 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: MI.getOperand(i: 2).getReg());
5654 Register Hi32 = Unmerge.getReg(Idx: 1);
5655
5656 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
5657 MI.eraseFromParent();
5658 return true;
5659}
5660
5661// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5662// offset (the offset that is included in bounds checking and swizzling, to be
5663// split between the instruction's voffset and immoffset fields) and soffset
5664// (the offset that is excluded from bounds checking and swizzling, to go in
5665// the instruction's soffset field). This function takes the first kind of
5666// offset and figures out how to split it between voffset and immoffset.
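// A worked example (assuming a 12-bit immoffset field, i.e. MaxImm == 4095):
// an incoming constant offset of 4100 is split into immoffset = 4 and a
// voffset contribution of 4096, so the large power-of-two part has a better
// chance of being CSEd across neighbouring accesses.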
5667std::pair<Register, unsigned>
5668AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5669 Register OrigOffset) const {
5670 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5671 Register BaseReg;
5672 unsigned ImmOffset;
5673 const LLT S32 = LLT::scalar(SizeInBits: 32);
5674 MachineRegisterInfo &MRI = *B.getMRI();
5675
5676 std::tie(args&: BaseReg, args&: ImmOffset) =
5677 AMDGPU::getBaseWithConstantOffset(MRI, Reg: OrigOffset);
5678
5679 // If BaseReg is a pointer, convert it to int.
5680 if (MRI.getType(Reg: BaseReg).isPointer())
5681 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
5682
5683 // If the immediate value is too big for the immoffset field, keep only the
5684 // bits that fit in the immoffset field. The remaining value that is
5685 // copied/added for the voffset field is a large power of 2, and it stands
5686 // a better chance of being CSE'd with the copy/add for another similar
5687 // load/store.
5688 // However, do not do that rounding down if the part moved to the voffset
5689 // would be a negative number, as it appears to be illegal to have a negative
5690 // offset in the vgpr, even if adding the immediate offset makes it positive.
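// For example (illustrative): a constant offset of -8 is not rounded down at
// all; immoffset stays 0 and the full -8 is materialized in the voffset
// register instead.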
5691 unsigned Overflow = ImmOffset & ~MaxImm;
5692 ImmOffset -= Overflow;
5693 if ((int32_t)Overflow < 0) {
5694 Overflow += ImmOffset;
5695 ImmOffset = 0;
5696 }
5697
5698 if (Overflow != 0) {
5699 if (!BaseReg) {
5700 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
5701 } else {
5702 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
5703 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
5704 }
5705 }
5706
5707 if (!BaseReg)
5708 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5709
5710 return std::pair(BaseReg, ImmOffset);
5711}
5712
5713/// Handle register layout difference for f16 images for some subtargets.
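/// For example (an illustrative sketch): on subtargets with unpacked D16
/// memory instructions, <4 x s16> data is any-extended element by element into
/// <4 x s32>; on subtargets with the image-store D16 bug, <3 x s16> data is
/// padded to <6 x s16> with undef and bitcast to <3 x s32>.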
5714Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5715 MachineRegisterInfo &MRI,
5716 Register Reg,
5717 bool ImageStore) const {
5718 const LLT S16 = LLT::scalar(SizeInBits: 16);
5719 const LLT S32 = LLT::scalar(SizeInBits: 32);
5720 LLT StoreVT = MRI.getType(Reg);
5721 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5722
5723 if (ST.hasUnpackedD16VMem()) {
5724 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5725
5726 SmallVector<Register, 4> WideRegs;
5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5728 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5729
5730 int NumElts = StoreVT.getNumElements();
5731
5732 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
5733 .getReg(Idx: 0);
5734 }
5735
5736 if (ImageStore && ST.hasImageStoreD16Bug()) {
5737 if (StoreVT.getNumElements() == 2) {
5738 SmallVector<Register, 4> PackedRegs;
5739 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
5740 PackedRegs.push_back(Elt: Reg);
5741 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5742 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
5743 .getReg(Idx: 0);
5744 }
5745
5746 if (StoreVT.getNumElements() == 3) {
5747 SmallVector<Register, 4> PackedRegs;
5748 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5750 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5751 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
5752 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
5753 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5754 }
5755
5756 if (StoreVT.getNumElements() == 4) {
5757 SmallVector<Register, 4> PackedRegs;
5758 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5759 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5761 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5762 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5763 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
5764 .getReg(Idx: 0);
5765 }
5766
5767 llvm_unreachable("invalid data type");
5768 }
5769
5770 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
5771 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
5772 .getReg(Idx: 0);
5773 }
5774 return Reg;
5775}
5776
5777Register AMDGPULegalizerInfo::fixStoreSourceType(
5778 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5779 MachineRegisterInfo *MRI = B.getMRI();
5780 LLT Ty = MRI->getType(Reg: VData);
5781
5782 const LLT S16 = LLT::scalar(SizeInBits: 16);
5783
5784 // Fixup buffer resources themselves needing to be v4i32.
5785 if (hasBufferRsrcWorkaround(Ty))
5786 return castBufferRsrcToV4I32(Pointer: VData, B);
5787
5788 // Fixup illegal register types for i8 and i16 stores.
5789 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
5790 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
5791 return AnyExt;
5792 }
5793
5794 if (Ty.isVector()) {
5795 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5796 if (IsFormat)
5797 return handleD16VData(B, MRI&: *MRI, Reg: VData);
5798 }
5799 }
5800
5801 return VData;
5802}
5803
5804bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5805 MachineRegisterInfo &MRI,
5806 MachineIRBuilder &B,
5807 bool IsTyped,
5808 bool IsFormat) const {
5809 Register VData = MI.getOperand(i: 1).getReg();
5810 LLT Ty = MRI.getType(Reg: VData);
5811 LLT EltTy = Ty.getScalarType();
5812 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5813 const LLT S32 = LLT::scalar(SizeInBits: 32);
5814
5815 VData = fixStoreSourceType(B, VData, IsFormat);
5816 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
5817 Register RSrc = MI.getOperand(i: 2).getReg();
5818
5819 MachineMemOperand *MMO = *MI.memoperands_begin();
5820 const int MemSize = MMO->getSize().getValue();
5821
5822 unsigned ImmOffset;
5823
5824 // The typed intrinsics add an immediate after the registers.
5825 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5826
5827 // The struct intrinsic variants add one additional operand over raw.
5828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5829 Register VIndex;
5830 int OpOffset = 0;
5831 if (HasVIndex) {
5832 VIndex = MI.getOperand(i: 3).getReg();
5833 OpOffset = 1;
5834 } else {
5835 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5836 }
5837
5838 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5839 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5840
5841 unsigned Format = 0;
5842 if (IsTyped) {
5843 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5844 ++OpOffset;
5845 }
5846
5847 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5848
5849 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5850
5851 unsigned Opc;
5852 if (IsTyped) {
5853 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5854 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5855 } else if (IsFormat) {
5856 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5857 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5858 } else {
5859 switch (MemSize) {
5860 case 1:
5861 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5862 break;
5863 case 2:
5864 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5865 break;
5866 default:
5867 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5868 break;
5869 }
5870 }
5871
5872 auto MIB = B.buildInstr(Opcode: Opc)
5873 .addUse(RegNo: VData) // vdata
5874 .addUse(RegNo: RSrc) // rsrc
5875 .addUse(RegNo: VIndex) // vindex
5876 .addUse(RegNo: VOffset) // voffset
5877 .addUse(RegNo: SOffset) // soffset
5878 .addImm(Val: ImmOffset); // offset(imm)
5879
5880 if (IsTyped)
5881 MIB.addImm(Val: Format);
5882
5883 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5884 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5885 .addMemOperand(MMO);
5886
5887 MI.eraseFromParent();
5888 return true;
5889}
5890
5891static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5892 Register VIndex, Register VOffset, Register SOffset,
5893 unsigned ImmOffset, unsigned Format,
5894 unsigned AuxiliaryData, MachineMemOperand *MMO,
5895 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5896 auto MIB = B.buildInstr(Opcode: Opc)
5897 .addDef(RegNo: LoadDstReg) // vdata
5898 .addUse(RegNo: RSrc) // rsrc
5899 .addUse(RegNo: VIndex) // vindex
5900 .addUse(RegNo: VOffset) // voffset
5901 .addUse(RegNo: SOffset) // soffset
5902 .addImm(Val: ImmOffset); // offset(imm)
5903
5904 if (IsTyped)
5905 MIB.addImm(Val: Format);
5906
5907 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5908 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5909 .addMemOperand(MMO);
5910}
5911
5912bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5913 MachineRegisterInfo &MRI,
5914 MachineIRBuilder &B,
5915 bool IsFormat,
5916 bool IsTyped) const {
5917 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5918 MachineMemOperand *MMO = *MI.memoperands_begin();
5919 const LLT MemTy = MMO->getMemoryType();
5920 const LLT S32 = LLT::scalar(SizeInBits: 32);
5921
5922 Register Dst = MI.getOperand(i: 0).getReg();
5923
5924 Register StatusDst;
5925 int OpOffset = 0;
5926 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5927 bool IsTFE = MI.getNumExplicitDefs() == 2;
5928 if (IsTFE) {
5929 StatusDst = MI.getOperand(i: 1).getReg();
5930 ++OpOffset;
5931 }
5932
5933 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
5934 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
5935
5936 // The typed intrinsics add an immediate after the registers.
5937 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5938
5939 // The struct intrinsic variants add one additional operand over raw.
5940 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5941 Register VIndex;
5942 if (HasVIndex) {
5943 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
5944 ++OpOffset;
5945 } else {
5946 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5947 }
5948
5949 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5950 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5951
5952 unsigned Format = 0;
5953 if (IsTyped) {
5954 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5955 ++OpOffset;
5956 }
5957
5958 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5959 unsigned ImmOffset;
5960
5961 LLT Ty = MRI.getType(Reg: Dst);
5962 // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
5963 // logic doesn't have to handle that case.
5964 if (hasBufferRsrcWorkaround(Ty)) {
5965 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
5966 Dst = MI.getOperand(i: 0).getReg();
5967 }
5968 LLT EltTy = Ty.getScalarType();
5969 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5970 const bool Unpacked = ST.hasUnpackedD16VMem();
5971
5972 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5973
5974 unsigned Opc;
5975
5976 // TODO: Support TFE for typed and narrow loads.
5977 if (IsTyped) {
5978 if (IsTFE)
5979 return false;
5980 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5981 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5982 } else if (IsFormat) {
5983 if (IsD16) {
5984 if (IsTFE)
5985 return false;
5986 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5987 } else {
5988 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5989 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5990 }
5991 } else {
5992 switch (MemTy.getSizeInBits()) {
5993 case 8:
5994 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5995 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5996 break;
5997 case 16:
5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6000 break;
6001 default:
6002 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6003 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6004 break;
6005 }
6006 }
6007
6008 if (IsTFE) {
6009 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
6010 unsigned NumLoadDWords = NumValueDWords + 1;
6011 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
6013 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6014 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6015 if (MemTy.getSizeInBits() < 32) {
6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6017 B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
6018 B.buildTrunc(Res: Dst, Op: ExtDst);
6019 } else if (NumValueDWords == 1) {
6020 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
6021 } else {
6022 SmallVector<Register, 5> LoadElts;
6023 for (unsigned I = 0; I != NumValueDWords; ++I)
6024 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
6025 LoadElts.push_back(Elt: StatusDst);
6026 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
6027 LoadElts.truncate(N: NumValueDWords);
6028 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
6029 }
6030 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6031 (IsD16 && !Ty.isVector())) {
6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6033 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6034 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6035 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6036 B.buildTrunc(Res: Dst, Op: LoadDstReg);
6037 } else if (Unpacked && IsD16 && Ty.isVector()) {
6038 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
6040 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6041 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6042 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6043 // FIXME: G_TRUNC should work, but legalization currently fails
6044 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
6045 SmallVector<Register, 4> Repack;
6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6047 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6048 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
6049 } else {
6050 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6051 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6052 }
6053
6054 MI.eraseFromParent();
6055 return true;
6056}
6057
6058static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6059 switch (IntrID) {
6060 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6062 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6065 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6066 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6067 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6070 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6141 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6143 default:
6144 llvm_unreachable("unhandled atomic opcode");
6145 }
6146}
6147
6148bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6149 MachineIRBuilder &B,
6150 Intrinsic::ID IID) const {
6151 const bool IsCmpSwap =
6152 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6153 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6154 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6155 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6156
6157 Register Dst = MI.getOperand(i: 0).getReg();
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159 // p8 arguments to the atomic itself.
6160 Register VData = MI.getOperand(i: 2).getReg();
6161
6162 Register CmpVal;
6163 int OpOffset = 0;
6164
6165 if (IsCmpSwap) {
6166 CmpVal = MI.getOperand(i: 3).getReg();
6167 ++OpOffset;
6168 }
6169
6170 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6171 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6172 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6173
6174 // The struct intrinsic variants add one additional operand over raw.
6175 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6176 Register VIndex;
6177 if (HasVIndex) {
6178 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6179 ++OpOffset;
6180 } else {
6181 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6182 }
6183
6184 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6185 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6186 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6187
6188 MachineMemOperand *MMO = *MI.memoperands_begin();
6189
6190 unsigned ImmOffset;
6191 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6192
6193 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6194 .addDef(RegNo: Dst)
6195 .addUse(RegNo: VData); // vdata
6196
6197 if (IsCmpSwap)
6198 MIB.addReg(RegNo: CmpVal);
6199
6200 MIB.addUse(RegNo: RSrc) // rsrc
6201 .addUse(RegNo: VIndex) // vindex
6202 .addUse(RegNo: VOffset) // voffset
6203 .addUse(RegNo: SOffset) // soffset
6204 .addImm(Val: ImmOffset) // offset(imm)
6205 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6206 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6207 .addMemOperand(MMO);
6208
6209 MI.eraseFromParent();
6210 return true;
6211}
6212
6213/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6214/// vector with s16 typed elements.
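/// For example (an illustrative sketch): a 3-D set of s16 coordinates
/// (x, y, z) becomes two v2s16 registers, (x, y) and (z, undef); gradient
/// pairs are packed the same way, with a trailing odd element padded by undef.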
6215static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6216 SmallVectorImpl<Register> &PackedAddrs,
6217 unsigned ArgOffset,
6218 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6219 bool IsA16, bool IsG16) {
6220 const LLT S16 = LLT::scalar(SizeInBits: 16);
6221 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6222 auto EndIdx = Intr->VAddrEnd;
6223
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6225 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6226 if (!SrcOp.isReg())
6227 continue; // _L to _LZ may have eliminated this.
6228
6229 Register AddrReg = SrcOp.getReg();
6230
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6237 // Special handling of bias when A16 is on. Bias is of type half but
6238 // occupies a full 32 bits.
6239 PackedAddrs.push_back(
6240 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6241 .getReg(Idx: 0));
6242 } else {
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6244 "Bias needs to be converted to 16 bit in A16 mode");
6245 // Handle any gradient or coordinate operands that should not be packed
6246 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6247 PackedAddrs.push_back(Elt: AddrReg);
6248 }
6249 } else {
6250 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6251 // derivatives dx/dh and dx/dv are packed with undef.
6252 if (((I + 1) >= EndIdx) ||
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6258 // Check for _L to _LZ optimization
6259 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6260 PackedAddrs.push_back(
6261 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6262 .getReg(Idx: 0));
6263 } else {
6264 PackedAddrs.push_back(
6265 Elt: B.buildBuildVector(
6266 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6267 .getReg(Idx: 0));
6268 ++I;
6269 }
6270 }
6271 }
6272}
6273
6274/// Convert from separate vaddr components to a single vector address register,
6275/// and replace the remaining operands with $noreg.
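/// For example (an illustrative sketch): three s32 vaddr operands become one
/// <3 x s32> G_BUILD_VECTOR placed in the first vaddr slot, and the remaining
/// two operands are replaced with $noreg.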
6276static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6277 int DimIdx, int NumVAddrs) {
6278 const LLT S32 = LLT::scalar(SizeInBits: 32);
6279 (void)S32;
6280 SmallVector<Register, 8> AddrRegs;
6281 for (int I = 0; I != NumVAddrs; ++I) {
6282 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6283 if (SrcOp.isReg()) {
6284 AddrRegs.push_back(Elt: SrcOp.getReg());
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6286 }
6287 }
6288
6289 int NumAddrRegs = AddrRegs.size();
6290 if (NumAddrRegs != 1) {
6291 auto VAddr =
6292 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6293 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6294 }
6295
6296 for (int I = 1; I != NumVAddrs; ++I) {
6297 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6298 if (SrcOp.isReg())
6299 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
6300 }
6301}
6302
6303/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6304///
6305/// Depending on the subtarget, load/store with 16-bit element data need to be
6306/// rewritten to use the low half of 32-bit registers, or directly use a packed
6307/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6308/// registers.
6309///
6310/// We don't want to directly select image instructions just yet, but also want
6311/// to expose all register repacking to the legalizer/combiners. We also don't
6312/// want a selected instruction entering RegBankSelect. In order to avoid
6313/// defining a multitude of intermediate image instructions, directly hack on
6314/// the intrinsic's arguments. In cases like a16 addresses, this requires
6315/// padding now-unnecessary arguments with $noreg.
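///
/// For example (an illustrative sketch): with A16 enabled, a 2-D sample's s16
/// coordinates (u, v) are packed into a single v2s16 vaddr operand, the
/// now-unused vaddr operand is replaced with $noreg, and the A16/G16 flags are
/// appended as a trailing immediate.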
6316bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6317 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6318 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6319
6320 const MachineFunction &MF = *MI.getMF();
6321 const unsigned NumDefs = MI.getNumExplicitDefs();
6322 const unsigned ArgOffset = NumDefs + 1;
6323 bool IsTFE = NumDefs == 2;
6324 // We are only processing the operands of d16 image operations on subtargets
6325 // that use the unpacked register layout, or need to repack the TFE result.
6326
6327 // TODO: Do we need to guard against already legalized intrinsics?
6328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6330
6331 MachineRegisterInfo *MRI = B.getMRI();
6332 const LLT S32 = LLT::scalar(SizeInBits: 32);
6333 const LLT S16 = LLT::scalar(SizeInBits: 16);
6334 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6335
6336 unsigned DMask = 0;
6337 Register VData;
6338 LLT Ty;
6339
6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6341 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6342 Ty = MRI->getType(Reg: VData);
6343 }
6344
6345 const bool IsAtomicPacked16Bit =
6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6348
6349 // Check for 16 bit addresses and pack if true.
6350 LLT GradTy =
6351 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6352 LLT AddrTy =
6353 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6354 const bool IsG16 =
6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6356 const bool IsA16 = AddrTy == S16;
6357 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6358
6359 int DMaskLanes = 0;
6360 if (!BaseOpcode->Atomic) {
6361 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6362 if (BaseOpcode->Gather4) {
6363 DMaskLanes = 4;
6364 } else if (DMask != 0) {
6365 DMaskLanes = llvm::popcount(Value: DMask);
6366 } else if (!IsTFE && !BaseOpcode->Store) {
6367 // If dmask is 0, this is a no-op load. This can be eliminated.
6368 B.buildUndef(Res: MI.getOperand(i: 0));
6369 MI.eraseFromParent();
6370 return true;
6371 }
6372 }
6373
6374 Observer.changingInstr(MI);
6375 auto ChangedInstr = make_scope_exit(F: [&] { Observer.changedInstr(MI); });
6376
6377 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6378 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6379 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6380 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6381 unsigned NewOpcode = LoadOpcode;
6382 if (BaseOpcode->Store)
6383 NewOpcode = StoreOpcode;
6384 else if (BaseOpcode->NoReturn)
6385 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6386
6387 // Track that we legalized this
6388 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6389
6390 // We expect to get an error flag since TFE is on and dmask is 0. Force
6391 // dmask to be at least 1, otherwise the instruction will fail.
6392 if (IsTFE && DMask == 0) {
6393 DMask = 0x1;
6394 DMaskLanes = 1;
6395 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6396 }
6397
6398 if (BaseOpcode->Atomic) {
6399 Register VData0 = MI.getOperand(i: 2).getReg();
6400 LLT Ty = MRI->getType(Reg: VData0);
6401
6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6403 if (Ty.isVector() && !IsAtomicPacked16Bit)
6404 return false;
6405
6406 if (BaseOpcode->AtomicX2) {
6407 Register VData1 = MI.getOperand(i: 3).getReg();
6408 // The two values are packed in one register.
6409 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6410 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6411 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6412 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
6413 }
6414 }
6415
6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6417
6418 // Rewrite the addressing register layout before doing anything else.
6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6420 // 16 bit gradients are supported, but are tied to the A16 control
6421 // so both gradients and addresses must be 16 bit
6422 return false;
6423 }
6424
6425 if (IsA16 && !ST.hasA16()) {
6426 // A16 not supported
6427 return false;
6428 }
6429
6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6431 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6432
6433 if (IsA16 || IsG16) {
6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6435 // instructions expect VGPR_32
6436 SmallVector<Register, 4> PackedRegs;
6437
6438 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6439
6440 // See also below in the non-a16 branch
6441 const bool UseNSA = ST.hasNSAEncoding() &&
6442 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6443 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6444 const bool UsePartialNSA =
6445 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6446
6447 if (UsePartialNSA) {
6448 // Pack registers that would go over NSAMaxSize into last VAddr register
6449 LLT PackedAddrTy =
6450 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6451 auto Concat = B.buildConcatVectors(
6452 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6454 PackedRegs.resize(N: NSAMaxSize);
6455 } else if (!UseNSA && PackedRegs.size() > 1) {
6456 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6457 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6458 PackedRegs[0] = Concat.getReg(Idx: 0);
6459 PackedRegs.resize(N: 1);
6460 }
6461
6462 const unsigned NumPacked = PackedRegs.size();
6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6464 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6465 if (!SrcOp.isReg()) {
6466 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6467 continue;
6468 }
6469
6470 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6471
6472 if (I - Intr->VAddrStart < NumPacked)
6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6474 else
6475 SrcOp.setReg(AMDGPU::NoRegister);
6476 }
6477 } else {
6478 // If the register allocator cannot place the address registers contiguously
6479 // without introducing moves, then using the non-sequential address encoding
6480 // is always preferable, since it saves VALU instructions and is usually a
6481 // wash in terms of code size or even better.
6482 //
6483 // However, we currently have no way of hinting to the register allocator
6484 // that MIMG addresses should be placed contiguously when it is possible to
6485 // do so, so force non-NSA for the common 2-address case as a heuristic.
6486 //
6487 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6488 // allocation when possible.
6489 //
6490 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6491 // set of the remaining addresses.
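// For example (an illustrative sketch): with an NSA limit of 5 and 7 address
// operands, the last 3 addresses are packed into one <3 x s32> register,
// leaving 5 vaddr operands in total.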
6492 const bool UseNSA = ST.hasNSAEncoding() &&
6493 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6494 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6495 const bool UsePartialNSA =
6496 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6497
6498 if (UsePartialNSA) {
6499 convertImageAddrToPacked(B, MI,
6500 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
6502 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
6504 NumVAddrs: Intr->NumVAddrs);
6505 }
6506 }
6507
6508 int Flags = 0;
6509 if (IsA16)
6510 Flags |= 1;
6511 if (IsG16)
6512 Flags |= 2;
6513 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
6514
6515 if (BaseOpcode->NoReturn) { // No TFE for stores?
6516 // TODO: Handle dmask trim
6517 if (!Ty.isVector() || !IsD16)
6518 return true;
6519
6520 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
6521 if (RepackedReg != VData) {
6522 MI.getOperand(i: 1).setReg(RepackedReg);
6523 }
6524
6525 return true;
6526 }
6527
6528 Register DstReg = MI.getOperand(i: 0).getReg();
6529 const LLT EltTy = Ty.getScalarType();
6530 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6531
6532 // Confirm that the return type is large enough for the dmask specified
6533 if (NumElts < DMaskLanes)
6534 return false;
6535
6536 if (NumElts > 4 || DMaskLanes > 4)
6537 return false;
6538
6539 // Image atomic instructions use DMask to specify how many bits the
6540 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6541 // DMaskLanes defaults to 0 for image atomics.
6542 // We must make sure that atomic variants (especially packed ones) are not
6543 // truncated from v2s16 or v4s16 to s16.
6544 //
6545 // ChangeElementCount will be needed for image load where Ty is always scalar.
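//
// For example (an illustrative sketch): a non-atomic load with dmask 0b0101
// has DMaskLanes == 2, so a declared <4 x s32> result is loaded as <2 x s32>
// and padded back out with undef elements below.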
6546 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6547 const LLT AdjustedTy =
6548 DMaskLanes == 0
6549 ? Ty
6550 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
6551
6552 // The raw dword-aligned data component of the load. The only legal cases
6553 // where this matters should be when using the packed D16 format, for
6554 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6555 LLT RoundedTy;
6556
6557 // S32 vector to cover all data, plus TFE result element.
6558 LLT TFETy;
6559
6560 // Register type to use for each loaded component. Will be S32 or V2S16.
6561 LLT RegTy;
6562
6563 if (IsD16 && ST.hasUnpackedD16VMem()) {
6564 RoundedTy =
6565 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
6566 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
6567 RegTy = S32;
6568 } else {
6569 unsigned EltSize = EltTy.getSizeInBits();
6570 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6571 unsigned RoundedSize = 32 * RoundedElts;
6572 RoundedTy = LLT::scalarOrVector(
6573 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
6574 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
6575 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6576 }
6577
6578 // The return type does not need adjustment.
6579 // TODO: Should we change s16 case to s32 or <2 x s16>?
6580 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6581 return true;
6582
6583 Register Dst1Reg;
6584
6585 // Insert after the instruction.
6586 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
6587
6588 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6589 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6590 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6591 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6592
6593 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
6594
6595 MI.getOperand(i: 0).setReg(NewResultReg);
6596
6597 // In the IR, TFE is supposed to be used with a 2 element struct return
6598 // type. The instruction really returns these two values in one contiguous
6599 // register, with one additional dword beyond the loaded data. Rewrite the
6600 // return type to use a single register result.
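//
// For example (an illustrative sketch): a TFE load whose data part is
// <2 x s32> is rewritten to define one <3 x s32> register, which is then
// unmerged below into the two data dwords and the status dword.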
6601
6602 if (IsTFE) {
6603 Dst1Reg = MI.getOperand(i: 1).getReg();
6604 if (MRI->getType(Reg: Dst1Reg) != S32)
6605 return false;
6606
6607 // TODO: Make sure the TFE operand bit is set.
6608 MI.removeOperand(OpNo: 1);
6609
6610 // Handle the easy case that requires no repack instructions.
6611 if (Ty == S32) {
6612 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
6613 return true;
6614 }
6615 }
6616
6617 // Now figure out how to copy the new result register back into the old
6618 // result.
6619 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6620
6621 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6622
6623 if (ResultNumRegs == 1) {
6624 assert(!IsTFE);
6625 ResultRegs[0] = NewResultReg;
6626 } else {
6627 // We have to repack into a new vector of some kind.
6628 for (int I = 0; I != NumDataRegs; ++I)
6629 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
6630 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
6631
6632 // Drop the final TFE element to get the data part. The TFE result is
6633 // directly written to the right place already.
6634 if (IsTFE)
6635 ResultRegs.resize(N: NumDataRegs);
6636 }
6637
6638 // For an s16 scalar result, we form an s32 result with a truncate regardless
6639 // of packed vs. unpacked.
6640 if (IsD16 && !Ty.isVector()) {
6641 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
6642 return true;
6643 }
6644
6645 // Avoid a build/concat_vector of 1 entry.
6646 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6647 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
6648 return true;
6649 }
6650
6651 assert(Ty.isVector());
6652
6653 if (IsD16) {
6654 // For packed D16 results with TFE enabled, all the data components are
6655 // S32. Cast back to the expected type.
6656 //
6657 // TODO: We don't really need to use load s32 elements. We would only need one
6658 // cast for the TFE result if a multiple of v2s16 was used.
6659 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6660 for (Register &Reg : ResultRegs)
6661 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
6662 } else if (ST.hasUnpackedD16VMem()) {
6663 for (Register &Reg : ResultRegs)
6664 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
6665 }
6666 }
6667
6668 auto padWithUndef = [&](LLT Ty, int NumElts) {
6669 if (NumElts == 0)
6670 return;
6671 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
6672 for (int I = 0; I != NumElts; ++I)
6673 ResultRegs.push_back(Elt: Undef);
6674 };
6675
6676 // Pad out any elements eliminated due to the dmask.
6677 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
6678 if (!ResTy.isVector()) {
6679 padWithUndef(ResTy, NumElts - ResultRegs.size());
6680 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
6681 return true;
6682 }
6683
6684 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6685 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6686
6687 // Deal with the one annoying legal case.
6688 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
6689 if (Ty == V3S16) {
6690 if (IsTFE) {
6691 if (ResultRegs.size() == 1) {
6692 NewResultReg = ResultRegs[0];
6693 } else if (ResultRegs.size() == 2) {
6694 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
6695 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
6696 } else {
6697 return false;
6698 }
6699 }
6700
6701 if (MRI->getType(Reg: DstReg).getNumElements() <
6702 MRI->getType(Reg: NewResultReg).getNumElements()) {
6703 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
6704 } else {
6705 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
6706 }
6707 return true;
6708 }
6709
6710 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6711 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
6712 return true;
6713}
6714
6715bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6716 MachineInstr &MI) const {
6717 MachineIRBuilder &B = Helper.MIRBuilder;
6718 GISelChangeObserver &Observer = Helper.Observer;
6719
6720 Register OrigDst = MI.getOperand(i: 0).getReg();
6721 Register Dst;
6722 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
6723 unsigned Size = Ty.getSizeInBits();
6724 MachineFunction &MF = B.getMF();
6725 unsigned Opc = 0;
6726 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6727 assert(Size == 8 || Size == 16);
6728 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6729 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6730 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6731 // destination register.
6732 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
6733 } else {
6734 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6735 Dst = OrigDst;
6736 }
6737
6738 Observer.changingInstr(MI);
6739
6740 // Handle needing to s.buffer.load() a p8 value.
6741 if (hasBufferRsrcWorkaround(Ty)) {
6742 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
6743 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6744 }
6745 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
6746 Ty = getBitcastRegisterType(Ty);
6747 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6748 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6749 }
6750
6751 // FIXME: We don't really need this intermediate instruction. The intrinsic
6752 // should be fixed to have a memory operand. Since it's readnone, we're not
6753 // allowed to add one.
6754 MI.setDesc(B.getTII().get(Opcode: Opc));
6755 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
6756
6757 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6758 const unsigned MemSize = (Size + 7) / 8;
6759 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6760 Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
6761 MachineMemOperand *MMO = MF.getMachineMemOperand(
6762 PtrInfo: MachinePointerInfo(),
6763 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6764 MachineMemOperand::MOInvariant,
6765 Size: MemSize, BaseAlignment: MemAlign);
6766 MI.addMemOperand(MF, MO: MMO);
6767 if (Dst != OrigDst) {
6768 MI.getOperand(i: 0).setReg(Dst);
6769 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6770 B.buildTrunc(Res: OrigDst, Op: Dst);
6771 }
6772
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6775 // out this needs to be converted to a vector load during RegBankSelect.
6776 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6777 if (Ty.isVector())
6778 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
6779 else
6780 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
6781 }
6782
6783 Observer.changedInstr(MI);
6784 return true;
6785}
6786
6787// TODO: Move to selection
6788bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6789 MachineRegisterInfo &MRI,
6790 MachineIRBuilder &B) const {
6791 if (!ST.isTrapHandlerEnabled() ||
6792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6793 return legalizeTrapEndpgm(MI, MRI, B);
6794
6795 return ST.supportsGetDoorbellID() ?
6796 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6797}
6798
6799bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6801 const DebugLoc &DL = MI.getDebugLoc();
6802 MachineBasicBlock &BB = B.getMBB();
6803 MachineFunction *MF = BB.getParent();
6804
6805 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
6806 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6807 .addImm(Val: 0);
6808 MI.eraseFromParent();
6809 return true;
6810 }
6811
6812 // We need a block split to make the real endpgm a terminator. We also don't
6813 // want to break phis in successor blocks, so we can't just delete to the
6814 // end of the block.
6815 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6816 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817 MF->push_back(MBB: TrapBB);
6818 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6819 .addImm(Val: 0);
6820 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6821 .addMBB(MBB: TrapBB);
6822
6823 BB.addSuccessor(Succ: TrapBB);
6824 MI.eraseFromParent();
6825 return true;
6826}
6827
6828bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6829 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6830 MachineFunction &MF = B.getMF();
6831 const LLT S64 = LLT::scalar(SizeInBits: 64);
6832
6833 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6834 // For code object version 5, queue_ptr is passed through implicit kernarg.
6835 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
6836 AMDGPU::AMDHSA_COV5) {
6837 AMDGPUTargetLowering::ImplicitParameter Param =
6838 AMDGPUTargetLowering::QUEUE_PTR;
6839 uint64_t Offset =
6840 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
6841
6842 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6843 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6844
6845 if (!loadInputValue(DstReg: KernargPtrReg, B,
6846 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6847 return false;
6848
6849 // TODO: can we be smarter about machine pointer info?
6850 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6851 MachineMemOperand *MMO = MF.getMachineMemOperand(
6852 PtrInfo,
6853 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6854 MachineMemOperand::MOInvariant,
6855 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
6856
6857 // Pointer address
6858 Register LoadAddr = MRI.createGenericVirtualRegister(
6859 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6860 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
6861 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
6862 // Load address
6863 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
6864 B.buildCopy(Res: SGPR01, Op: Temp);
6865 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6866 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6867 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
6868 MI.eraseFromParent();
6869 return true;
6870 }
6871
6872 // Pass queue pointer to trap handler as input, and insert trap instruction
6873 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6874 Register LiveIn =
6875 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6876 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
6877 return false;
6878
6879 B.buildCopy(Res: SGPR01, Op: LiveIn);
6880 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6881 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6882 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
6883
6884 MI.eraseFromParent();
6885 return true;
6886}
6887
6888bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6889 MachineRegisterInfo &MRI,
6890 MachineIRBuilder &B) const {
6891 // We need to simulate the 's_trap 2' instruction on targets that run in
6892 // PRIV=1 (where it is treated as a nop).
6893 if (ST.hasPrivEnabledTrap2NopBug()) {
6894 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
6895 DL: MI.getDebugLoc());
6896 MI.eraseFromParent();
6897 return true;
6898 }
6899
6900 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6901 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6902 MI.eraseFromParent();
6903 return true;
6904}
6905
6906bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6907 MachineRegisterInfo &MRI,
6908 MachineIRBuilder &B) const {
6909 // If this is a non-HSA path or the trap handler is disabled, report a
6910 // warning accordingly.
6911 if (!ST.isTrapHandlerEnabled() ||
6912 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6913 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6914 "debugtrap handler not supported",
6915 MI.getDebugLoc(), DS_Warning);
6916 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6917 Ctx.diagnose(DI: NoTrap);
6918 } else {
6919 // Insert debug-trap instruction
6920 B.buildInstr(Opcode: AMDGPU::S_TRAP)
6921 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6922 }
6923
6924 MI.eraseFromParent();
6925 return true;
6926}
6927
6928bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6929 MachineIRBuilder &B) const {
6930 MachineRegisterInfo &MRI = *B.getMRI();
6931 const LLT S16 = LLT::scalar(SizeInBits: 16);
6932 const LLT S32 = LLT::scalar(SizeInBits: 32);
6933 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6934 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
6935
6936 Register DstReg = MI.getOperand(i: 0).getReg();
6937 Register NodePtr = MI.getOperand(i: 2).getReg();
6938 Register RayExtent = MI.getOperand(i: 3).getReg();
6939 Register RayOrigin = MI.getOperand(i: 4).getReg();
6940 Register RayDir = MI.getOperand(i: 5).getReg();
6941 Register RayInvDir = MI.getOperand(i: 6).getReg();
6942 Register TDescr = MI.getOperand(i: 7).getReg();
6943
6944 if (!ST.hasGFX10_AEncoding()) {
6945 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6946 "intrinsic not supported on subtarget",
6947 MI.getDebugLoc());
6948 B.getMF().getFunction().getContext().diagnose(DI: BadIntrin);
6949 return false;
6950 }
6951
6952 const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
6953 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
6954 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
6955 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
6956 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
6957 const unsigned NumVDataDwords = 4;
6958 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6959 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6960 const bool UseNSA =
6961 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6962
6963 const unsigned BaseOpcodes[2][2] = {
6964 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6965 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6966 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6967 int Opcode;
6968 if (UseNSA) {
6969 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
6970 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6971 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6972 : AMDGPU::MIMGEncGfx10NSA,
6973 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
6974 } else {
6975 assert(!IsGFX12Plus);
6976 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
6977 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6978 : AMDGPU::MIMGEncGfx10Default,
6979 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
6980 }
6981 assert(Opcode != -1);
6982
6983 SmallVector<Register, 12> Ops;
6984 if (UseNSA && IsGFX11Plus) {
6985 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6986 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
6987 auto Merged = B.buildMergeLikeInstr(
6988 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
6989 Ops.push_back(Elt: Merged.getReg(Idx: 0));
6990 };
6991
6992 Ops.push_back(Elt: NodePtr);
6993 Ops.push_back(Elt: RayExtent);
6994 packLanes(RayOrigin);
6995
6996 if (IsA16) {
6997 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
6998 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
6999 auto MergedDir = B.buildMergeLikeInstr(
7000 Res: V3S32,
7001 Ops: {B.buildBitcast(
7002 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
7003 UnmergeRayDir.getReg(Idx: 0)}))
7004 .getReg(Idx: 0),
7005 B.buildBitcast(
7006 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
7007 UnmergeRayDir.getReg(Idx: 1)}))
7008 .getReg(Idx: 0),
7009 B.buildBitcast(
7010 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
7011 UnmergeRayDir.getReg(Idx: 2)}))
7012 .getReg(Idx: 0)});
7013 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
7014 } else {
7015 packLanes(RayDir);
7016 packLanes(RayInvDir);
7017 }
7018 } else {
7019 if (Is64) {
7020 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
7021 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7022 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7023 } else {
7024 Ops.push_back(Elt: NodePtr);
7025 }
7026 Ops.push_back(Elt: RayExtent);
7027
7028 auto packLanes = [&Ops, &S32, &B](Register Src) {
7029 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7030 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7031 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7032 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
7033 };
7034
7035 packLanes(RayOrigin);
7036 if (IsA16) {
7037 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7038 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7039 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
7040 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
7041 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
7042 B.buildMergeLikeInstr(Res: R1,
7043 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
7044 B.buildMergeLikeInstr(
7045 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
7046 B.buildMergeLikeInstr(
7047 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
7048 Ops.push_back(Elt: R1);
7049 Ops.push_back(Elt: R2);
7050 Ops.push_back(Elt: R3);
7051 } else {
7052 packLanes(RayDir);
7053 packLanes(RayInvDir);
7054 }
7055 }
7056
7057 if (!UseNSA) {
7058 // Build a single vector containing all the operands so far prepared.
7059 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
7060 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
7061 Ops.clear();
7062 Ops.push_back(Elt: MergedOps);
7063 }
7064
7065 auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7066 .addDef(RegNo: DstReg)
7067 .addImm(Val: Opcode);
7068
7069 for (Register R : Ops) {
7070 MIB.addUse(RegNo: R);
7071 }
7072
7073 MIB.addUse(RegNo: TDescr)
7074 .addImm(Val: IsA16 ? 1 : 0)
7075 .cloneMemRefs(OtherMI: MI);
7076
7077 MI.eraseFromParent();
7078 return true;
7079}
7080
7081bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7082 MachineIRBuilder &B) const {
7083 unsigned Opc;
7084 int RoundMode = MI.getOperand(i: 2).getImm();
7085
7086 if (RoundMode == (int)RoundingMode::TowardPositive)
7087 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7088 else if (RoundMode == (int)RoundingMode::TowardNegative)
7089 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7090 else
7091 return false;
7092
7093 B.buildInstr(Opcode: Opc)
7094 .addDef(RegNo: MI.getOperand(i: 0).getReg())
7095 .addUse(RegNo: MI.getOperand(i: 1).getReg());
7096
7097 MI.eraseFromParent();
7098
7099 return true;
7100}
7101
7102bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7103 MachineIRBuilder &B) const {
7104 const SITargetLowering *TLI = ST.getTargetLowering();
7105 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7106 Register DstReg = MI.getOperand(i: 0).getReg();
7107 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7108 MI.eraseFromParent();
7109 return true;
7110}
7111
7112bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7113 MachineIRBuilder &B) const {
7114 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
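// That is (illustrative): waveID = (TTMP8 >> 25) & 0x1f, which the G_UBFX
// built below computes.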
7115 if (!ST.hasArchitectedSGPRs())
7116 return false;
7117 LLT S32 = LLT::scalar(SizeInBits: 32);
7118 Register DstReg = MI.getOperand(i: 0).getReg();
7119 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7120 auto LSB = B.buildConstant(Res: S32, Val: 25);
7121 auto Width = B.buildConstant(Res: S32, Val: 5);
7122 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7123 MI.eraseFromParent();
7124 return true;
7125}
7126
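// Hardware register bitfield descriptors (register ID, offset, width) for the
// slices of the MODE and TRAPSTS registers that the FP-environment lowerings
// below read and write.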
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);

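/// Lower an FP-environment read: fetch the MODE and TRAPSTS slices with
/// llvm.amdgcn.s.getreg and merge the two 32-bit values into the 64-bit
/// result register.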
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}

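/// Lower an FP-environment write: split the 64-bit value into two 32-bit
/// halves and write them to the MODE and TRAPSTS slices with
/// llvm.amdgcn.s.setreg.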
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvModeBitField))
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}

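/// Custom legalization entry point for target intrinsics. Control-flow
/// intrinsics are rewritten into SI control-flow pseudos, most others are
/// dispatched to dedicated helpers, image intrinsics are handled in the
/// default case at the bottom of the switch, and intrinsics with no special
/// handling are reported legal as-is.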
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // For the control-flow intrinsics, replace the G_BRCOND that uses the
  // intrinsic's result with the exec-manipulation and branch pseudos.
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
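    // llvm.amdgcn.if/else is always paired with a G_BRCOND on its boolean
    // result; fold the pair into a single SI_IF/SI_ELSE pseudo and retarget
    // the branches, swapping the destinations if the condition was negated.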
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
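    // llvm.amdgcn.loop is likewise paired with a G_BRCOND; lower the pair to
    // SI_LOOP and rewire the branch targets the same way as for if/else above.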
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
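    // The index operand (operand 5 here, operand 7 for the integer variants
    // below) must be at least 32 bits wide; any-extend narrower indices.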
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
    return legalizeLaneOp(Helper, MI, IntrID);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
