1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
30#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
32#include "llvm/CodeGen/GlobalISel/Utils.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/PseudoSourceValueManager.h"
35#include "llvm/CodeGen/TargetOpcodes.h"
36#include "llvm/IR/DiagnosticInfo.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
49static cl::opt<bool> EnableNewLegality(
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(Val: false),
54 cl::ReallyHidden);
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
59static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
62 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
69 return LLT::scalar(SizeInBits: Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
110 };
111}
112
113static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
144static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(x: TypeIdx, y: LLT::scalar(SizeInBits: MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
152static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: Ty.getElementType()));
170 };
171}
172
173static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(SizeInBits: 128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
178}
179
180static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
185}
186
187static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(SizeInBits: Size);
194 }
195
196 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
206static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Size: Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up the maximum register size, and
266// multiples of v2s16.
267static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Ty: Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
// Shorthand LLT constants used throughout the legalization rules below.

// Scalar integer types.
constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
constexpr LLT S8 = LLT::scalar(SizeInBits: 8);
constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
constexpr LLT F32 = LLT::float32();
constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
constexpr LLT F64 = LLT::float64();
constexpr LLT S96 = LLT::scalar(SizeInBits: 96);
constexpr LLT S128 = LLT::scalar(SizeInBits: 128);
constexpr LLT S160 = LLT::scalar(SizeInBits: 160);
constexpr LLT S192 = LLT::scalar(SizeInBits: 192);
constexpr LLT S224 = LLT::scalar(SizeInBits: 224);
constexpr LLT S256 = LLT::scalar(SizeInBits: 256);
constexpr LLT S512 = LLT::scalar(SizeInBits: 512);
constexpr LLT S1024 = LLT::scalar(SizeInBits: 1024);
// Widest scalar the legalizer will keep in registers.
constexpr LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);

// 16-bit-element vectors.
constexpr LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
constexpr LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
constexpr LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
constexpr LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
constexpr LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
constexpr LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
constexpr LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
constexpr LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);

constexpr LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::float16());
constexpr LLT V2BF16 = V2F16; // FIXME

// 32-bit-element vectors.
constexpr LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
constexpr LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
constexpr LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
constexpr LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
constexpr LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
constexpr LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
constexpr LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
constexpr LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
constexpr LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
constexpr LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
constexpr LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
constexpr LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
constexpr LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);

// 64-bit-element vectors.
constexpr LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
constexpr LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
constexpr LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
constexpr LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
constexpr LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
constexpr LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
constexpr LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
constexpr LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);

// 128-bit-element vectors.
constexpr LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
constexpr LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);

// Type lists consumed by isRegisterClassType and the legalization rules.
// Note S16 is intentionally absent here; it is special-cased for targets with
// true 16-bit instructions (see isRegisterClassType).
constexpr std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};

// NOTE(review): this list also contains V2S128/V4S128 despite the name --
// presumably they share register classes with the 16-bit tuples; confirm.
constexpr std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

constexpr std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

constexpr std::initializer_list<LLT> AllS64Vectors = {
    V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

// Union of the three vector lists above.
constexpr std::initializer_list<LLT> AllVectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
    V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
    V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
369
370// Checks whether a type is in the list of legal register types.
371static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
372 if (Ty.isPointerOrPointerVector())
373 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
374
375 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
376 is_contained(Set: AllScalarTypes, Element: Ty) ||
377 (ST.useRealTrue16Insts() && Ty == S16) ||
378 is_contained(Set: AllS16Vectors, Element: Ty);
379}
380
381static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
382 unsigned TypeIdx) {
383 return [&ST, TypeIdx](const LegalityQuery &Query) {
384 return isRegisterClassType(ST, Ty: Query.Types[TypeIdx]);
385 };
386}
387
388// If we have a truncating store or an extending load with a data size larger
389// than 32-bits, we need to reduce to a 32-bit type.
390static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
391 return [=](const LegalityQuery &Query) {
392 const LLT Ty = Query.Types[TypeIdx];
393 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
394 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
395 };
396}
397
398// If we have a truncating store or an extending load with a data size larger
399// than 32-bits and mem location is a power of 2
400static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
401 return [=](const LegalityQuery &Query) {
402 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
403 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
404 isPowerOf2_64(Value: MemSize);
405 };
406}
407
408// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
409// handle some operations by just promoting the register during
410// selection. There are also d16 loads on GFX9+ which preserve the high bits.
411static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
412 bool IsLoad, bool IsAtomic) {
413 switch (AS) {
414 case AMDGPUAS::PRIVATE_ADDRESS:
415 // FIXME: Private element size.
416 return ST.hasFlatScratchEnabled() ? 128 : 32;
417 case AMDGPUAS::LOCAL_ADDRESS:
418 return ST.useDS128() ? 128 : 64;
419 case AMDGPUAS::GLOBAL_ADDRESS:
420 case AMDGPUAS::CONSTANT_ADDRESS:
421 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
422 case AMDGPUAS::BUFFER_RESOURCE:
423 // Treat constant and global as identical. SMRD loads are sometimes usable for
424 // global loads (ideally constant address space should be eliminated)
425 // depending on the context. Legality cannot be context dependent, but
426 // RegBankSelect can split the load as necessary depending on the pointer
427 // register bank/uniformity and if the memory is invariant or not written in a
428 // kernel.
429 return IsLoad ? 512 : 128;
430 default:
431 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
432 // if they may alias scratch depending on the subtarget. This needs to be
433 // moved to custom handling to use addressMayBeAccessedAsPrivate
434 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
435 }
436}
437
// Decide whether the register/memory size combination of a load or store
// query is legal as-is. Returns false for anything that must be lowered,
// split, widened, or custom-handled.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  // Type index 1 is the pointer operand.
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space permits (atomicity matters
  // for the flat/default case).
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    IsAtomic: Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  // Only a fixed set of memory access sizes is handled.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // 96-bit accesses require dwordx3 load/store support.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates the
  // misalignment for this size/address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
                                                 Alignment: Align(AlignBits / 8)))
      return false;
  }

  return true;
}
505
506// The newer buffer intrinsic forms take their resource arguments as
507// pointers in address space 8, aka s128 values. However, in order to not break
508// SelectionDAG, the underlying operations have to continue to take v4i32
509// arguments. Therefore, we convert resource pointers - or vectors of them
510// to integer values here.
511static bool hasBufferRsrcWorkaround(const LLT Ty) {
512 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
513 return true;
514 if (Ty.isVector()) {
515 const LLT ElemTy = Ty.getElementType();
516 return hasBufferRsrcWorkaround(Ty: ElemTy);
517 }
518 return false;
519}
520
521// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
522// workaround this. Eventually it should ignore the type for loads and only care
523// about the size. Return true in cases where we will workaround this for now by
524// bitcasting.
525static bool loadStoreBitcastWorkaround(const LLT Ty) {
526 if (EnableNewLegality)
527 return false;
528
529 const unsigned Size = Ty.getSizeInBits();
530 if (Ty.isPointerVector())
531 return true;
532 if (Size <= 64)
533 return false;
534 // Address space 8 pointers get their own workaround.
535 if (hasBufferRsrcWorkaround(Ty))
536 return false;
537 if (!Ty.isVector())
538 return true;
539
540 unsigned EltSize = Ty.getScalarSizeInBits();
541 return EltSize != 32 && EltSize != 64;
542}
543
544static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
545 const LLT Ty = Query.Types[0];
546 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
547 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
548}
549
550/// Return true if a load or store of the type should be lowered with a bitcast
551/// to a different type.
552static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
553 const LLT MemTy) {
554 const unsigned MemSizeInBits = MemTy.getSizeInBits();
555 const unsigned Size = Ty.getSizeInBits();
556 if (Size != MemSizeInBits)
557 return Size <= 32 && Ty.isVector();
558
559 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
560 return true;
561
562 // Don't try to handle bitcasting vector ext loads for now.
563 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
564 (Size <= 32 || isRegisterSize(ST, Size)) &&
565 !isRegisterVectorElementType(EltTy: Ty.getElementType());
566}
567
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
///
/// \p MemoryTy    - type of the memory access being considered
/// \p AlignInBits - known alignment of the access, in bits
/// \p AddrSpace   - address space of the pointer operand
/// \p Opcode      - the load-like opcode being legalized
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(Value: SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): the opcode is passed where maxSizeForAddrSpace expects a
  // bool IsLoad — any nonzero opcode reads as "load". Confirm intent.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
             Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
         Fast;
}
604
605static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
606 unsigned Opcode) {
607 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
608 return false;
609
610 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
611 AlignInBits: Query.MMODescrs[0].AlignInBits,
612 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
613}
614
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
///
/// The conversion instructions are inserted immediately *after* MI, and MI's
/// operand is rewired to the new v4i32-typed register.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(i: Idx);

  const LLT PointerTy = MRI.getType(Reg: MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(Ty: PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(SizeInBits: 32);

    // New register that will carry the v4i32 form; MI is repointed at it
    // below, and the merge rebuilds the original p8 value from its lanes.
    Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
    std::array<Register, 4> VectorElems;
    // Insert the unpack/merge sequence after MI.
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
    // MO's original register becomes the merge result (the p8 value).
    B.buildMergeValues(Res: MO, Ops: VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector of p8: v4Ni32 -> vNs128 (bitcast) -> vN p8 (inttoptr).
  Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
  auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
  B.buildIntToPtr(Dst: MO, Src: Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
653
654/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
655/// the form in which the value must be in order to be passed to the low-level
656/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
657/// needed in order to account for the fact that we can't define a register
658/// class for s128 without breaking SelectionDAG.
659static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
660 MachineRegisterInfo &MRI = *B.getMRI();
661 const LLT PointerTy = MRI.getType(Reg: Pointer);
662 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
663 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
664
665 if (!PointerTy.isVector()) {
666 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
667 SmallVector<Register, 4> PointerParts;
668 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
669 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
670 for (unsigned I = 0; I < NumParts; ++I)
671 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
672 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
673 }
674 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
675 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
676}
677
678static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
679 unsigned Idx) {
680 MachineOperand &MO = MI.getOperand(i: Idx);
681
682 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
683 // Paranoidly prevent us from doing this multiple times.
684 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
685 return;
686 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
687}
688
689AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
690 const GCNTargetMachine &TM)
691 : ST(ST_) {
692 using namespace TargetOpcode;
693
694 auto GetAddrSpacePtr = [&TM](unsigned AS) {
695 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
696 };
697
698 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
699 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
700 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
701 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
702 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
703 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
704 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
705 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
706 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
707 const LLT BufferStridedPtr =
708 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
709
710 const LLT CodePtr = FlatPtr;
711
712 const std::initializer_list<LLT> AddrSpaces64 = {
713 GlobalPtr, ConstantPtr, FlatPtr
714 };
715
716 const std::initializer_list<LLT> AddrSpaces32 = {
717 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
718 };
719
720 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
721
722 const std::initializer_list<LLT> FPTypesBase = {
723 S32, S64
724 };
725
726 const std::initializer_list<LLT> FPTypes16 = {
727 S32, S64, S16
728 };
729
730 const std::initializer_list<LLT> FPTypesPK16 = {
731 S32, S64, S16, V2S16
732 };
733
734 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
735
736 // s1 for VCC branches, s32 for SCC branches.
737 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
738
739 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
740 // elements for v3s16
741 getActionDefinitionsBuilder(Opcode: G_PHI)
742 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
743 .legalFor(Types: AllS32Vectors)
744 .legalFor(Types: AllS64Vectors)
745 .legalFor(Types: AddrSpaces64)
746 .legalFor(Types: AddrSpaces32)
747 .legalFor(Types: AddrSpaces128)
748 .legalIf(Predicate: isPointer(TypeIdx: 0))
749 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
750 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
751 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
752 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
753 .scalarize(TypeIdx: 0);
754
755 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
756 // Full set of gfx9 features.
757 if (ST.hasScalarAddSub64()) {
758 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
759 .legalFor(Types: {S64, S32, S16, V2S16})
760 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
761 .scalarize(TypeIdx: 0)
762 .minScalar(TypeIdx: 0, Ty: S16)
763 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
764 .maxScalar(TypeIdx: 0, Ty: S32);
765 } else {
766 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
767 .legalFor(Types: {S32, S16, V2S16})
768 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
769 .scalarize(TypeIdx: 0)
770 .minScalar(TypeIdx: 0, Ty: S16)
771 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
772 .maxScalar(TypeIdx: 0, Ty: S32);
773 }
774
775 if (ST.hasScalarSMulU64()) {
776 getActionDefinitionsBuilder(Opcode: G_MUL)
777 .legalFor(Types: {S64, S32, S16, V2S16})
778 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
779 .scalarize(TypeIdx: 0)
780 .minScalar(TypeIdx: 0, Ty: S16)
781 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
782 .custom();
783 } else {
784 getActionDefinitionsBuilder(Opcode: G_MUL)
785 .legalFor(Types: {S32, S16, V2S16})
786 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
787 .scalarize(TypeIdx: 0)
788 .minScalar(TypeIdx: 0, Ty: S16)
789 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
790 .custom();
791 }
792 assert(ST.hasMad64_32());
793
794 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
795 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
796 .minScalarOrElt(TypeIdx: 0, Ty: S16)
797 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
798 .scalarize(TypeIdx: 0)
799 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
800 .lower();
801 } else if (ST.has16BitInsts()) {
802 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
803 .legalFor(Types: {S32, S16})
804 .minScalar(TypeIdx: 0, Ty: S16)
805 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
806 .maxScalar(TypeIdx: 0, Ty: S32)
807 .scalarize(TypeIdx: 0);
808
809 getActionDefinitionsBuilder(Opcode: G_MUL)
810 .legalFor(Types: {S32, S16})
811 .scalarize(TypeIdx: 0)
812 .minScalar(TypeIdx: 0, Ty: S16)
813 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
814 .custom();
815 assert(ST.hasMad64_32());
816
817 // Technically the saturating operations require clamp bit support, but this
818 // was introduced at the same time as 16-bit operations.
819 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
820 .legalFor(Types: {S32, S16}) // Clamp modifier
821 .minScalar(TypeIdx: 0, Ty: S16)
822 .scalarize(TypeIdx: 0)
823 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
824 .lower();
825
826 // We're just lowering this, but it helps get a better result to try to
827 // coerce to the desired type first.
828 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
829 .minScalar(TypeIdx: 0, Ty: S16)
830 .scalarize(TypeIdx: 0)
831 .lower();
832 } else {
833 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
834 .legalFor(Types: {S32})
835 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
836 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
837 .scalarize(TypeIdx: 0);
838
839 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
840 .legalFor(Types: {S32})
841 .scalarize(TypeIdx: 0)
842 .minScalar(TypeIdx: 0, Ty: S32)
843 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
844
845 if (ST.hasMad64_32())
846 Mul.custom();
847 else
848 Mul.maxScalar(TypeIdx: 0, Ty: S32);
849
850 if (ST.hasIntClamp()) {
851 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
852 .legalFor(Types: {S32}) // Clamp modifier.
853 .scalarize(TypeIdx: 0)
854 .minScalarOrElt(TypeIdx: 0, Ty: S32)
855 .lower();
856 } else {
857 // Clamp bit support was added in VI, along with 16-bit operations.
858 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
859 .minScalar(TypeIdx: 0, Ty: S32)
860 .scalarize(TypeIdx: 0)
861 .lower();
862 }
863
864 // FIXME: DAG expansion gets better results. The widening uses the smaller
865 // range values and goes for the min/max lowering directly.
866 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
867 .minScalar(TypeIdx: 0, Ty: S32)
868 .scalarize(TypeIdx: 0)
869 .lower();
870 }
871
// No hardware integer divide/remainder: all six division-family opcodes are
// custom-lowered for 32- and 64-bit scalars. Narrower types are first widened
// to the next power of 2 (at least 32), clamped into [S32, S64], and vectors
// are scalarized before the custom lowering runs.
872 getActionDefinitionsBuilder(
873 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
874 .customFor(Types: {S32, S64})
875 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
876 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
877 .scalarize(TypeIdx: 0);
878
// High-half multiply is only directly selectable at 32 bits; wider scalars
// are clamped down to S32 first.
879 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
880 .legalFor(Types: {S32})
881 .maxScalar(TypeIdx: 0, Ty: S32);
882
883 if (ST.hasVOP3PInsts()) {
// With packed (VOP3P) support, cap S8-element vectors at 2 elements and
// expand V2S8 via the generic lowering rather than scalarizing it.
884 Mulh
885 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
886 .lowerFor(Types: {V2S8});
887 }
888
// Everything not caught above is scalarized and then lowered generically.
889 Mulh
890 .scalarize(TypeIdx: 0)
891 .lower();
892
893 // Report legal for any types we can handle anywhere. For the cases only legal
894 // on the SALU, RegBankSelect will be able to re-legalize.
// Odd-element small vectors are padded by one element; vectors wider than 64
// bits whose elements are narrower than 64 bits are split into 64-bit-sized
// pieces (fewerEltsToSize64Vector). Remaining scalars are widened to a power
// of 2 and clamped to [S32, S64] before scalarizing leftovers.
895 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
896 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
897 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
898 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
899 .fewerElementsIf(
900 Predicate: all(P0: vectorWiderThan(TypeIdx: 0, Size: 64), P1: scalarOrEltNarrowerThan(TypeIdx: 0, Size: 64)),
901 Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
902 .widenScalarToNextPow2(TypeIdx: 0)
903 .scalarize(TypeIdx: 0);
904
905 getActionDefinitionsBuilder(
906 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
907 .legalFor(Types: {{S32, S1}, {S32, S32}})
908 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
909 .scalarize(TypeIdx: 0);
910
// Bitcasts are legal whenever both sides map onto a register class;
// anything else falls through to the generic lowering.
911 getActionDefinitionsBuilder(Opcode: G_BITCAST)
912 // Don't worry about the size constraint.
913 .legalIf(Predicate: all(P0: isRegisterClassType(ST, TypeIdx: 0), P1: isRegisterClassType(ST, TypeIdx: 1)))
914 .lower();
915
// Constants: the common scalar widths (including S1/S16) and the frequently
// used pointer types are listed explicitly; any other pointer type is also
// legal. Remaining scalars are clamped to [S32, S64] and widened to a power
// of 2.
916 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
917 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
918 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
919 .legalIf(Predicate: isPointer(TypeIdx: 0))
920 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
921 .widenScalarToNextPow2(TypeIdx: 0);
922
923 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
924 .legalFor(Types: {S32, S64, S16})
925 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
926
927 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
928 .legalIf(Predicate: isRegisterClassType(ST, TypeIdx: 0))
929 // s1 and s16 are special cases because they have legal operations on
930 // them, but don't really occupy registers in the normal way.
931 .legalFor(Types: {S1, S16})
932 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
933 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
934 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
935 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
936 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
937
938 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
939
940 // If the amount is divergent, we have to do a wave reduction to get the
941 // maximum value, so this is expanded during RegBankSelect.
942 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
943 .legalFor(Types: {{PrivatePtr, S32}});
944
945 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
946 .customFor(Types: {PrivatePtr});
947 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
948 .legalFor(Types: {PrivatePtr});
949
950 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
951
952 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
953 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
954
955 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
956
957 auto &FPOpActions = getActionDefinitionsBuilder(
958 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
959 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
960 .legalFor(Types: {S32, S64});
961 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
962 .customFor(Types: {S32, S64});
963 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
964 .customFor(Types: {S32, S64});
965
966 if (ST.has16BitInsts()) {
967 if (ST.hasVOP3PInsts())
968 FPOpActions.legalFor(Types: {S16, V2S16});
969 else
970 FPOpActions.legalFor(Types: {S16});
971
972 TrigActions.customFor(Types: {S16});
973 FDIVActions.customFor(Types: {S16});
974 }
975
976 if (ST.hasPackedFP32Ops()) {
977 FPOpActions.legalFor(Types: {V2S32});
978 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
979 }
980
981 auto &MinNumMaxNumIeee =
982 getActionDefinitionsBuilder(Opcodes: {G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
983
984 if (ST.hasVOP3PInsts()) {
985 MinNumMaxNumIeee.legalFor(Types: FPTypesPK16)
986 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
987 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
988 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
989 .scalarize(TypeIdx: 0);
990 } else if (ST.has16BitInsts()) {
991 MinNumMaxNumIeee.legalFor(Types: FPTypes16).clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64).scalarize(TypeIdx: 0);
992 } else {
993 MinNumMaxNumIeee.legalFor(Types: FPTypesBase)
994 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
995 .scalarize(TypeIdx: 0);
996 }
997
998 auto &MinNumMaxNum = getActionDefinitionsBuilder(
999 Opcodes: {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1000
1001 if (ST.hasVOP3PInsts()) {
1002 MinNumMaxNum.customFor(Types: FPTypesPK16)
1003 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1004 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1005 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1006 .scalarize(TypeIdx: 0);
1007 } else if (ST.has16BitInsts()) {
1008 MinNumMaxNum.customFor(Types: FPTypes16)
1009 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1010 .scalarize(TypeIdx: 0);
1011 } else {
1012 MinNumMaxNum.customFor(Types: FPTypesBase)
1013 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1014 .scalarize(TypeIdx: 0);
1015 }
1016
1017 if (ST.hasVOP3PInsts())
1018 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1019
1020 FPOpActions
1021 .scalarize(TypeIdx: 0)
1022 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1023
1024 TrigActions
1025 .scalarize(TypeIdx: 0)
1026 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1027
1028 FDIVActions
1029 .scalarize(TypeIdx: 0)
1030 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1031
1032 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
1033 .legalFor(Types: FPTypesPK16)
1034 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1035 .scalarize(TypeIdx: 0)
1036 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1037
1038 if (ST.has16BitInsts()) {
1039 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1040 .legalFor(Types: {S16})
1041 .customFor(Types: {S32, S64})
1042 .scalarize(TypeIdx: 0)
1043 .unsupported();
1044 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1045 .legalFor(Types: {S32, S64, S16})
1046 .scalarize(TypeIdx: 0)
1047 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1048
1049 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1050 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
1051 .scalarize(TypeIdx: 0)
1052 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
1053 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1054 .lower();
1055
1056 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1057 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1058 .scalarize(TypeIdx: 0)
1059 .lower();
1060
1061 getActionDefinitionsBuilder(Opcode: G_FMODF)
1062 .lowerFor(Types: {S16, S32, S64})
1063 .scalarize(TypeIdx: 0)
1064 .lower();
1065 } else {
1066 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1067 .customFor(Types: {S32, S64, S16})
1068 .scalarize(TypeIdx: 0)
1069 .unsupported();
1070
1071
1072 if (ST.hasFractBug()) {
1073 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1074 .customFor(Types: {S64})
1075 .legalFor(Types: {S32, S64})
1076 .scalarize(TypeIdx: 0)
1077 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1078 } else {
1079 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1080 .legalFor(Types: {S32, S64})
1081 .scalarize(TypeIdx: 0)
1082 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1083 }
1084
1085 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1086 .legalFor(Types: {{S32, S32}, {S64, S32}})
1087 .scalarize(TypeIdx: 0)
1088 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1089 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1090 .lower();
1091
1092 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1093 .customFor(Types: {{S32, S32}, {S64, S32}})
1094 .scalarize(TypeIdx: 0)
1095 .minScalar(TypeIdx: 0, Ty: S32)
1096 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1097 .lower();
1098
1099 getActionDefinitionsBuilder(Opcode: G_FMODF)
1100 .lowerFor(Types: {S32, S64})
1101 .scalarize(TypeIdx: 0)
1102 .lower();
1103 }
1104
1105 auto &FPTruncActions = getActionDefinitionsBuilder(Opcode: G_FPTRUNC);
1106 if (ST.hasCvtPkF16F32Inst()) {
1107 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1108 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1109 } else {
1110 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}});
1111 }
1112 FPTruncActions.scalarize(TypeIdx: 0).lower();
1113
1114 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1115 .legalFor(Types: {{S64, S32}, {S32, S16}})
1116 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1117 .scalarize(TypeIdx: 0);
1118
1119 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1120 if (ST.has16BitInsts()) {
1121 FSubActions
1122 // Use actual fsub instruction
1123 .legalFor(Types: {S32, S16})
1124 // Must use fadd + fneg
1125 .lowerFor(Types: {S64, V2S16});
1126 } else {
1127 FSubActions
1128 // Use actual fsub instruction
1129 .legalFor(Types: {S32})
1130 // Must use fadd + fneg
1131 .lowerFor(Types: {S64, S16, V2S16});
1132 }
1133
1134 FSubActions
1135 .scalarize(TypeIdx: 0)
1136 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1137
1138 // Whether this is legal depends on the floating point mode for the function.
1139 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1140 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1141 FMad.customFor(Types: {S32, S16});
1142 else if (ST.hasMadMacF32Insts())
1143 FMad.customFor(Types: {S32});
1144 else if (ST.hasMadF16())
1145 FMad.customFor(Types: {S16});
1146 FMad.scalarize(TypeIdx: 0)
1147 .lower();
1148
1149 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1150 if (ST.has16BitInsts()) {
1151 FRem.customFor(Types: {S16, S32, S64});
1152 } else {
1153 FRem.minScalar(TypeIdx: 0, Ty: S32)
1154 .customFor(Types: {S32, S64});
1155 }
1156 FRem.scalarize(TypeIdx: 0);
1157
1158 // TODO: Do we need to clamp maximum bitwidth?
1159 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1160 .legalIf(Predicate: isScalar(TypeIdx: 0))
1161 .legalFor(Types: {{V2S16, V2S32}})
1162 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1163 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1164 // situations (like an invalid implicit use), we don't want to infinite loop
1165 // in the legalizer.
1166 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1167 .alwaysLegal();
1168
1169 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1170 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1171 {S32, S1}, {S64, S1}, {S16, S1}})
1172 .scalarize(TypeIdx: 0)
1173 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1174 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1175
1176 // TODO: Split s1->s64 during regbankselect for VALU.
1177 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1178 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1179 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1180 .customFor(Types: {{S32, S64}, {S64, S64}});
1181 if (ST.has16BitInsts())
1182 IToFP.legalFor(Types: {{S16, S16}});
1183 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1184 .minScalar(TypeIdx: 0, Ty: S32)
1185 .scalarize(TypeIdx: 0)
1186 .widenScalarToNextPow2(TypeIdx: 1);
1187
1188 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1189 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1190 .customFor(Types: {{S64, S32}, {S64, S64}})
1191 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1192 if (ST.has16BitInsts())
1193 FPToI.legalFor(Types: {{S16, S16}});
1194 else
1195 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1196
1197 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1198 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1199 .scalarize(TypeIdx: 0)
1200 .lower();
1201
1202 // clang-format off
1203 auto &FPToISat = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI_SAT, G_FPTOUI_SAT})
1204 .legalFor(Types: {{S32, S32}, {S32, S64}})
1205 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1206 FPToISat.minScalar(TypeIdx: 1, Ty: S32);
1207 FPToISat.minScalar(TypeIdx: 0, Ty: S32)
1208 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1209 .scalarize(TypeIdx: 0)
1210 .lower();
1211 // clang-format on
1212
1213 getActionDefinitionsBuilder(Opcodes: {G_LROUND, G_LLROUND})
1214 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1215 .scalarize(TypeIdx: 0)
1216 .lower();
1217
1218 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1219 .legalFor(Types: {S16, S32})
1220 .scalarize(TypeIdx: 0)
1221 .lower();
1222
1223 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1224 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1225 .scalarize(TypeIdx: 0)
1226 .lower();
1227
1228 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1229 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1230 .scalarize(TypeIdx: 0)
1231 .lower();
1232
1233 if (ST.has16BitInsts()) {
1234 getActionDefinitionsBuilder(
1235 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1236 .legalFor(Types: {S16, S32, S64})
1237 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1238 .scalarize(TypeIdx: 0);
1239 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1240 getActionDefinitionsBuilder(
1241 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1242 .legalFor(Types: {S32, S64})
1243 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1244 .scalarize(TypeIdx: 0);
1245 } else {
1246 getActionDefinitionsBuilder(
1247 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1248 .legalFor(Types: {S32})
1249 .customFor(Types: {S64})
1250 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1251 .scalarize(TypeIdx: 0);
1252 }
1253
1254 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1255 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1256 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1257 .scalarize(TypeIdx: 0)
1258 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1259
1260 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1261 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1262 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1263 .scalarize(TypeIdx: 0);
1264
1265 auto &CmpBuilder =
1266 getActionDefinitionsBuilder(Opcode: G_ICMP)
1267 // The compare output type differs based on the register bank of the output,
1268 // so make both s1 and s32 legal.
1269 //
1270 // Scalar compares producing output in scc will be promoted to s32, as that
1271 // is the allocatable register type that will be needed for the copy from
1272 // scc. This will be promoted during RegBankSelect, and we assume something
1273 // before that won't try to use s32 result types.
1274 //
1275 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1276 // bank.
1277 .legalForCartesianProduct(
1278 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1279 .legalForCartesianProduct(
1280 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1281 if (ST.has16BitInsts()) {
1282 CmpBuilder.legalFor(Types: {{S1, S16}});
1283 }
1284
1285 CmpBuilder
1286 .widenScalarToNextPow2(TypeIdx: 1)
1287 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1288 .scalarize(TypeIdx: 0)
1289 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1290
1291 auto &FCmpBuilder =
1292 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1293 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1294
1295 if (ST.hasSALUFloatInsts())
1296 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1297
1298 FCmpBuilder
1299 .widenScalarToNextPow2(TypeIdx: 1)
1300 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1301 .scalarize(TypeIdx: 0);
1302
1303 // FIXME: fpow has a selection pattern that should move to custom lowering.
1304 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1305 if (ST.has16BitInsts())
1306 ExpOps.customFor(Types: {{S32}, {S16}});
1307 else
1308 ExpOps.customFor(Types: {S32});
1309 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1310 .scalarize(TypeIdx: 0);
1311
1312 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1313 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1314 .lower();
1315
1316 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1317 Log2Ops.customFor(Types: {S32});
1318 if (ST.has16BitInsts())
1319 Log2Ops.legalFor(Types: {S16});
1320 else
1321 Log2Ops.customFor(Types: {S16});
1322 Log2Ops.scalarize(TypeIdx: 0)
1323 .lower();
1324
1325 auto &LogOps =
1326 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1327 LogOps.customFor(Types: {S32, S16});
1328 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1329 .scalarize(TypeIdx: 0);
1330
1331 // The 64-bit versions produce 32-bit results, but only on the SALU.
// Result (type index 0) is pinned to S32; the source (type index 1) is
// widened to at least 32 bits and clamped to [S32, S64].
1332 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1333 .legalFor(Types: {{S32, S32}, {S32, S64}})
1334 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1335 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1336 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1337 .scalarize(TypeIdx: 0)
1338 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1339
1340 // If no 16 bit instr is available, lower into different instructions.
1341 if (ST.has16BitInsts())
1342 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1343 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1344 .widenScalarToNextPow2(TypeIdx: 1)
1345 .scalarize(TypeIdx: 0)
1346 .lower();
1347 else
1348 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1349 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1350 .lowerFor(Types: {S1, S16})
1351 .widenScalarToNextPow2(TypeIdx: 1)
1352 .scalarize(TypeIdx: 0)
1353 .lower();
1354
1355 // The hardware instructions return a different result on 0 than the generic
1356 // instructions expect. The hardware produces -1, but these produce the
1357 // bitwidth.
// Both operands are normalized (result pinned to S32, source clamped to
// [S32, S64], everything widened to a power of 2) before the custom lowering,
// which has to account for the differing zero semantics — presumably by
// mapping onto the *_ZERO_UNDEF forms plus a zero check (see the custom
// legalization for these opcodes elsewhere in this file).
1358 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1359 .scalarize(TypeIdx: 0)
1360 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1361 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1362 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1363 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1364 .custom();
1365
1366 // The 64-bit versions produce 32-bit results, but only on the SALU.
1367 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1368 .legalFor(Types: {{S32, S32}, {S32, S64}})
1369 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1370 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1371 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1372 .scalarize(TypeIdx: 0)
1373 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1374 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1375
1376 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1377 .legalFor(Types: {{S32, S32}, {S32, S64}})
1378 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1379 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1380 .scalarize(TypeIdx: 0)
1381 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1382 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1383
1384 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1385 // RegBankSelect.
1386 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1387 .legalFor(Types: {S32, S64})
1388 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1389 .scalarize(TypeIdx: 0)
1390 .widenScalarToNextPow2(TypeIdx: 0);
1391
1392 if (ST.has16BitInsts()) {
1393 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1394 .legalFor(Types: {S16, S32, V2S16})
1395 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1396 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1397 // narrowScalar limitation.
1398 .widenScalarToNextPow2(TypeIdx: 0)
1399 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1400 .scalarize(TypeIdx: 0);
1401
1402 if (ST.hasVOP3PInsts()) {
1403 getActionDefinitionsBuilder(Opcode: G_ABS)
1404 .legalFor(Types: {S32, S16, V2S16})
1405 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1406 .minScalar(TypeIdx: 0, Ty: S16)
1407 .widenScalarToNextPow2(TypeIdx: 0)
1408 .scalarize(TypeIdx: 0)
1409 .lower();
1410 if (ST.hasIntMinMax64()) {
1411 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1412 .legalFor(Types: {S32, S16, S64, V2S16})
1413 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1414 .minScalar(TypeIdx: 0, Ty: S16)
1415 .widenScalarToNextPow2(TypeIdx: 0)
1416 .scalarize(TypeIdx: 0)
1417 .lower();
1418 } else {
1419 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1420 .legalFor(Types: {S32, S16, V2S16})
1421 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1422 .minScalar(TypeIdx: 0, Ty: S16)
1423 .widenScalarToNextPow2(TypeIdx: 0)
1424 .scalarize(TypeIdx: 0)
1425 .lower();
1426 }
1427 } else {
1428 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1429 .legalFor(Types: {S32, S16})
1430 .widenScalarToNextPow2(TypeIdx: 0)
1431 .minScalar(TypeIdx: 0, Ty: S16)
1432 .scalarize(TypeIdx: 0)
1433 .lower();
1434 }
1435 } else {
1436 // TODO: Should have same legality without v_perm_b32
1437 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1438 .legalFor(Types: {S32})
1439 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1440 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1441 // narrowScalar limitation.
1442 .widenScalarToNextPow2(TypeIdx: 0)
1443 .maxScalar(TypeIdx: 0, Ty: S32)
1444 .scalarize(TypeIdx: 0)
1445 .lower();
1446
1447 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1448 .legalFor(Types: {S32})
1449 .minScalar(TypeIdx: 0, Ty: S32)
1450 .widenScalarToNextPow2(TypeIdx: 0)
1451 .scalarize(TypeIdx: 0)
1452 .lower();
1453 }
1454
1455 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1456 // List the common cases
1457 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1458 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1459 .scalarize(TypeIdx: 0)
1460 // Accept any address space as long as the size matches
1461 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
// Otherwise resize the integer source (type index 1) to the pointer's width:
// widen when it is narrower than the pointer, narrow when it is wider.
1462 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1463 Mutation: [](const LegalityQuery &Query) {
1464 return std::pair(
1465 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1466 })
1467 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1468 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1469 });
1470
// Mirror of G_INTTOPTR above, but here it is the integer *result*
// (type index 0) that gets resized to match the pointer's width.
1471 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1472 // List the common cases
1473 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1474 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1475 .scalarize(TypeIdx: 0)
1476 // Accept any address space as long as the size matches
1477 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1478 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1479 Mutation: [](const LegalityQuery &Query) {
1480 return std::pair(
1481 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1482 })
1483 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1484 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1485 });
1486
1487 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1488 .scalarize(TypeIdx: 0)
1489 .custom();
1490
// Predicate used by the load/store rules below: returns true when the memory
// access must be broken into smaller pieces. That is the case for
//   - vector extloads (register type wider than the memory type),
//   - accesses wider than the address space's maximum (atomics included), and
//   - "weird" sizes that don't map onto the hardware access widths: a 3-dword
//     access without dwordx3 support, or a non-power-of-2 register count.
1491 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1492 bool IsLoad) -> bool {
1493 const LLT DstTy = Query.Types[0];
1494
1495 // Split vector extloads.
1496 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1497
1498 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1499 return true;
1500
1501 const LLT PtrTy = Query.Types[1];
1502 unsigned AS = PtrTy.getAddressSpace();
1503 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1504 IsAtomic: Query.MMODescrs[0].Ordering !=
1505 AtomicOrdering::NotAtomic))
1506 return true;
1507
1508 // Catch weird sized loads that don't evenly divide into the access sizes
1509 // TODO: May be able to widen depending on alignment etc.
// Round the memory size up to whole 32-bit registers.
1510 unsigned NumRegs = (MemSize + 31) / 32;
1511 if (NumRegs == 3) {
1512 if (!ST.hasDwordx3LoadStores())
1513 return true;
1514 } else {
1515 // If the alignment allows, these should have been widened.
1516 if (!isPowerOf2_32(Value: NumRegs))
1517 return true;
1518 }
1519
1520 return false;
1521 };
1522
// Minimum alignment (in bits) required for global/constant accesses of each
// width; 0 means no alignment requirement when the subtarget allows
// unaligned buffer access.
1523 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1524 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1525 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1526
1527 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1528 // LDS
1529 // TODO: Unsupported flat for SI.
1530
1531 for (unsigned Op : {G_LOAD, G_STORE}) {
1532 const bool IsStore = Op == G_STORE;
1533
1534 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1535 // Explicitly list some common cases.
1536 // TODO: Does this help compile time at all?
1537 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1538 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1539 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1540 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1541 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1542 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1543 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1544 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1545
1546 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1547 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1548 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1549 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1550 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1551 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1552
1553 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1554 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1555 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1556 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1557
1558 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1559 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1560 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1561 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1562 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1563 Actions.legalIf(
1564 Predicate: [=](const LegalityQuery &Query) -> bool {
1565 return isLoadStoreLegal(ST, Query);
1566 });
1567
1568 // The custom pointers (fat pointers, buffer resources) don't work with load
1569 // and store at this level. Fat pointers should have been lowered to
1570 // intrinsics before the translation to MIR.
1571 Actions.unsupportedIf(
1572 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1573
1574 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1575 // ptrtoint. This is needed to account for the fact that we can't have i128
1576 // as a register class for SelectionDAG reasons.
1577 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1578 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1579 });
1580
1581 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1582 // 64-bits.
1583 //
1584 // TODO: Should generalize bitcast action into coerce, which will also cover
1585 // inserting addrspacecasts.
1586 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1587
1588 // Turn any illegal element vectors into something easier to deal
1589 // with. These will ultimately produce 32-bit scalar shifts to extract the
1590 // parts anyway.
1591 //
1592 // For odd 16-bit element vectors, prefer to split those into pieces with
1593 // 16-bit vector parts.
1594 Actions.bitcastIf(
1595 Predicate: [=](const LegalityQuery &Query) -> bool {
1596 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1597 MemTy: Query.MMODescrs[0].MemoryTy);
1598 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1599
1600 if (!IsStore) {
1601 // Widen suitably aligned loads by loading extra bytes. The standard
1602 // legalization actions can't properly express widening memory operands.
1603 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1604 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1605 });
1606 }
1607
1608 // FIXME: load/store narrowing should be moved to lower action
1609 Actions
1610 .narrowScalarIf(
1611 Predicate: [=](const LegalityQuery &Query) -> bool {
1612 return !Query.Types[0].isVector() &&
1613 needToSplitMemOp(Query, Op == G_LOAD);
1614 },
1615 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1616 const LLT DstTy = Query.Types[0];
1617 const LLT PtrTy = Query.Types[1];
1618
1619 const unsigned DstSize = DstTy.getSizeInBits();
1620 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1621
1622 // Split extloads.
1623 if (DstSize > MemSize)
1624 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1625
1626 unsigned MaxSize = maxSizeForAddrSpace(
1627 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1628 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1629 if (MemSize > MaxSize)
1630 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1631
1632 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1633 return std::pair(0, LLT::scalar(SizeInBits: Align));
1634 })
1635 .fewerElementsIf(
1636 Predicate: [=](const LegalityQuery &Query) -> bool {
1637 return Query.Types[0].isVector() &&
1638 needToSplitMemOp(Query, Op == G_LOAD);
1639 },
1640 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1641 const LLT DstTy = Query.Types[0];
1642 const LLT PtrTy = Query.Types[1];
1643
1644 LLT EltTy = DstTy.getElementType();
1645 unsigned MaxSize = maxSizeForAddrSpace(
1646 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1647 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1648
1649 // FIXME: Handle widened to power of 2 results better. This ends
1650 // up scalarizing.
1651 // FIXME: 3 element stores scalarized on SI
1652
1653 // Split if it's too large for the address space.
1654 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1655 if (MemSize > MaxSize) {
1656 unsigned NumElts = DstTy.getNumElements();
1657 unsigned EltSize = EltTy.getSizeInBits();
1658
1659 if (MaxSize % EltSize == 0) {
1660 return std::pair(
1661 0, LLT::scalarOrVector(
1662 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1663 }
1664
1665 unsigned NumPieces = MemSize / MaxSize;
1666
1667 // FIXME: Refine when odd breakdowns handled
1668 // The scalars will need to be re-legalized.
1669 if (NumPieces == 1 || NumPieces >= NumElts ||
1670 NumElts % NumPieces != 0)
1671 return std::pair(0, EltTy);
1672
1673 return std::pair(0,
1674 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1675 }
1676
1677 // FIXME: We could probably handle weird extending loads better.
1678 if (DstTy.getSizeInBits() > MemSize)
1679 return std::pair(0, EltTy);
1680
1681 unsigned EltSize = EltTy.getSizeInBits();
1682 unsigned DstSize = DstTy.getSizeInBits();
1683 if (!isPowerOf2_32(Value: DstSize)) {
1684 // We're probably decomposing an odd sized store. Try to split
1685 // to the widest type. TODO: Account for alignment. As-is it
1686 // should be OK, since the new parts will be further legalized.
1687 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1688 return std::pair(
1689 0, LLT::scalarOrVector(
1690 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1691 }
1692
1693 // May need relegalization for the scalars.
1694 return std::pair(0, EltTy);
1695 })
1696 .minScalar(TypeIdx: 0, Ty: S32)
1697 .narrowScalarIf(Predicate: isTruncStoreToSizePowerOf2(TypeIdx: 0),
1698 Mutation: getScalarTypeFromMemDesc(TypeIdx: 0))
1699 .widenScalarToNextPow2(TypeIdx: 0)
1700 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1701 .lower();
1702 }
1703
1704 // FIXME: Unaligned accesses not lowered.
1705 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1706 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1707 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1708 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1709 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1710 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1711 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1712 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1713 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1714 .legalIf(
1715 Predicate: [=](const LegalityQuery &Query) -> bool {
1716 return isLoadStoreLegal(ST, Query);
1717 });
1718
1719 if (ST.hasFlatAddressSpace()) {
1720 ExtLoads.legalForTypesWithMemDesc(
1721 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1722 }
1723
1724 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1725 // 64-bits.
1726 //
1727 // TODO: Should generalize bitcast action into coerce, which will also cover
1728 // inserting addrspacecasts.
1729 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1730
1731 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1732 .widenScalarToNextPow2(TypeIdx: 0)
1733 .lower();
1734
1735 auto &Atomics = getActionDefinitionsBuilder(
1736 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1737 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1738 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1739 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1740 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1741 {S64, GlobalPtr}, {S64, LocalPtr},
1742 {S32, RegionPtr}, {S64, RegionPtr}});
1743 if (ST.hasFlatAddressSpace()) {
1744 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1745 }
1746
1747 auto &Atomics32 =
1748 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1749 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1750 if (ST.hasFlatAddressSpace()) {
1751 Atomics32.legalFor(Types: {{S32, FlatPtr}});
1752 }
1753
1754 // TODO: v2bf16 operations, and fat buffer pointer support.
1755 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1756 if (ST.hasLDSFPAtomicAddF32()) {
1757 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1758 if (ST.hasLdsAtomicAddF64())
1759 Atomic.legalFor(Types: {{S64, LocalPtr}});
1760 if (ST.hasAtomicDsPkAdd16Insts())
1761 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1762 }
1763 if (ST.hasAtomicFaddInsts())
1764 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1765 if (ST.hasFlatAtomicFaddF32Inst())
1766 Atomic.legalFor(Types: {{S32, FlatPtr}});
1767
1768 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1769 // These are legal with some caveats, and should have undergone expansion in
1770 // the IR in most situations
1771 // TODO: Move atomic expansion into legalizer
1772 Atomic.legalFor(Types: {
1773 {S32, GlobalPtr},
1774 {S64, GlobalPtr},
1775 {S64, FlatPtr}
1776 });
1777 }
1778
1779 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1780 ST.hasAtomicBufferGlobalPkAddF16Insts())
1781 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1782 if (ST.hasAtomicGlobalPkAddBF16Inst())
1783 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1784 if (ST.hasAtomicFlatPkAdd16Insts())
1785 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1786
1787
1788 // Most of the legalization work here is done by AtomicExpand. We could
1789 // probably use a simpler legality rule that just assumes anything is OK.
1790 auto &AtomicFMinFMax =
1791 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1792 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1793
1794 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1795 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1796 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1797 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1798 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1799 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1800 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1801 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1802
1803 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1804 // demarshalling
1805 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1806 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1807 {S32, FlatPtr}, {S64, FlatPtr}})
1808 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1809 {S32, RegionPtr}, {S64, RegionPtr}});
1810 // TODO: Pointer types, any 32-bit or 64-bit vector
1811
1812 // Condition should be s32 for scalar, s1 for vector.
1813 getActionDefinitionsBuilder(Opcode: G_SELECT)
1814 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1815 LocalPtr, FlatPtr, PrivatePtr,
1816 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1817 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1818 Types1: {S1, S32})
1819 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1820 .scalarize(TypeIdx: 1)
1821 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1822 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1823 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1824 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1825 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1826 .scalarize(TypeIdx: 0)
1827 .widenScalarToNextPow2(TypeIdx: 0)
1828 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1829
1830 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1831 // be more flexible with the shift amount type.
1832 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1833 .legalFor(Types: {{S32, S32}, {S64, S32}});
1834 if (ST.has16BitInsts()) {
1835 if (ST.hasVOP3PInsts()) {
1836 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1837 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1838 } else
1839 Shifts.legalFor(Types: {{S16, S16}});
1840
1841 // TODO: Support 16-bit shift amounts for all types
1842 Shifts.widenScalarIf(
1843 Predicate: [=](const LegalityQuery &Query) {
1844 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1845 // 32-bit amount.
1846 const LLT ValTy = Query.Types[0];
1847 const LLT AmountTy = Query.Types[1];
1848 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1849 AmountTy.getSizeInBits() < 16;
1850 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1851 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1852 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1853 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1854 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1855
1856 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1857 .minScalar(TypeIdx: 0, Ty: S16)
1858 .scalarize(TypeIdx: 0)
1859 .lower();
1860 } else {
1861 // Make sure we legalize the shift amount type first, as the general
1862 // expansion for the shifted type will produce much worse code if it hasn't
1863 // been truncated already.
1864 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1865 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1866 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1867
1868 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1869 .minScalar(TypeIdx: 0, Ty: S32)
1870 .scalarize(TypeIdx: 0)
1871 .lower();
1872 }
1873 Shifts.scalarize(TypeIdx: 0);
1874
1875 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1876 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1877 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1878 unsigned IdxTypeIdx = 2;
1879
1880 getActionDefinitionsBuilder(Opcode: Op)
1881 .customIf(Predicate: [=](const LegalityQuery &Query) {
1882 const LLT EltTy = Query.Types[EltTypeIdx];
1883 const LLT VecTy = Query.Types[VecTypeIdx];
1884 const LLT IdxTy = Query.Types[IdxTypeIdx];
1885 const unsigned EltSize = EltTy.getSizeInBits();
1886 const bool isLegalVecType =
1887 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1888 // Address space 8 pointers are 128-bit wide values, but the logic
1889 // below will try to bitcast them to 2N x s64, which will fail.
1890 // Therefore, as an intermediate step, wrap extracts/insertions from a
1891 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1892 // extraction result) in order to produce a vector operation that can
1893 // be handled by the logic below.
1894 if (EltTy.isPointer() && EltSize > 64)
1895 return true;
1896 return (EltSize == 32 || EltSize == 64) &&
1897 VecTy.getSizeInBits() % 32 == 0 &&
1898 VecTy.getSizeInBits() <= MaxRegisterSize &&
1899 IdxTy.getSizeInBits() == 32 &&
1900 isLegalVecType;
1901 })
1902 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1903 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1904 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1905 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1906 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1907 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1908 Mutation: [=](const LegalityQuery &Query) {
1909 // For > 64-bit element types, try to turn this into a
1910 // 64-bit element vector since we may be able to do better
1911 // indexing if this is scalar. If not, fall back to 32.
1912 const LLT EltTy = Query.Types[EltTypeIdx];
1913 const LLT VecTy = Query.Types[VecTypeIdx];
1914 const unsigned DstEltSize = EltTy.getSizeInBits();
1915 const unsigned VecSize = VecTy.getSizeInBits();
1916
1917 const unsigned TargetEltSize =
1918 DstEltSize % 64 == 0 ? 64 : 32;
1919 return std::pair(VecTypeIdx,
1920 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
1921 ScalarSizeInBits: TargetEltSize));
1922 })
1923 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1924 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1925 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1926 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1927 // TODO: Clamp elements for 64-bit vectors?
1928 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
1929 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1930 // It should only be necessary with variable indexes.
1931 // As a last resort, lower to the stack
1932 .lower();
1933 }
1934
1935 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1936 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1937 const LLT &EltTy = Query.Types[1].getElementType();
1938 return Query.Types[0] != EltTy;
1939 });
1940
1941 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1942 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1943 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1944
1945 // FIXME: Doesn't handle extract of illegal sizes.
1946 getActionDefinitionsBuilder(Opcode: Op)
1947 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1948 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1949 // Sub-vector(or single element) insert and extract.
1950 // TODO: verify immediate offset here since lower only works with
1951 // whole elements.
1952 const LLT BigTy = Query.Types[BigTyIdx];
1953 return BigTy.isVector();
1954 })
1955 // FIXME: Multiples of 16 should not be legal.
1956 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1957 const LLT BigTy = Query.Types[BigTyIdx];
1958 const LLT LitTy = Query.Types[LitTyIdx];
1959 return (BigTy.getSizeInBits() % 32 == 0) &&
1960 (LitTy.getSizeInBits() % 16 == 0);
1961 })
1962 .widenScalarIf(
1963 Predicate: [=](const LegalityQuery &Query) {
1964 const LLT BigTy = Query.Types[BigTyIdx];
1965 return (BigTy.getScalarSizeInBits() < 16);
1966 },
1967 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1968 .widenScalarIf(
1969 Predicate: [=](const LegalityQuery &Query) {
1970 const LLT LitTy = Query.Types[LitTyIdx];
1971 return (LitTy.getScalarSizeInBits() < 16);
1972 },
1973 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1974 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1975 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1976
1977 }
1978
1979 auto &BuildVector =
1980 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1981 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1982 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1983 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1984 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1985 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1986 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
1987 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1988
1989 if (ST.hasScalarPackInsts()) {
1990 BuildVector
1991 // FIXME: Should probably widen s1 vectors straight to s32
1992 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1993 .minScalar(TypeIdx: 1, Ty: S16);
1994
1995 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1996 .legalFor(Types: {V2S16, S32})
1997 .lower();
1998 } else {
1999 BuildVector.customFor(Types: {V2S16, S16});
2000 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
2001
2002 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
2003 .customFor(Types: {V2S16, S32})
2004 .lower();
2005 }
2006
2007 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
2008
2009 // FIXME: Clamp maximum size
2010 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
2011 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2012 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
2013 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
2014 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
2015
2016 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
2017
2018 // Merge/Unmerge
2019 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2020 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2021 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2022
2023 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2024 const LLT Ty = Query.Types[TypeIdx];
2025 if (Ty.isVector()) {
2026 const LLT &EltTy = Ty.getElementType();
2027 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2028 return true;
2029 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
2030 return true;
2031 }
2032 return false;
2033 };
2034
2035 auto &Builder =
2036 getActionDefinitionsBuilder(Opcode: Op)
2037 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2038 .lowerFor(Types: {{S16, V2S16}})
2039 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
2040 const LLT BigTy = Query.Types[BigTyIdx];
2041 return BigTy.getSizeInBits() == 32;
2042 })
2043 // Try to widen to s16 first for small types.
2044 // TODO: Only do this on targets with legal s16 shifts
2045 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
2046 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
2047 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
2048 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
2049 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
2050 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
2051 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
2052 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2053 // not worth considering the multiples of 64 since 2*192 and 2*384
2054 // are not valid.
2055 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
2056 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
2057 // Break up vectors with weird elements into scalars
2058 .fewerElementsIf(
2059 Predicate: [=](const LegalityQuery &Query) {
2060 return notValidElt(Query, LitTyIdx);
2061 },
2062 Mutation: scalarize(TypeIdx: 0))
2063 .fewerElementsIf(
2064 Predicate: [=](const LegalityQuery &Query) {
2065 return notValidElt(Query, BigTyIdx);
2066 },
2067 Mutation: scalarize(TypeIdx: 1))
2068 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
2069
2070 if (Op == G_MERGE_VALUES) {
2071 Builder.widenScalarIf(
2072 // TODO: Use 16-bit shifts if legal for 8-bit values?
2073 Predicate: [=](const LegalityQuery &Query) {
2074 const LLT Ty = Query.Types[LitTyIdx];
2075 return Ty.getSizeInBits() < 32;
2076 },
2077 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
2078 }
2079
2080 Builder.widenScalarIf(
2081 Predicate: [=](const LegalityQuery &Query) {
2082 const LLT Ty = Query.Types[BigTyIdx];
2083 return Ty.getSizeInBits() % 16 != 0;
2084 },
2085 Mutation: [=](const LegalityQuery &Query) {
2086 // Pick the next power of 2, or a multiple of 64 over 128.
2087 // Whichever is smaller.
2088 const LLT &Ty = Query.Types[BigTyIdx];
2089 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2090 if (NewSizeInBits >= 256) {
2091 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2092 if (RoundedTo < NewSizeInBits)
2093 NewSizeInBits = RoundedTo;
2094 }
2095 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2096 })
2097 // Any vectors left are the wrong size. Scalarize them.
2098 .scalarize(TypeIdx: 0)
2099 .scalarize(TypeIdx: 1);
2100 }
2101
2102 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2103 // RegBankSelect.
2104 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2105 .legalFor(Types: {{S32}, {S64}})
2106 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2107
2108 if (ST.hasVOP3PInsts()) {
2109 SextInReg.lowerFor(Types: {{V2S16}})
2110 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2111 // get more vector shift opportunities, since we'll get those when
2112 // expanded.
2113 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2114 } else if (ST.has16BitInsts()) {
2115 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2116 } else {
2117 // Prefer to promote to s32 before lowering if we don't have 16-bit
2118 // shifts. This avoid a lot of intermediate truncate and extend operations.
2119 SextInReg.lowerFor(Types: {{S32}, {S64}});
2120 }
2121
2122 SextInReg
2123 .scalarize(TypeIdx: 0)
2124 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2125 .lower();
2126
2127 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2128 .scalarize(TypeIdx: 0)
2129 .lower();
2130
2131 auto &FSHRActionDefs = getActionDefinitionsBuilder(Opcode: G_FSHR);
2132 FSHRActionDefs.legalFor(Types: {{S32, S32}})
2133 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2134 if (ST.hasVOP3PInsts())
2135 FSHRActionDefs.lowerFor(Types: {{V2S16, V2S16}});
2136 FSHRActionDefs.scalarize(TypeIdx: 0).lower();
2137
2138 if (ST.hasVOP3PInsts()) {
2139 getActionDefinitionsBuilder(Opcode: G_FSHL)
2140 .lowerFor(Types: {{V2S16, V2S16}})
2141 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2142 .scalarize(TypeIdx: 0)
2143 .lower();
2144 } else {
2145 getActionDefinitionsBuilder(Opcode: G_FSHL)
2146 .scalarize(TypeIdx: 0)
2147 .lower();
2148 }
2149
2150 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2151 .legalFor(Types: {S64});
2152
2153 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2154
2155 getActionDefinitionsBuilder(Opcode: G_FENCE)
2156 .alwaysLegal();
2157
2158 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2159 .scalarize(TypeIdx: 0)
2160 .minScalar(TypeIdx: 0, Ty: S32)
2161 .lower();
2162
2163 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2164 .legalFor(Types: {{S32, S32}, {S64, S32}})
2165 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2166 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2167 .widenScalarToNextPow2(TypeIdx: 0)
2168 .scalarize(TypeIdx: 0);
2169
2170 getActionDefinitionsBuilder(
2171 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2172 G_FCOPYSIGN,
2173
2174 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2175 G_READ_REGISTER, G_WRITE_REGISTER,
2176
2177 G_SADDO, G_SSUBO})
2178 .lower();
2179
2180 if (ST.hasIEEEMinimumMaximumInsts()) {
2181 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2182 .legalFor(Types: FPTypesPK16)
2183 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2184 .scalarize(TypeIdx: 0);
2185 } else if (ST.hasVOP3PInsts()) {
2186 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2187 .lowerFor(Types: {V2S16})
2188 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2189 .scalarize(TypeIdx: 0)
2190 .lower();
2191 } else {
2192 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2193 .scalarize(TypeIdx: 0)
2194 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2195 .lower();
2196 }
2197
2198 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2199 .lower();
2200
2201 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2202
2203 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2204 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2205 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2206 .unsupported();
2207
2208 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2209
2210 getActionDefinitionsBuilder(
2211 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2212 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2213 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2214 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2215 .legalFor(Types: AllVectors)
2216 .scalarize(TypeIdx: 1)
2217 .lower();
2218
2219 getLegacyLegalizerInfo().computeTables();
2220 verify(MII: *ST.getInstrInfo());
2221}
2222
// Dispatch point for every instruction the action tables marked as "custom".
// Routes the generic opcode to its AMDGPU-specific legalization routine.
// Each routine returns true once it has rewritten (or accepted) the
// instruction; returning false tells the LegalizerHelper that custom
// legalization failed for this instruction.
// Note: LocObserver is part of the TargetLowering hook signature but is not
// used by any of the AMDGPU custom legalizations below.
bool AMDGPULegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  // All replacement code is emitted through the helper's MIRBuilder so the
  // legalizer's change observers see every newly created instruction.
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    return legalizeFroundeven(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed and unsigned int<->fp conversions share one routine per
  // direction, distinguished by the Signed flag.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, Signed: true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, Signed: false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, Signed: true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, Signed: false);
  // All four min/max num variants funnel into a single routine.
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  // Plain and extending loads share the load path; stores are separate.
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  // Division and remainder (and the fused divrem) share one expansion per
  // signedness.
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // Log/exp family: log2 and exp2 have dedicated routines; the remaining
  // bases go through common expansions.
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
  case TargetOpcode::G_STACKSAVE:
    return legalizeStackSave(MI, B);
  case TargetOpcode::G_GET_FPENV:
    return legalizeGetFPEnv(MI, MRI, B);
  case TargetOpcode::G_SET_FPENV:
    return legalizeSetFPEnv(MI, MRI, B);
  case TargetOpcode::G_TRAP:
    return legalizeTrap(MI, MRI, B);
  case TargetOpcode::G_DEBUGTRAP:
    return legalizeDebugTrap(MI, MRI, B);
  default:
    // Any opcode not handled above was marked custom in error (or support
    // has not been implemented yet); report failure to the legalizer.
    return false;
  }

  llvm_unreachable("expected switch to return");
}
2326
// Compute the 32-bit aperture value (the high half of the 64-bit segment base
// address) for the LDS or scratch address space, used when expanding an
// addrspacecast between a segment address space and flat. Returns an invalid
// Register if a required input argument could not be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  // Only the LDS (local) and scratch (private) segments have apertures.
  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  // Fast path: subtargets with dedicated aperture registers can read the
  // value directly instead of loading it from memory.
  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must emit an extract of the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
    MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
    B.buildCopy(Res: {Dst}, Op: {Register(ApertureRegNo)});
    // Return only the high half, where the real aperture value lives.
    return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
  }

  Register LoadAddr = MRI.createGenericVirtualRegister(
    Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());

    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

    if (!loadInputValue(DstReg: KernargPtrReg, B,
                        ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // The aperture slot is constant for the lifetime of the dispatch, so the
    // load is marked invariant and dereferenceable.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo: PtrInfo.getWithOffset(O: Offset),
        f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));

    // Pointer address
    B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
        Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
    // Load address
    return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
  }

  // Pre-COV5 fallback: read the aperture out of the HSA queue descriptor.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

  if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // TODO: Use custom PseudoSourceValue
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));

  B.buildObjectPtrOffset(
      Res: LoadAddr, Op0: QueuePtr,
      Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
  return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
}
2412
2413/// Return true if the value is a known valid address, such that a null check is
2414/// not necessary.
2415static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2416 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2417 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2418 switch (Def->getOpcode()) {
2419 case AMDGPU::G_FRAME_INDEX:
2420 case AMDGPU::G_GLOBAL_VALUE:
2421 case AMDGPU::G_BLOCK_ADDR:
2422 return true;
2423 case AMDGPU::G_CONSTANT: {
2424 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2425 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2426 }
2427 default:
2428 return false;
2429 }
2430
2431 return false;
2432}
2433
// Lower G_ADDRSPACE_CAST and the @llvm.amdgcn.addrspacecast.nonnull
// intrinsic. Flat<->segment casts attach or strip the 32-bit segment
// aperture; unless the source is known non-null, a compare/select maps the
// source address space's null value to the destination's null value.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  // MI can either be a G_ADDRSPACE_CAST or a
  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  const LLT S32 = LLT::scalar(SizeInBits: 32);
  Register Dst = MI.getOperand(i: 0).getReg();
  // For the intrinsic form, operand 1 is the intrinsic ID, so the pointer
  // operand is at index 2.
  Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
                                  : MI.getOperand(i: 1).getReg();
  LLT DstTy = MRI.getType(Reg: Dst);
  LLT SrcTy = MRI.getType(Reg: Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts that don't change the pointer representation are just bitcasts.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    // Build the segment pointer for a flat pointer assumed non-null.
    auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          ST.hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        const LLT S32 = LLT::scalar(SizeInBits: 32);
        Register SrcLo = B.buildExtract(Res: S32, Src, Index: 0).getReg(Idx: 0);
        Register FlatScratchBaseLo =
            B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
                         SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
                .getReg(Idx: 0);
        MRI.setRegClass(Reg: FlatScratchBaseLo, RC: &AMDGPU::SReg_32RegClass);
        Register Sub = B.buildSub(Dst: S32, Src0: SrcLo, Src1: FlatScratchBaseLo).getReg(Idx: 0);
        return B.buildIntToPtr(Dst, Src: Sub).getReg(Idx: 0);
      }

      // Extract low 32-bits of the pointer.
      return B.buildExtract(Res: Dst, Src, Index: 0).getReg(Idx: 0);
    };

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
      castFlatToLocalOrPrivate(Dst);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);

    auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
    auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

    // Select the converted pointer, or the segment null value when the flat
    // source was null.
    auto CmpRes =
        B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
    B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    // Build the flat pointer for a segment pointer assumed non-null.
    auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
      // Coerce the type of the low half of the result so we can use
      // merge_values.
      Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);

      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          ST.hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        Register AllOnes = B.buildConstant(Res: S32, Val: -1).getReg(Idx: 0);
        Register ThreadID = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
        // mbcnt_lo (plus mbcnt_hi for wave64) over an all-ones mask yields
        // the lane's thread ID within the wave.
        ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Res: {S32})
                       .addUse(RegNo: AllOnes)
                       .addUse(RegNo: ThreadID)
                       .getReg(Idx: 0);
        if (ST.isWave64()) {
          ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_hi, Res: {S32})
                         .addUse(RegNo: AllOnes)
                         .addUse(RegNo: ThreadID)
                         .getReg(Idx: 0);
        }
        Register ShAmt =
            B.buildConstant(Res: S32, Val: 57 - 32 - ST.getWavefrontSizeLog2()).getReg(Idx: 0);
        Register SrcHi = B.buildShl(Dst: S32, Src0: ThreadID, Src1: ShAmt).getReg(Idx: 0);
        Register CvtPtr =
            B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, SrcHi}).getReg(Idx: 0);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        Register FlatScratchBase =
            B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {S64},
                         SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
                .getReg(Idx: 0);
        MRI.setRegClass(Reg: FlatScratchBase, RC: &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Res: Dst, Op0: CvtPtr, Op1: FlatScratchBase).getReg(Idx: 0);
      }

      Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
      if (!ApertureReg.isValid())
        return false;

      // TODO: Should we allow mismatched types but matching sizes in merges to
      // avoid the ptrtoint?
      return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
    };

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
      castLocalOrPrivateToFlat(Dst);
      MI.eraseFromParent();
      return true;
    }

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

    auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
    auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));

    // Select the flat pointer, or flat null when the segment source was null.
    auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
                              Op1: SegmentNull.getReg(Idx: 0));

    B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Res: Dst, Src, Index: 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    // Widen a 32-bit constant pointer using the function's assumed high
    // address bits.
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
    if (AddrHiVal == 0) {
      auto Zext = B.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: PtrLo);
      B.buildIntToPtr(Dst, Src: Zext);
    } else {
      auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
      B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
    }

    MI.eraseFromParent();
    return true;
  }

  // Invalid casts are poison.
  // TODO: Should return poison
  B.buildUndef(Res: Dst);
  MI.eraseFromParent();
  return true;
}
2614
2615bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2616 MachineRegisterInfo &MRI,
2617 MachineIRBuilder &B) const {
2618 Register Src = MI.getOperand(i: 1).getReg();
2619 LLT Ty = MRI.getType(Reg: Src);
2620 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2621
2622 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2623 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2624
2625 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2626 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2627
2628 // TODO: Should this propagate fast-math-flags?
2629 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2630 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2631
2632 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2633 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2634
2635 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2636 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2637 MI.eraseFromParent();
2638 return true;
2639}
2640
2641bool AMDGPULegalizerInfo::legalizeFceil(
2642 MachineInstr &MI, MachineRegisterInfo &MRI,
2643 MachineIRBuilder &B) const {
2644
2645 const LLT S1 = LLT::scalar(SizeInBits: 1);
2646 const LLT S64 = LLT::scalar(SizeInBits: 64);
2647
2648 Register Src = MI.getOperand(i: 1).getReg();
2649 assert(MRI.getType(Src) == S64);
2650
2651 // result = trunc(src)
2652 // if (src > 0.0 && src != result)
2653 // result += 1.0
2654
2655 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2656
2657 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2658 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2659 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2660 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2661 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2662 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2663
2664 // TODO: Should this propagate fast-math-flags?
2665 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2666 MI.eraseFromParent();
2667 return true;
2668}
2669
2670bool AMDGPULegalizerInfo::legalizeFrem(
2671 MachineInstr &MI, MachineRegisterInfo &MRI,
2672 MachineIRBuilder &B) const {
2673 Register DstReg = MI.getOperand(i: 0).getReg();
2674 Register Src0Reg = MI.getOperand(i: 1).getReg();
2675 Register Src1Reg = MI.getOperand(i: 2).getReg();
2676 auto Flags = MI.getFlags();
2677 LLT Ty = MRI.getType(Reg: DstReg);
2678
2679 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2680 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2681 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2682 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2683 MI.eraseFromParent();
2684 return true;
2685}
2686
2687static MachineInstrBuilder extractF64Exponent(Register Hi,
2688 MachineIRBuilder &B) {
2689 const unsigned FractBits = 52;
2690 const unsigned ExpBits = 11;
2691 LLT S32 = LLT::scalar(SizeInBits: 32);
2692
2693 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2694 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2695
2696 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2697 .addUse(RegNo: Hi)
2698 .addUse(RegNo: Const0.getReg(Idx: 0))
2699 .addUse(RegNo: Const1.getReg(Idx: 0));
2700
2701 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2702}
2703
// Expand s64 G_INTRINSIC_TRUNC by clearing fraction bits according to the
// exponent, since there is no native 64-bit trunc instruction.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  Register Src = MI.getOperand(i: 1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
  Register Hi = Unmerge.getReg(Idx: 1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);

  // Mask covering all 52 fraction bits.
  const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(Res: S32, Val: 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});

  // Shift the fraction mask right by the exponent: the bits still set are the
  // fractional bits that must be cleared to truncate toward zero.
  auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
  auto Not = B.buildNot(Dst: S64, Src0: Shr);
  auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
  auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);

  // Exponent < 0: |x| < 1, so the result is +/-0 (just the sign bit).
  auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
  // Exponent > 51: no fractional bits remain, x is already an integer.
  auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);

  auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
  B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
  MI.eraseFromParent();
  return true;
}
2748
// Expand a 64-bit integer-to-float conversion (G_SITOFP when \p Signed,
// otherwise G_UITOFP), since the hardware only converts 32-bit integers.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();

  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
  auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);

  if (MRI.getType(Reg: Dst) == S64) {
    // i64 -> f64: convert each half separately and combine as
    // convert(hi) * 2^32 + convert(lo). Only the high half carries the sign.
    auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
                        : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));

    auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
    auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  // i64 -> f32: shift the value left so its significant bits fit in the top
  // 32 bits, convert those, then scale the result back down with ldexp.
  auto One = B.buildConstant(Res: S32, Val: 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    // Signed case: shift so that exactly one sign bit remains, using the
    // count of redundant sign bits in the high half (sffbh), clamped by
    // MaxShAmt so significant bits are never shifted out.
    auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
    auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
    auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
    auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
    auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
                  .addUse(RegNo: Unmerge.getReg(Idx: 1));
    auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
    ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
  auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
  auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
  // Fold any nonzero low bits into bit 0 of the high word (a sticky bit) so
  // the 32-bit conversion rounds correctly.
  auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
  auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
  auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
  auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
  B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
  MI.eraseFromParent();
  return true;
}
2803
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
// Expand a floating-point to 64-bit integer conversion (G_FPTOSI when
// \p Signed, otherwise G_FPTOUI) into two 32-bit conversions.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();

  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  const LLT SrcLT = MRI.getType(Reg: Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
    Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
  }
  // K0 = 2^-32, K1 = -2^32, in the source's floating-point format.
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
        Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
        Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
  auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
  // lof = fma(hif, -2^32, tf); exact, and non-negative due to the floor.
  auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);

  // Only the f64 signed case converts the high part signed; for signed f32
  // the sign was stripped above and is reapplied below.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
                                     : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
  auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
               Src1: Sign);
  } else
    B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2875
2876bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2877 MachineInstr &MI) const {
2878 MachineFunction &MF = Helper.MIRBuilder.getMF();
2879 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2880
2881 // With ieee_mode disabled, the instructions have the correct behavior.
2882 if (!MFI->getMode().IEEE)
2883 return true;
2884
2885 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2886}
2887
// Legalize G_EXTRACT_VECTOR_ELT: constant indices are resolved here by
// unmerging the vector; dynamic indices are left for instruction selection
// to handle via register indexing.
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Vec = MI.getOperand(i: 1).getReg();

  LLT VecTy = MRI.getType(Reg: Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, introduce an intermediate
  // vector of integers using ptrtoint (and inttoptr on the output) in order to
  // drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);

    auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
    auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
    B.buildIntToPtr(Dst, Src: IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < VecTy.getNumElements()) {
    // In-bounds constant index: split the vector and copy out the element.
    auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
    B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
  } else {
    // Out-of-bounds extract produces an undefined value.
    B.buildUndef(Res: Dst);
  }

  MI.eraseFromParent();
  return true;
}
2938
// Legalize G_INSERT_VECTOR_ELT: constant indices are resolved here by
// unmerging the vector, substituting the element, and remerging; dynamic
// indices are left for instruction selection to handle via register indexing.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Vec = MI.getOperand(i: 1).getReg();
  Register Ins = MI.getOperand(i: 2).getReg();

  LLT VecTy = MRI.getType(Reg: Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, make the pointer vector
  // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
  // new value, and then inttoptr the result vector back. This will then allow
  // the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);

    auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
    auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
    auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
                                                 Idx: MI.getOperand(i: 3));
    B.buildIntToPtr(Dst, Src: IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    // In-bounds constant index: unmerge, replace the one element, remerge.
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
    B.buildUnmerge(Res: SrcRegs, Op: Vec);

    SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
    B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
  } else {
    // Out-of-bounds insert produces an undefined value.
    B.buildUndef(Res: Dst);
  }

  MI.eraseFromParent();
  return true;
}
2999
3000bool AMDGPULegalizerInfo::legalizeSinCos(
3001 MachineInstr &MI, MachineRegisterInfo &MRI,
3002 MachineIRBuilder &B) const {
3003
3004 Register DstReg = MI.getOperand(i: 0).getReg();
3005 Register SrcReg = MI.getOperand(i: 1).getReg();
3006 LLT Ty = MRI.getType(Reg: DstReg);
3007 unsigned Flags = MI.getFlags();
3008
3009 Register TrigVal;
3010 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
3011 if (ST.hasTrigReducedRange()) {
3012 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
3013 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
3014 .addUse(RegNo: MulVal.getReg(Idx: 0))
3015 .setMIFlags(Flags)
3016 .getReg(Idx: 0);
3017 } else
3018 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
3019
3020 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3021 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3022 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
3023 .addUse(RegNo: TrigVal)
3024 .setMIFlags(Flags);
3025 MI.eraseFromParent();
3026 return true;
3027}
3028
// Materialize a pc-relative address for \p GV (plus \p Offset) into
// \p DstReg using SI_PC_ADD_REL_OFFSET (or its 64-bit-literal variant).
// \p GAFlags selects the relocation kind (e.g. rel32 vs gotpcrel32); the
// corresponding @hi flag is assumed to be GAFlags + 1.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.

  LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);

  // A 32-bit result is computed into a temporary 64-bit pointer and the low
  // half is extracted at the end.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);

  if (ST.has64BitLiterals()) {
    assert(GAFlags != SIInstrInfo::MO_NONE);

    // With 64-bit literals a single pseudo with one 64-bit relocation
    // (GAFlags + 2) suffices.
    MachineInstrBuilder MIB =
        B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(RegNo: PCReg);
    MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 2);
  } else {
    MachineInstrBuilder MIB =
        B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(RegNo: PCReg);

    MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
    if (GAFlags == SIInstrInfo::MO_NONE)
      MIB.addImm(Val: 0);
    else
      MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
  }

  if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
    B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
  return true;
}
3088
// Emit a ABS32_LO / ABS32_HI relocation stub.
// Materializes the absolute address of \p GV into \p DstReg via S_MOV_B32
// moves of the relocated low (and, for 64-bit pointers, high) halves, or a
// single S_MOV_B64 when the subtarget supports 64-bit literals.
void AMDGPULegalizerInfo::buildAbsGlobalAddress(
  Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
  MachineRegisterInfo &MRI) const {
  bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;

  // Fast path: one 64-bit move with a single MO_ABS64 relocation.
  if (RequiresHighHalf && ST.has64BitLiterals()) {
    if (!MRI.getRegClassOrNull(Reg: DstReg))
      MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_64RegClass);
    B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
        .addDef(RegNo: DstReg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS64);
    return;
  }

  LLT S32 = LLT::scalar(SizeInBits: 32);

  // Use the destination directly, if and only if we store the lower address
  // part only and we don't have a register class being set.
  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
                        ? DstReg
                        : MRI.createGenericVirtualRegister(Ty: S32);

  if (!MRI.getRegClassOrNull(Reg: AddrLo))
    MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);

  // Write the lower half.
  B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
      .addDef(RegNo: AddrLo)
      .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);

  // If required, write the upper half as well.
  if (RequiresHighHalf) {
    assert(PtrTy.getSizeInBits() == 64 &&
           "Must provide a 64-bit pointer type!");

    Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
    MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);

    B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
        .addDef(RegNo: AddrHi)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);

    // Use the destination directly, if and only if we don't have a register
    // class being set.
    Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
                           ? DstReg
                           : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));

    if (!MRI.getRegClassOrNull(Reg: AddrDst))
      MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});

    // If we created a new register for the destination, cast the result into
    // the final output.
    if (AddrDst != DstReg)
      B.buildCast(Dst: DstReg, Src: AddrDst);
  } else if (AddrLo != DstReg) {
    // If we created a new register for the destination, cast the result into
    // the final output.
    B.buildCast(Dst: DstReg, Src: AddrLo);
  }
}
3153
// Lower G_GLOBAL_VALUE. LDS globals become constant offsets; other address
// spaces use absolute relocations (PAL/Mesa), PC-relative addressing, or a
// load from the GOT, depending on what the target lowering decides.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // LDS (local/region) globals are addressed via per-kernel offsets rather
  // than relocations.
  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      const Function &Fn = MF.getFunction();
      Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildTrap();
      B.buildUndef(Res: DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(F: MF.getFunction(), GV: GVar);
        LLT S32 = LLT::scalar(SizeInBits: 32);
        // The dynamic array begins right after the static LDS allocations,
        // whose total size the groupstaticsize intrinsic reports.
        auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
        B.buildIntToPtr(Dst: DstReg, Src: Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Statically allocated LDS object: the offset is a compile-time constant.
    B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(), GV: GVar));
    MI.eraseFromParent();
    return true;
  }

  // PAL and Mesa use absolute relocations instead of PC-relative addressing.
  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Symbol resolvable at link time: plain PC-relative fixup.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
    MI.eraseFromParent();
    return true;
  }

  // Symbol needing an explicit REL32 relocation.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise load the address from the GOT: compute the GOT slot address
  // PC-relatively, then load the 64-bit pointer stored there.
  LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo::getGOT(MF),
      f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemTy: LoadTy, base_alignment: Align(8));

  buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
    B.buildExtract(Res: DstReg, Src: Load, Index: 0);
  } else
    B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);

  MI.eraseFromParent();
  return true;
}
3259
3260static LLT widenToNextPowerOf2(LLT Ty) {
3261 if (Ty.isVector())
3262 return Ty.changeElementCount(
3263 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3264 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3265}
3266
// Custom-legalize loads: rewrite 32-bit constant-address loads to 64-bit
// constant address space, fix up buffer-resource results, and widen
// non-power-of-2 loads when the alignment permits.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(i: 1).getReg();
  LLT PtrTy = MRI.getType(Reg: PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  // Loads through the 32-bit constant address space are handled by casting
  // the pointer into the full 64-bit constant address space.
  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
    auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
    Observer.changedInstr(MI);
    return true;
  }

  // The remaining transforms only apply to plain loads (not ext-loads etc.).
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(i: 0).getReg();
  LLT ValTy = MRI.getType(Reg: ValReg);

  // Buffer resource results must be rewritten in terms of <4 x s32>.
  if (hasBufferRsrcWorkaround(Ty: ValTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    return true;
  }

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, MemRefs: {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(Ty: ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar case: load the wide value, then truncate to the original width.
      WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
      B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ST, Ty: ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
        B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
        B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
3355
3356bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3357 MachineInstr &MI) const {
3358 MachineIRBuilder &B = Helper.MIRBuilder;
3359 MachineRegisterInfo &MRI = *B.getMRI();
3360 GISelChangeObserver &Observer = Helper.Observer;
3361
3362 Register DataReg = MI.getOperand(i: 0).getReg();
3363 LLT DataTy = MRI.getType(Reg: DataReg);
3364
3365 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3366 Observer.changingInstr(MI);
3367 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3368 Observer.changedInstr(MI);
3369 return true;
3370 }
3371 return false;
3372}
3373
3374bool AMDGPULegalizerInfo::legalizeFMad(
3375 MachineInstr &MI, MachineRegisterInfo &MRI,
3376 MachineIRBuilder &B) const {
3377 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3378 assert(Ty.isScalar());
3379
3380 MachineFunction &MF = B.getMF();
3381 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3382
3383 // TODO: Always legal with future ftz flag.
3384 // FIXME: Do we need just output?
3385 if (Ty == LLT::float32() &&
3386 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3387 return true;
3388 if (Ty == LLT::float16() &&
3389 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3390 return true;
3391
3392 MachineIRBuilder HelperBuilder(MI);
3393 GISelObserverWrapper DummyObserver;
3394 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3395 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3396}
3397
3398bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3399 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3400 Register DstReg = MI.getOperand(i: 0).getReg();
3401 Register PtrReg = MI.getOperand(i: 1).getReg();
3402 Register CmpVal = MI.getOperand(i: 2).getReg();
3403 Register NewVal = MI.getOperand(i: 3).getReg();
3404
3405 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3406 "this should not have been custom lowered");
3407
3408 LLT ValTy = MRI.getType(Reg: CmpVal);
3409 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3410
3411 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3412
3413 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3414 .addDef(RegNo: DstReg)
3415 .addUse(RegNo: PtrReg)
3416 .addUse(RegNo: PackedVal)
3417 .setMemRefs(MI.memoperands());
3418
3419 MI.eraseFromParent();
3420 return true;
3421}
3422
3423/// Return true if it's known that \p Src can never be an f32 denormal value.
3424static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3425 Register Src) {
3426 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3427 switch (DefMI->getOpcode()) {
3428 case TargetOpcode::G_INTRINSIC: {
3429 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3430 case Intrinsic::amdgcn_frexp_mant:
3431 case Intrinsic::amdgcn_log:
3432 case Intrinsic::amdgcn_log_clamp:
3433 case Intrinsic::amdgcn_exp2:
3434 case Intrinsic::amdgcn_sqrt:
3435 return true;
3436 default:
3437 break;
3438 }
3439
3440 break;
3441 }
3442 case TargetOpcode::G_FSQRT:
3443 return true;
3444 case TargetOpcode::G_FFREXP: {
3445 if (DefMI->getOperand(i: 0).getReg() == Src)
3446 return true;
3447 break;
3448 }
3449 case TargetOpcode::G_FPEXT: {
3450 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3451 }
3452 default:
3453 return false;
3454 }
3455
3456 return false;
3457}
3458
3459static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3460 return Flags & MachineInstr::FmAfn;
3461}
3462
3463static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3464 unsigned Flags) {
3465 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3466 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3467 DenormalMode::PreserveSign;
3468}
3469
3470std::pair<Register, Register>
3471AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3472 unsigned Flags) const {
3473 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3474 return {};
3475
3476 const LLT F32 = LLT::scalar(SizeInBits: 32);
3477 auto SmallestNormal = B.buildFConstant(
3478 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3479 auto IsLtSmallestNormal =
3480 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3481
3482 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3483 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3484 auto ScaleFactor =
3485 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3486 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3487
3488 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3489}
3490
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  unsigned Flags = MI.getFlags();

  // f16 case: promote to f32, use the hardware log, and truncate back.
  if (Ty == LLT::scalar(SizeInBits: 16)) {
    const LLT F32 = LLT::scalar(SizeInBits: 32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
    auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
                    .addUse(RegNo: Ext.getReg(Idx: 0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  // No scaling needed: the hardware instruction can be used directly.
  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
        .addUse(RegNo: Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
                  .addUse(RegNo: ScaledInput)
                  .setMIFlags(Flags);

  // Undo the input scaling: log2(2^32 * x) - 32 == log2(x).
  auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto ResultOffset =
      B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
  B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}
3540
3541static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3542 Register Z, unsigned Flags) {
3543 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3544 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3545}
3546
// Shared lowering for G_FLOG and G_FLOG10: compute the hardware log2, then
// multiply by ln(2) or ln(2)/ln(10) using an extended-precision constant
// split so the overall result stays accurate.
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(Reg: X);

  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT F16 = LLT::scalar(SizeInBits: 16);

  // Fast path: f16, or explicit permission for an approximate result.
  if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn)) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.
    bool PromoteToF32 =
        Ty == F16 && (!MI.getFlag(Flag: MachineInstr::FmAfn) || !ST.has16BitInsts());
    if (PromoteToF32) {
      Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
      auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
      legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
      B.buildFPTrunc(Res: Dst, Op: LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  // Scale denormal inputs into the normal range if needed.
  auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);

  Register R;
  if (ST.hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    auto NewFlags = Flags & ~(MachineInstr::FmContract);
    R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags: NewFlags).getReg(Idx: 0);
    // FMA0 recovers the rounding error of Y*C; FMA1 folds in the low-order
    // constant part; the final add applies both corrections.
    auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags: NewFlags);
    auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags: NewFlags);
    auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags: NewFlags);
    R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags: NewFlags).getReg(Idx: 0);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);

    // Split Y into a high part (low mantissa bits masked off) and the
    // remainder, so each product is exact enough for the mad chain.
    auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
    auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
    auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    auto NewFlags = Flags & ~(MachineInstr::FmContract);
    auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags: NewFlags);

    Register Mad0 =
        getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags: NewFlags);
    Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags: NewFlags);
    R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags: NewFlags);
  }

  const bool IsFiniteOnly =
      MI.getFlag(Flag: MachineInstr::FmNoNans) && MI.getFlag(Flag: MachineInstr::FmNoInfs);

  // For inf/nan inputs the correction math would corrupt the hardware
  // result, so pass the raw log through in that case.
  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
    auto IsFinite =
        B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
    R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
  }

  if (ScaledInput) {
    // Undo the 2^32 input scaling: subtract 32*ln(2)/ln(10) or 32*ln(2).
    auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
    auto ShiftK =
        B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
    B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
  } else {
    B.buildCopy(Res: Dst, Op: R);
  }

  MI.eraseFromParent();
  return true;
}
3657
3658bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3659 Register Src, bool IsLog10,
3660 unsigned Flags) const {
3661 const double Log2BaseInverted =
3662 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3663
3664 LLT Ty = B.getMRI()->getType(Reg: Dst);
3665
3666 if (Ty == LLT::scalar(SizeInBits: 32)) {
3667 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3668 if (ScaledInput) {
3669 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3670 .addUse(RegNo: Src)
3671 .setMIFlags(Flags);
3672 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3673 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3674 auto ResultOffset =
3675 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3676 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3677
3678 if (ST.hasFastFMAF32())
3679 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3680 else {
3681 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3682 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3683 }
3684
3685 return true;
3686 }
3687 }
3688
3689 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3690 ? B.buildFLog2(Dst: Ty, Src, Flags)
3691 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3692 .addUse(RegNo: Src)
3693 .setMIFlags(Flags);
3694 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3695 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3696 return true;
3697}
3698
bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  const LLT F16 = LLT::scalar(SizeInBits: 16);
  const LLT F32 = LLT::scalar(SizeInBits: 32);

  // f16 case: promote to f32, use the hardware exp2, and truncate back.
  if (Ty == F16) {
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
    auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
                    .addUse(RegNo: Ext.getReg(Idx: 0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // If the result can't be a denormal, the hardware instruction is exact
  // enough on its own.
  if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
    B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
        .addUse(RegNo: Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
  auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
                                  Op1: RangeCheckConst, Flags);

  // Shift the exponent up by 64 before the hardware exp2...
  auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
  auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: AddInput.getReg(Idx: 0))
                  .setMIFlags(Flags);

  // ...and compensate by 2^-64 afterwards, so tiny results round through
  // the denormal range correctly.
  auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
  auto One = B.buildFConstant(Res: Ty, Val: 1.0);
  auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
  B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}
3756
3757static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3758 const SrcOp &Src, unsigned Flags) {
3759 LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
3760
3761 if (Ty == LLT::scalar(SizeInBits: 32)) {
3762 return B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Dst})
3763 .addUse(RegNo: Src.getReg())
3764 .setMIFlags(Flags);
3765 }
3766 return B.buildFExp2(Dst, Src, Flags);
3767}
3768
3769bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3770 Register Dst, Register X,
3771 unsigned Flags,
3772 bool IsExp10) const {
3773 LLT Ty = B.getMRI()->getType(Reg: X);
3774
3775 // exp(x) -> exp2(M_LOG2E_F * x);
3776 // exp10(x) -> exp2(log2(10) * x);
3777 auto Const = B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3778 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Const, Flags);
3779 buildExp(B, Dst, Src: Mul, Flags);
3780 return true;
3781}
3782
// Approximate lowering of exp(x) via exp2. For f32 with possible denormal
// results, the input is shifted up by 64 (multiplying the result by e^64)
// and compensated afterwards so tiny results survive the hardware exp2.
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  LLT F32 = LLT::scalar(SizeInBits: 32);

  // Simple case: exp(x) = exp2(x * log2(e)) directly.
  if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
    return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
  }

  // Threshold ~= ln(smallest f32 normal): below it exp(x) is denormal.
  auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
  // Add 64 to the input so the exp2 result is scaled by e^64 into the
  // normal range.
  auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);

  auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
  auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: ExpInput.getReg(Idx: 0))
                  .setMIFlags(Flags);

  // 0x1.969d48p-93f ~= e^-64: undo the input scaling on the scaled path.
  auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
  B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
  return true;
}
3811
// Approximate lowering of exp10(x). log2(10) is split into a high part
// (0x1.a92000p+1f) and a low part (0x1.4f0978p-11f) so that
// exp10(x) = exp2(x*hi) * exp2(x*lo) retains extra precision.
bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
                                               Register Dst, Register X,
                                               unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  LLT F32 = LLT::scalar(SizeInBits: 32);

  if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Dst: Ty, Src0: X, Src1: K1, Flags);
    auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
    auto Mul0 = B.buildFMul(Dst: Ty, Src0: X, Src1: K0, Flags);
    auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
    B.buildFMul(Dst, Src0: Exp2_0, Src1: Exp2_1, Flags);
    return true;
  }

  // bool s = x < -0x1.2f7030p+5f;
  // x += s ? 0x1.0p+5f : 0.0f;
  // exp10 = exp2(x * 0x1.a92000p+1f) *
  //         exp2(x * 0x1.4f0978p-11f) *
  //         (s ? 0x1.9f623ep-107f : 1.0f);

  // Threshold ~= log10(smallest f32 normal): below it exp10(x) is denormal.
  // NOTE(review): the compare/selects below don't propagate Flags, unlike
  // the exp path above — looks intentional-ish but worth confirming.
  auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.2f7030p+5f);
  auto NeedsScaling =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold);

  // Add 32 to the input, scaling the result by 10^32 into the normal range.
  auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X);

  auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K1, Flags);
  auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
  auto Mul0 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K0, Flags);
  auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);

  auto MulExps = B.buildFMul(Dst: Ty, Src0: Exp2_0, Src1: Exp2_1, Flags);
  // 0x1.9f623ep-107f ~= 10^-32: undo the input scaling on the scaled path.
  auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: MulExps, Src1: ResultScaleFactor, Flags);

  B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: MulExps);
  return true;
}
3860
3861bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3862 MachineIRBuilder &B) const {
3863 Register Dst = MI.getOperand(i: 0).getReg();
3864 Register X = MI.getOperand(i: 1).getReg();
3865 const unsigned Flags = MI.getFlags();
3866 MachineFunction &MF = B.getMF();
3867 MachineRegisterInfo &MRI = *B.getMRI();
3868 LLT Ty = MRI.getType(Reg: Dst);
3869 const LLT F16 = LLT::scalar(SizeInBits: 16);
3870 const LLT F32 = LLT::scalar(SizeInBits: 32);
3871 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3872
3873 if (Ty == F16) {
3874 // v_exp_f16 (fmul x, log2e)
3875 if (allowApproxFunc(MF, Flags)) {
3876 // TODO: Does this really require fast?
3877 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3878 : legalizeFExpUnsafe(B, Dst, X, Flags);
3879 MI.eraseFromParent();
3880 return true;
3881 }
3882
3883 // Nothing in half is a denormal when promoted to f32.
3884 //
3885 // exp(f16 x) ->
3886 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3887 //
3888 // exp10(f16 x) ->
3889 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3890 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3891 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3892 legalizeFExpUnsafeImpl(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags, IsExp10);
3893 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3894 MI.eraseFromParent();
3895 return true;
3896 }
3897
3898 assert(Ty == F32);
3899
3900 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3901 // library behavior. Also, is known-not-daz source sufficient?
3902 if (allowApproxFunc(MF, Flags)) {
3903 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3904 : legalizeFExpUnsafe(B, Dst, X, Flags);
3905 MI.eraseFromParent();
3906 return true;
3907 }
3908
3909 // Algorithm:
3910 //
3911 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3912 //
3913 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3914 // n = 64*m + j, 0 <= j < 64
3915 //
3916 // e^x = 2^((64*m + j + f)/64)
3917 // = (2^m) * (2^(j/64)) * 2^(f/64)
3918 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3919 //
3920 // f = x*(64/ln(2)) - n
3921 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3922 //
3923 // e^x = (2^m) * (2^(j/64)) * e^r
3924 //
3925 // (2^(j/64)) is precomputed
3926 //
3927 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3928 // e^r = 1 + q
3929 //
3930 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3931 //
3932 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3933 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3934 Register PH, PL;
3935
3936 if (ST.hasFastFMAF32()) {
3937 const float c_exp = numbers::log2ef;
3938 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3939 const float c_exp10 = 0x1.a934f0p+1f;
3940 const float cc_exp10 = 0x1.2f346ep-24f;
3941
3942 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3943 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3944 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3945 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3946
3947 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3948 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3949 } else {
3950 const float ch_exp = 0x1.714000p+0f;
3951 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3952
3953 const float ch_exp10 = 0x1.a92000p+1f;
3954 const float cl_exp10 = 0x1.4f0978p-11f;
3955
3956 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3957 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3958 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3959
3960 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3961 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3962
3963 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3964 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3965
3966 Register Mad0 =
3967 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3968 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3969 }
3970
3971 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3972
3973 // It is unsafe to contract this fsub into the PH multiply.
3974 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3975 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3976 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3977
3978 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3979 .addUse(RegNo: A.getReg(Idx: 0))
3980 .setMIFlags(Flags);
3981 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3982
3983 auto UnderflowCheckConst =
3984 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3985 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3986 auto Underflow =
3987 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3988
3989 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3990
3991 if (!(Flags & MachineInstr::FmNoInfs)) {
3992 auto OverflowCheckConst =
3993 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3994
3995 auto Overflow =
3996 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3997 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3998 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3999 }
4000
4001 B.buildCopy(Res: Dst, Op: R);
4002 MI.eraseFromParent();
4003 return true;
4004}
4005
4006bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
4007 MachineIRBuilder &B) const {
4008 Register Dst = MI.getOperand(i: 0).getReg();
4009 Register Src0 = MI.getOperand(i: 1).getReg();
4010 Register Src1 = MI.getOperand(i: 2).getReg();
4011 unsigned Flags = MI.getFlags();
4012 LLT Ty = B.getMRI()->getType(Reg: Dst);
4013 const LLT F16 = LLT::float16();
4014 const LLT F32 = LLT::float32();
4015
4016 if (Ty == F32) {
4017 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
4018 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4019 .addUse(RegNo: Log.getReg(Idx: 0))
4020 .addUse(RegNo: Src1)
4021 .setMIFlags(Flags);
4022 B.buildFExp2(Dst, Src: Mul, Flags);
4023 } else if (Ty == F16) {
4024 // There's no f16 fmul_legacy, so we need to convert for it.
4025 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
4026 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
4027 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
4028 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4029 .addUse(RegNo: Ext0.getReg(Idx: 0))
4030 .addUse(RegNo: Ext1.getReg(Idx: 0))
4031 .setMIFlags(Flags);
4032 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
4033 } else
4034 return false;
4035
4036 MI.eraseFromParent();
4037 return true;
4038}
4039
4040// Find a source register, ignoring any possible source modifiers.
4041static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4042 Register ModSrc = OrigSrc;
4043 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
4044 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
4045 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4046 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4047 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4048 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4049 return ModSrc;
4050}
4051
// Custom-lower f64 G_FFLOOR on subtargets with the V_FRACT bug (no
// V_FLOOR_F64): floor(x) = x - fract(x), with fract clamped and NaN-corrected
// to work around the hardware bug.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT F64 = LLT::float64();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register OrigSrc = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
                   .addUse(RegNo: OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest double strictly less than 1.0; clamping
  // the fract result to it implements the bug workaround above.
  auto Const =
      B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(Ty: F64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
  else
    B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);

  // Unless NaNs are known absent, pass a NaN input through unchanged:
  // select(isnan(x), x, min) using an ordered self-compare.
  Register CorrectedFract = Min;
  if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
    CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
  }

  // floor(x) = x + (-fract(x)).
  auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
  B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
4108
4109// Turn an illegal packed v2s16 build vector into bit operations.
4110// TODO: This should probably be a bitcast action in LegalizerHelper.
4111bool AMDGPULegalizerInfo::legalizeBuildVector(
4112 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4113 Register Dst = MI.getOperand(i: 0).getReg();
4114 const LLT S32 = LLT::scalar(SizeInBits: 32);
4115 const LLT S16 = LLT::scalar(SizeInBits: 16);
4116 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4117
4118 Register Src0 = MI.getOperand(i: 1).getReg();
4119 Register Src1 = MI.getOperand(i: 2).getReg();
4120
4121 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4122 assert(MRI.getType(Src0) == S32);
4123 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
4124 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
4125 }
4126
4127 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
4128 B.buildBitcast(Dst, Src: Merge);
4129
4130 MI.eraseFromParent();
4131 return true;
4132}
4133
// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
//
// Source and accumulation registers must all be 32-bits.
//
// TODO: When the multiply is uniform, we should produce a code sequence
// that is better suited to instruction selection on the SALU. Instead of
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
                                        MutableArrayRef<Register> Accum,
                                        ArrayRef<Register> Src0,
                                        ArrayRef<Register> Src1,
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelValueTracking &VT = *Helper.getValueTracking();

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  // Zero constants are materialized lazily (at most once each) so unused
  // constants are never emitted.
  Register Zero32;
  Register Zero64;

  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
    return Zero64;
  };

  // Precompute which 32-bit source parts are known to be zero, so the partial
  // products involving them can be skipped entirely.
  SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
  for (unsigned i = 0; i < Src0.size(); ++i) {
    Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
    Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
  }

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          // A single carry with no accumulator can simply be zero-extended
          // into the accumulator; no add (and no carry-out) is needed.
          if (!LocalAccum) {
            LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          // Sum all but the last carry into a temporary accumulator; the
          // intermediate adds cannot overflow since each summand is 0 or 1.
          CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
                    .getReg(Idx: 0);
          }

          if (!LocalAccum) {
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        // Final add folds in the last carry and produces the real carry-out.
        auto Add =
            B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
        LocalAccum = Add.getReg(Idx: 0);
        return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //              + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
          -> Carry {
        // A two-register LocalAccum is only valid when there is a higher
        // destination part to receive the high half.
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Skip multiplication if one of the operands is 0
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
            if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
              LocalAccum[0] = Mul.getReg(Idx: 0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
              } else {
                // Fold one pending carry into this add "for free".
                LocalAccum[0] =
                    B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
                        .getReg(Idx: 0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          // HaveSmallAccum tracks whether the high half of Tmp is known zero,
          // in which case the first MAD cannot produce a carry-out.
          bool HaveSmallAccum = false;
          Register Tmp;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
                                    SrcOps: {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(Idx: 0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
            HaveSmallAccum = false;

            ++j0;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
          LocalAccum[0] = Unmerge.getReg(Idx: 0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(Idx: 1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0 -1
  //                                      ------
  //   Carries from previous iteration:     e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        // Compute the odd-aligned sum into a scratch pair and add it into
        // Accum afterwards, chaining the adds through SeparateOddCarry.
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(N: IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          // First odd-aligned add: no incoming chained carry yet.
          if (!IsHighest)
            Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
          else
            Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
                            CarryIn: SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
                                 CarryIn: Lo->getOperand(i: 1).getReg());
          Accum[2 * i] = Hi.getReg(Idx: 0);
          SeparateOddCarry = Hi.getReg(Idx: 1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(Elt: CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(Elt: CarryOut);
      }
    }
  }
}
4391
4392// Custom narrowing of wide multiplies using wide multiply-add instructions.
4393//
4394// TODO: If the multiply is followed by an addition, we should attempt to
4395// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4396bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4397 MachineInstr &MI) const {
4398 assert(ST.hasMad64_32());
4399 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4400
4401 MachineIRBuilder &B = Helper.MIRBuilder;
4402 MachineRegisterInfo &MRI = *B.getMRI();
4403
4404 Register DstReg = MI.getOperand(i: 0).getReg();
4405 Register Src0 = MI.getOperand(i: 1).getReg();
4406 Register Src1 = MI.getOperand(i: 2).getReg();
4407
4408 LLT Ty = MRI.getType(Reg: DstReg);
4409 assert(Ty.isScalar());
4410
4411 unsigned Size = Ty.getSizeInBits();
4412 if (ST.hasVectorMulU64() && Size == 64)
4413 return true;
4414
4415 unsigned NumParts = Size / 32;
4416 assert((Size % 32) == 0);
4417 assert(NumParts >= 2);
4418
4419 // Whether to use MAD_64_32 for partial products whose high half is
4420 // discarded. This avoids some ADD instructions but risks false dependency
4421 // stalls on some subtargets in some cases.
4422 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4423
4424 // Whether to compute odd-aligned partial products separately. This is
4425 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4426 // in an even-aligned VGPR.
4427 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4428
4429 LLT S32 = LLT::scalar(SizeInBits: 32);
4430 SmallVector<Register, 2> Src0Parts, Src1Parts;
4431 for (unsigned i = 0; i < NumParts; ++i) {
4432 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4433 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4434 }
4435 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4436 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4437
4438 SmallVector<Register, 2> AccumRegs(NumParts);
4439 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4440 SeparateOddAlignedProducts);
4441
4442 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4443 MI.eraseFromParent();
4444 return true;
4445}
4446
4447// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4448// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4449// case with a single min instruction instead of a compare+select.
4450bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4451 MachineRegisterInfo &MRI,
4452 MachineIRBuilder &B) const {
4453 Register Dst = MI.getOperand(i: 0).getReg();
4454 Register Src = MI.getOperand(i: 1).getReg();
4455 LLT DstTy = MRI.getType(Reg: Dst);
4456 LLT SrcTy = MRI.getType(Reg: Src);
4457
4458 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4459 ? AMDGPU::G_AMDGPU_FFBH_U32
4460 : AMDGPU::G_AMDGPU_FFBL_B32;
4461 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4462 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4463
4464 MI.eraseFromParent();
4465 return true;
4466}
4467
4468bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4469 MachineRegisterInfo &MRI,
4470 MachineIRBuilder &B) const {
4471 Register Dst = MI.getOperand(i: 0).getReg();
4472 Register Src = MI.getOperand(i: 1).getReg();
4473 LLT SrcTy = MRI.getType(Reg: Src);
4474 TypeSize NumBits = SrcTy.getSizeInBits();
4475
4476 assert(NumBits < 32u);
4477
4478 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4479 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4480 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4481 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4482 B.buildTrunc(Res: Dst, Op: Ctlz);
4483 MI.eraseFromParent();
4484 return true;
4485}
4486
4487// Check that this is a G_XOR x, -1
4488static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4489 if (MI.getOpcode() != TargetOpcode::G_XOR)
4490 return false;
4491 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4492 return ConstVal == -1;
4493}
4494
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// Verifies that the condition defined by a control-flow intrinsic is used by
// exactly one G_BRCOND in the same block (possibly through a single G_XOR -1,
// which is folded away with Negated set). On success, Br is set to the
// trailing G_BR if one exists, and UncondBrTarget to the block reached when
// the branch is not taken.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(i: 0).getReg();
  // The condition must have exactly one (non-debug) user.
  if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);

  // Look through a single inversion (G_XOR cond, -1) and record it.
  if (isNot(MRI, MI: *UseMI)) {
    Register NegatedCond = UseMI->getOperand(i: 0).getReg();
    if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(MI&: *UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
    Negated = true;
  }

  // The (possibly negated) condition must feed a G_BRCOND in the same block.
  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
  if (Next == Parent->end()) {
    // Fallthrough case: the not-taken target is the next block in layout.
    MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(i: 0).getMBB();
  }

  return UseMI;
}
4537
4538void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4539 MachineIRBuilder &B,
4540 const ArgDescriptor *Arg,
4541 const TargetRegisterClass *ArgRC,
4542 LLT ArgTy) const {
4543 MCRegister SrcReg = Arg->getRegister();
4544 assert(SrcReg.isPhysical() && "Physical register expected");
4545 assert(DstReg.isVirtual() && "Virtual register expected");
4546
4547 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4548 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4549 if (Arg->isMasked()) {
4550 // TODO: Should we try to emit this once in the entry block?
4551 const LLT S32 = LLT::scalar(SizeInBits: 32);
4552 const unsigned Mask = Arg->getMask();
4553 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4554
4555 Register AndMaskSrc = LiveIn;
4556
4557 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4558 // 0.
4559 if (Shift != 0) {
4560 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4561 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4562 }
4563
4564 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4565 } else {
4566 B.buildCopy(Res: DstReg, Op: LiveIn);
4567 }
4568}
4569
// Lower a workgroup-id intrinsic to the global workgroup position, accounting
// for clusters when the subtarget supports them. The three PreloadedValue
// arguments select the per-dimension inputs (id, cluster max id, and
// id-within-cluster) used to reconstruct the global id.
bool AMDGPULegalizerInfo::legalizeWorkGroupId(
    MachineInstr &MI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  // Without cluster support, the preloaded value already is the workgroup id.
  if (!ST.hasClusters()) {
    if (!loadInputValue(DstReg, B, ArgType: WorkGroupIdPV))
      return false;
    MI.eraseFromParent();
    return true;
  }

  // Clusters are supported. Return the global position in the grid. If clusters
  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.

  // WorkGroupIdXYZ = ClusterId == 0 ?
  //   ClusterIdXYZ :
  //   ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  Register ClusterIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  if (!loadInputValue(DstReg: ClusterIdXYZ, B, ArgType: WorkGroupIdPV) ||
      !loadInputValue(DstReg: ClusterWorkGroupIdXYZ, B, ArgType: ClusterWorkGroupIdPV) ||
      !loadInputValue(DstReg: ClusterMaxIdXYZ, B, ArgType: ClusterMaxIdPV))
    return false;

  // ClusterSize = ClusterMaxId + 1; GlobalId = ClusterId * ClusterSize +
  // ClusterWorkGroupId, per the formula above.
  auto One = B.buildConstant(Res: S32, Val: 1);
  auto ClusterSizeXYZ = B.buildAdd(Dst: S32, Src0: ClusterMaxIdXYZ, Src1: One);
  auto GlobalIdXYZ = B.buildAdd(Dst: S32, Src0: ClusterWorkGroupIdXYZ,
                                Src1: B.buildMul(Dst: S32, Src0: ClusterIdXYZ, Src1: ClusterSizeXYZ));

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  switch (MFI->getClusterDims().getKind()) {
  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
  case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
    // Clusters are known to be in use: the computed global id is the answer.
    B.buildCopy(Res: DstReg, Op: GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
    // No clusters: the preloaded "cluster id" is already the workgroup id.
    B.buildCopy(Res: DstReg, Op: ClusterIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
    // Cluster usage unknown at compile time: read the cluster-id field from
    // the IB_STS2 hardware register and select between the two results.
    using namespace AMDGPU::Hwreg;
    unsigned ClusterIdField = HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4);
    Register ClusterId = MRI.createGenericVirtualRegister(Ty: S32);
    MRI.setRegClass(Reg: ClusterId, RC: &AMDGPU::SReg_32RegClass);
    B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
        .addDef(RegNo: ClusterId)
        .addImm(Val: ClusterIdField);
    auto Zero = B.buildConstant(Res: S32, Val: 0);
    auto NoClusters =
        B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: ClusterId, Op1: Zero);
    B.buildSelect(Res: DstReg, Tst: NoClusters, Op0: ClusterIdXYZ, Op1: GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  }

  llvm_unreachable("nothing should reach here");
}
4637
// Materialize the given preloaded argument kind into DstReg. On subtargets
// with architected SGPRs, workgroup/cluster ids live in fixed TTMP registers
// (described by the local ArgDescriptors below); otherwise the location comes
// from the function's argument info. Returns false only for argument
// locations not yet handled (non-register or invalid register).
bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg = nullptr;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  // Fixed TTMP locations used when architected SGPRs are available.
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      Reg: AMDGPU::TTMP7,
      Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
  // Cluster-related fields are packed as 4-bit nibbles within TTMP6.
  const ArgDescriptor ClusterWorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
  const ArgDescriptor ClusterWorkGroupIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
  const ArgDescriptor ClusterWorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
  const ArgDescriptor ClusterWorkGroupMaxIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
  const ArgDescriptor ClusterWorkGroupMaxIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);

  // Helper for values that fold to a compile-time constant.
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(Res: DstReg, Val: N);
    return true;
  };

  if (ST.hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
    bool HasFixedDims = ClusterDims.isFixedDims();

    switch (ArgType) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Arg = &WorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Arg = &WorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Arg = &WorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    // With known cluster dims, a dimension of 1 means the in-cluster id is
    // always 0 and can be folded to a constant.
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    // With known cluster dims, the max id per dimension is dims - 1.
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Arg = &ClusterWorkGroupMaxIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Arg = &ClusterWorkGroupMaxIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Arg = &ClusterWorkGroupMaxIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
      Arg = &ClusterWorkGroupMaxFlatID;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    default:
      break;
    }
  }

  // Anything not resolved above falls back to the function's argument info.
  if (!Arg)
    std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
      return LoadConstant(0);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(Res: DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
  return true;
}
4771
4772bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4773 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4774 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4775 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4776 return false;
4777
4778 MI.eraseFromParent();
4779 return true;
4780}
4781
4782static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4783 int64_t C) {
4784 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4785 MI.eraseFromParent();
4786 return true;
4787}
4788
// Lower a workitem-ID intrinsic to a read of the corresponding preloaded
// input register. A dimension whose maximum workitem ID is 0 folds to the
// constant 0. Unpacked IDs get a G_ASSERT_ZEXT annotating their known value
// range; packed (masked) IDs skip it since the masking is emitted anyway.
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  // If this dimension can only ever hold ID 0, the answer is a constant.
  unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, C: 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);

  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(Res: DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    // Load into a temporary so the final value carries an assertion that
    // only bit_width(MaxID) low bits can be nonzero.
    Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
    if (!loadInputValue(DstReg: TmpReg, B, ArgType))
      return false;
    B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
  }

  MI.eraseFromParent();
  return true;
}
4828
4829MachinePointerInfo
4830AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
4831 // This isn't really a constant pool but close enough.
4832 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
4833 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
4834 return PtrInfo;
4835}
4836
4837Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4838 int64_t Offset) const {
4839 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4840 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4841
4842 // TODO: If we passed in the base kernel offset we could have a better
4843 // alignment than 4, but we don't really need it.
4844 if (!loadInputValue(DstReg: KernArgReg, B,
4845 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4846 llvm_unreachable("failed to find kernarg segment ptr");
4847
4848 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4849 return B.buildObjectPtrOffset(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4850}
4851
4852/// Legalize a value that's loaded from kernel arguments. This is only used by
4853/// legacy intrinsics.
4854bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4855 MachineIRBuilder &B,
4856 uint64_t Offset,
4857 Align Alignment) const {
4858 Register DstReg = MI.getOperand(i: 0).getReg();
4859
4860 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4861 "unexpected kernarg parameter type");
4862
4863 Register Ptr = getKernargParameterPtr(B, Offset);
4864 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
4865 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment: Align(4),
4866 MMOFlags: MachineMemOperand::MODereferenceable |
4867 MachineMemOperand::MOInvariant);
4868 MI.eraseFromParent();
4869 return true;
4870}
4871
4872bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4873 MachineRegisterInfo &MRI,
4874 MachineIRBuilder &B) const {
4875 Register Dst = MI.getOperand(i: 0).getReg();
4876 LLT DstTy = MRI.getType(Reg: Dst);
4877 LLT S16 = LLT::scalar(SizeInBits: 16);
4878 LLT S32 = LLT::scalar(SizeInBits: 32);
4879 LLT S64 = LLT::scalar(SizeInBits: 64);
4880
4881 if (DstTy == S16)
4882 return legalizeFDIV16(MI, MRI, B);
4883 if (DstTy == S32)
4884 return legalizeFDIV32(MI, MRI, B);
4885 if (DstTy == S64)
4886 return legalizeFDIV64(MI, MRI, B);
4887
4888 return false;
4889}
4890
// Expand unsigned 32-bit division/remainder without hardware integer divide:
// approximate 1/Y via V_RCP_IFLAG_F32 scaled into fixed point, refine the
// reciprocal once, form quotient/remainder estimates, then apply two
// conditional corrections. Either DstDivReg or DstRemReg may be invalid if
// only one result is wanted.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
  auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
  auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
  auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
  auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
  auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
  Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
  auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(Res: S32, Val: 1);
  auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
  if (DstDivReg)
    Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
  R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
  if (DstDivReg)
    B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);

  if (DstRemReg)
    B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
}
4933
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  // Split the 64-bit input into its 32-bit halves.
  auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);

  auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
  auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));

  // Combine halves into a single float: hi * 2**32 + lo.
  auto Mad = B.buildFMAD(
      Dst: S32, Src0: CvtHi, // 2**32
      Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);

  auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
  auto Mul1 = B.buildFMul(
      Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 = B.buildFMul(
      Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(
      Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
      Src2: Mul1);

  // Low word comes from the residual, high word from the truncated quotient.
  auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
  auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);

  return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
}
4978
// Expand unsigned 64-bit division/remainder in terms of 32-bit operations:
// start from the fixed-point reciprocal estimate of emitReciprocalU64, take
// two refinement steps, form quotient/remainder estimates, then apply up to
// two conditional corrections. The corrections are expressed as selects on
// the C3/C6 conditions rather than real control flow (see TODO below).
// Either destination register may be invalid when only one result is needed.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  Register RcpLo, RcpHi;

  std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);

  auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(Res: S64, Val: 0);
  auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);

  // First reciprocal refinement step: Rcp += Rcp * mulhi(Rcp, -Denom * Rcp),
  // with the 64-bit add done as a 32-bit carry chain.
  auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
  auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);

  auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
  auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});

  // Second reciprocal refinement step, same structure.
  auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
  auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);

  auto Zero32 = B.buildConstant(Res: S32, Val: 0);
  auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
  auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
  Register NumerLo = UnmergeNumer.getReg(Idx: 0);
  Register NumerHi = UnmergeNumer.getReg(Idx: 1);

  // Quotient estimate MulHi3 = mulhi(Numer, Rcp); remainder estimate
  // Sub1 = Numer - Denom * MulHi3 (32-bit borrow chain).
  auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
  auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
  Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
  auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
  auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
  Register DenomLo = UnmergeDenom.getReg(Idx: 0);
  Register DenomHi = UnmergeDenom.getReg(Idx: 1);

  // C3 answers "remainder estimate >= Denom?" via a 64-bit compare built from
  // 32-bit compares: use the low-word compare when high words are equal.
  auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
  auto C1 = B.buildSExt(Res: S32, Op: CmpHi);

  auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
  auto C2 = B.buildSExt(Res: S32, Op: CmpLo);

  auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
  auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
  auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
  auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
  auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(Res: S64, Val: 1);
  auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);

  // C6: does the once-corrected remainder still exceed Denom?
  auto C4 =
      B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
  auto C5 =
      B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
  auto C6 = B.buildSelect(
      Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
  auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);

  auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
  auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
  auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient/remainder from the 0/1/2-correction candidates.
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
    B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
                  Op0: Sel1, Op1: MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
    B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
                  Op0: Sel2, Op1: Sub1);
  }
}
5090
5091bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5092 MachineRegisterInfo &MRI,
5093 MachineIRBuilder &B) const {
5094 Register DstDivReg, DstRemReg;
5095 switch (MI.getOpcode()) {
5096 default:
5097 llvm_unreachable("Unexpected opcode!");
5098 case AMDGPU::G_UDIV: {
5099 DstDivReg = MI.getOperand(i: 0).getReg();
5100 break;
5101 }
5102 case AMDGPU::G_UREM: {
5103 DstRemReg = MI.getOperand(i: 0).getReg();
5104 break;
5105 }
5106 case AMDGPU::G_UDIVREM: {
5107 DstDivReg = MI.getOperand(i: 0).getReg();
5108 DstRemReg = MI.getOperand(i: 1).getReg();
5109 break;
5110 }
5111 }
5112
5113 const LLT S64 = LLT::scalar(SizeInBits: 64);
5114 const LLT S32 = LLT::scalar(SizeInBits: 32);
5115 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5116 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
5117 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5118 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5119
5120 if (Ty == S32)
5121 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
5122 else if (Ty == S64)
5123 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
5124 else
5125 return false;
5126
5127 MI.eraseFromParent();
5128 return true;
5129}
5130
// Legalize G_SDIV / G_SREM / G_SDIVREM by reducing to the unsigned
// expansion: take absolute values via the (x + sign) ^ sign identity (where
// sign is the sign bit broadcast by an arithmetic shift), divide unsigned
// into temporaries, then restore the signs — the quotient's sign is
// sign(LHS) ^ sign(RHS), the remainder's sign follows LHS.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();

  // Broadcast each operand's sign bit to all bits (0 or -1).
  auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
  auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);

  // abs(x) = (x + sign) ^ sign.
  LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
  RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);

  LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
  RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);

  // Create temporaries only for the results this opcode actually defines.
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(i: 0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(i: 0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(i: 0).getReg();
    DstRemReg = MI.getOperand(i: 1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);

  // Apply the result sign: (r ^ sign) - sign negates r when sign == -1.
  if (DstDivReg) {
    auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
    auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
    B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
    B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
  }

  MI.eraseFromParent();
  return true;
}
5198
// Lower fdiv to an rcp-based approximation when fast-math flags permit.
// Handles 1.0 / x and -1.0 / x directly as rcp(x) / rcp(-x), and otherwise
// emits x * rcp(y). Returns false (leaving MI untouched) when the flags
// required for the given type are absent, so the precise expansion runs.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Reg: Res);

  bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);

  if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
    // Without afn, only f16 is accurate enough for the plain rcp forms.
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
      return false;

    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    // the CI documentation has a worst case error of 1 ulp.
    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    // use it as long as we aren't trying to use denormals.
    //
    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(V: 1.0)) {
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
          .addUse(RegNo: RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(V: -1.0)) {
      auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
          .addUse(RegNo: FNeg.getReg(Idx: 0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
                              !MI.getFlag(Flag: MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
                 .addUse(RegNo: RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);

  MI.eraseFromParent();
  return true;
}
5258
// Fast 64-bit fdiv: approximate r = rcp(y), refine it with two
// Newton-Raphson iterations (r' = r + r * (1 - y*r)), then compute x * r and
// apply one final FMA-based error correction. Requires the afn fast-math
// flag; otherwise returns false so the precise f64 expansion is used.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  Register Y = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Reg: Res);

  bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
  auto One = B.buildFConstant(Res: ResTy, Val: 1.0);

  auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
               .addUse(RegNo: Y)
               .setMIFlags(Flags);

  // First refinement iteration: Tmp0 = 1 - y*r; r = r + Tmp0*r.
  auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
  R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);

  // Second refinement iteration.
  auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
  R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);

  // Result with residual correction: Ret + (x - y*Ret) * r.
  auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
  auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);

  B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
  MI.eraseFromParent();
  return true;
}
5293
// Precise f16 division. Tries the fast rcp-based lowering first; otherwise
// promotes both operands to f32, refines an rcp-based quotient with
// mad/fma error-correction steps (the sequence spelled out below), truncates
// back to f16, and finishes with v_div_fixup_f16 to handle special cases.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(SizeInBits: 16);
  LLT S32 = LLT::scalar(SizeInBits: 32);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
  auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
  auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
  auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
                 .addUse(RegNo: RHSExt.getReg(Idx: 0))
                 .setMIFlags(Flags);
  auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
  MachineInstrBuilder Err;
  // Use mad on targets that have it, fma otherwise; same refinement math.
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
    Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
    Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
  } else {
    Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
    Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
    Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
  }
  auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
  // Mask to sign+exponent bits (0xff800000) per the sequence above.
  Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
  Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
  auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RegNo: RDst.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5352
// Hardware-register field descriptor for the single-precision (FP32) denorm
// control: MODE register, bit offset 4, width 2 (per the encode() arguments).
static constexpr unsigned SPDenormModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE takes both fields: FP32 in the low 2 bits, FP64/FP16
    // in the next 2.
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
      .addImm(Val: NewDenormModeValue);

  } else {
    // No S_DENORM_MODE: write just the FP32 field of MODE via S_SETREG.
    B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
      .addImm(Val: SPDenormMode)
      .addImm(Val: SPDenormModeBitField);
  }
}
5379
// Precise f32 division via the div_scale / FMA-refinement / div_fmas /
// div_fixup sequence. The intermediate FMAs need FP32 denormals enabled for
// a correct result, so if the function's FP mode doesn't already keep them
// on, the denorm mode is toggled around the core sequence — saving and
// restoring the previous register value when the mode is dynamic.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based lowering when fast-math flags allow it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  auto One = B.buildFConstant(Res: S32, Val: 1.0f);

  // div_scale's immediate selects which operand gets scaled (0: denominator,
  // 1: numerator).
  auto DenominatorScaled =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
          .addUse(RegNo: LHS)
          .addUse(RegNo: RHS)
          .addImm(Val: 0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
          .addUse(RegNo: LHS)
          .addUse(RegNo: RHS)
          .addImm(Val: 1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
                       .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);

  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    // Dynamic mode: the current field value isn't known at compile time, so
    // read it first in order to restore it afterwards.
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
      B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
          .addDef(RegNo: SavedSPDenormMode)
          .addImm(Val: SPDenormModeBitField);
    }
    toggleSPDenormMode(Enable: true, B, ST, Mode);
  }

  auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
  auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
  auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
  auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
  auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
          .addReg(RegNo: SavedSPDenormMode)
          .addImm(Val: SPDenormModeBitField);
    } else
      toggleSPDenormMode(Enable: false, B, ST, Mode);
  }

  auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
                  .addUse(RegNo: Fma4.getReg(Idx: 0))
                  .addUse(RegNo: Fma1.getReg(Idx: 0))
                  .addUse(RegNo: Fma3.getReg(Idx: 0))
                  .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
                  .setMIFlags(Flags);

  // Undo the scaling and handle infinities/NaNs/denormal results.
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RegNo: Fmas.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5466
// Precise f64 division via div_scale / rcp / FMA refinement / div_fmas /
// div_fixup. On subtargets where div_scale's condition output is unusable
// (SI hardware bug, see below), the scale condition is recomputed manually
// by comparing the high words of the operands and the scaled values.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based lowering when fast-math flags allow it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(SizeInBits: 64);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  auto One = B.buildFConstant(Res: S64, Val: 1.0);

  // Immediate selects the scaled operand (0: denominator, 1: numerator).
  auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
                       .addUse(RegNo: LHS)
                       .addUse(RegNo: RHS)
                       .addImm(Val: 0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);

  auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
                 .addUse(RegNo: DivScale0.getReg(Idx: 0))
                 .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
  auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
  auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);

  auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
                       .addUse(RegNo: LHS)
                       .addUse(RegNo: RHS)
                       .addImm(Val: 1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
  auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
  auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(SizeInBits: 32);

    // Compare the high (exponent-carrying) words to detect which operand was
    // scaled, reconstructing the condition div_scale should have produced.
    auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
    auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
    auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);

    auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
                              Op1: Scale1Unmerge.getReg(Idx: 1));
    auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
                              Op1: Scale0Unmerge.getReg(Idx: 1));
    Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
  } else {
    Scale = DivScale1.getReg(Idx: 1);
  }

  auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
                  .addUse(RegNo: Fma4.getReg(Idx: 0))
                  .addUse(RegNo: Fma3.getReg(Idx: 0))
                  .addUse(RegNo: Mul.getReg(Idx: 0))
                  .addUse(RegNo: Scale)
                  .setMIFlags(Flags);

  // Undo the scaling and handle infinities/NaNs/denormal results.
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
      .addUse(RegNo: Fmas.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5547
// Lower G_FFREXP (mantissa + exponent decomposition) to the amdgcn
// frexp_mant / frexp_exp intrinsics. On subtargets with the fract bug, the
// instructions don't produce the expected result for non-finite inputs, so
// select the original value / exponent 0 when |Val| is not finite.
bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res0 = MI.getOperand(i: 0).getReg();
  Register Res1 = MI.getOperand(i: 1).getReg();
  Register Val = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();

  LLT Ty = MRI.getType(Reg: Res0);
  // The hardware exponent result is s16 for f16 inputs, s32 otherwise.
  LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);

  auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
                  .addUse(RegNo: Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
                 .addUse(RegNo: Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    // |Val| < inf  <=>  Val is finite (false for inf/nan).
    auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
    auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
    Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
    Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
  }

  B.buildCopy(Res: Res0, Op: Mant);
  // Exponent result type may be wider than the instruction's; sign-extend
  // or truncate to match.
  B.buildSExtOrTrunc(Res: Res1, Op: Exp);

  MI.eraseFromParent();
  return true;
}
5582
5583bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5584 MachineRegisterInfo &MRI,
5585 MachineIRBuilder &B) const {
5586 Register Res = MI.getOperand(i: 0).getReg();
5587 Register LHS = MI.getOperand(i: 2).getReg();
5588 Register RHS = MI.getOperand(i: 3).getReg();
5589 uint16_t Flags = MI.getFlags();
5590
5591 LLT S32 = LLT::scalar(SizeInBits: 32);
5592 LLT S1 = LLT::scalar(SizeInBits: 1);
5593
5594 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5595 const APFloat C0Val(1.0f);
5596
5597 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5598 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5599 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5600
5601 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5602 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5603
5604 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5605
5606 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5607 .addUse(RegNo: Mul0.getReg(Idx: 0))
5608 .setMIFlags(Flags);
5609
5610 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5611
5612 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5613
5614 MI.eraseFromParent();
5615 return true;
5616}
5617
5618bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5619 MachineRegisterInfo &MRI,
5620 MachineIRBuilder &B) const {
5621 // Bypass the correct expansion a standard promotion through G_FSQRT would
5622 // get. The f32 op is accurate enough for the f16 cas.
5623 unsigned Flags = MI.getFlags();
5624 assert(!ST.has16BitInsts());
5625 const LLT F32 = LLT::scalar(SizeInBits: 32);
5626 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5627 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5628 .addUse(RegNo: Ext.getReg(Idx: 0))
5629 .setMIFlags(Flags);
5630 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5631 MI.eraseFromParent();
5632 return true;
5633}
5634
// Lower f32 G_FSQRT. With approximate functions allowed, the hardware sqrt
// is used directly. Otherwise, very small inputs are scaled into range, the
// hardware estimate is refined (the strategy depends on whether f32
// denormals must be handled), and the result is scaled back down.
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT I32 = LLT::scalar(SizeInBits: 32);

  // afn: the raw hardware instruction is accurate enough.
  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
        .addUse(RegNo: X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // Scale inputs below 0x1.0p-96 up by 2^32; compensated below by 2^-16
  // (= sqrt(2^-32)).
  auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
  auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
  auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
  if (needsDenormHandlingF32(MF, Src: X, Flags)) {
    // Take the hardware estimate s, then correct it by at most one step
    // based on the sign of the residual x - s'*s evaluated at the two
    // neighboring values of s.
    B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
        .addUse(RegNo: SqrtX.getReg(Idx: 0))
        .setMIFlags(Flags);

    // Integer add of -1/+1 on the float's bit pattern steps to the adjacent
    // representable values.
    auto NegOne = B.buildConstant(Res: I32, Val: -1);
    auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);

    auto PosOne = B.buildConstant(Res: I32, Val: 1);
    auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);

    auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);

    // Residual at next-down is <= 0: the estimate is too large, step down.
    SqrtS =
        B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);

    // Residual at next-up is > 0: the estimate is too small, step up.
    auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
    SqrtS =
        B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
  } else {
    // No denormal concerns: start from rsq(x) and refine s = x*r with
    // fma-based iteration steps to full f32 accuracy.
    auto SqrtR =
        B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
    B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);

    auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
    auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
    auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
    auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
    SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
    SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
    auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
    auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
    SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
  }

  // Undo the input scaling: sqrt(x * 2^32) * 2^-16 == sqrt(x).
  auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);

  // sqrt(+/-0) = +/-0 and sqrt(+inf) = +inf: pass those inputs through.
  auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
  B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}
5715
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT F64 = LLT::scalar(SizeInBits: 64);

  Register Dst = MI.getOperand(i: 0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();

  // Inputs below this threshold are scaled up to avoid precision loss in
  // rsq.
  auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);

  auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
  auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
  auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
  auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));

  // h0 = 0.5 * y0, g0 = x * y0.
  auto Half = B.buildFConstant(Res: F64, Val: 0.5);
  auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
  auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);

  // r0 = 0.5 - h0 * g0.
  auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
  auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);

  // g1, h1: first refinement.
  auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
  auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);

  // d0 = x - g1 * g1; g2 = d0 * h1 + g1.
  auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
  auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);

  auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);

  // d1 = x - g2 * g2; g3 = d1 * h1 + g2.
  auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
  auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);

  auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);

  // Scale down the result. sqrt halves the exponent, so undoing the 2^256
  // scale requires ldexp by -128.
  auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
  auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
  SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
5799
5800bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5801 MachineRegisterInfo &MRI,
5802 MachineIRBuilder &B) const {
5803 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5804 if (Ty == LLT::scalar(SizeInBits: 32))
5805 return legalizeFSQRTF32(MI, MRI, B);
5806 if (Ty == LLT::scalar(SizeInBits: 64))
5807 return legalizeFSQRTF64(MI, MRI, B);
5808 if (Ty == LLT::scalar(SizeInBits: 16))
5809 return legalizeFSQRTF16(MI, MRI, B);
5810 return false;
5811}
5812
5813// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5814// FIXME: Why do we handle this one but not other removed instructions?
5815//
5816// Reciprocal square root. The clamp prevents infinite results, clamping
5817// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5818// +-max_float.
5819bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5820 MachineRegisterInfo &MRI,
5821 MachineIRBuilder &B) const {
5822 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5823 return true;
5824
5825 Register Dst = MI.getOperand(i: 0).getReg();
5826 Register Src = MI.getOperand(i: 2).getReg();
5827 auto Flags = MI.getFlags();
5828
5829 LLT Ty = MRI.getType(Reg: Dst);
5830
5831 const fltSemantics *FltSemantics;
5832 if (Ty == LLT::scalar(SizeInBits: 32))
5833 FltSemantics = &APFloat::IEEEsingle();
5834 else if (Ty == LLT::scalar(SizeInBits: 64))
5835 FltSemantics = &APFloat::IEEEdouble();
5836 else
5837 return false;
5838
5839 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
5840 .addUse(RegNo: Src)
5841 .setMIFlags(Flags);
5842
5843 // We don't need to concern ourselves with the snan handling difference, since
5844 // the rsq quieted (or not) so use the one which will directly select.
5845 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5846 const bool UseIEEE = MFI->getMode().IEEE;
5847
5848 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5849 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5850 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5851
5852 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5853
5854 if (UseIEEE)
5855 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5856 else
5857 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5858 MI.eraseFromParent();
5859 return true;
5860}
5861
// TODO: Fix pointer type handling
// Legalize cross-lane intrinsics (readlane, writelane, permlane, set.inactive,
// dpp variants, ...) with value types the instructions do not handle natively:
// sub-32-bit values are any-extended to 32 bits and truncated back, and wider
// values are split into 32-bit pieces (64-bit pieces for update.dpp when the
// subtarget's DP ALU supports the dpp control) that each get their own lane
// op, then re-merged.
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                         MachineInstr &MI,
                                         Intrinsic::ID IID) const {

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  // Emit one lane op producing a value of type VT, forwarding however many
  // extra operands (registers and immediates) the particular intrinsic takes.
  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(Idx: 0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      Register Src3 = MI.getOperand(i: 5).getReg();
      int64_t Src4 = MI.getOperand(i: 6).getImm();
      int64_t Src5 = MI.getOperand(i: 7).getImm();
      return LaneOp.addUse(RegNo: Src1)
          .addUse(RegNo: Src2)
          .addUse(RegNo: Src3)
          .addImm(Val: Src4)
          .addImm(Val: Src5)
          .getReg(Idx: 0);
    }
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(RegNo: Src1)
          .addImm(Val: MI.getOperand(i: 4).getImm())
          .addImm(Val: MI.getOperand(i: 5).getImm())
          .addImm(Val: MI.getOperand(i: 6).getImm())
          .addImm(Val: MI.getOperand(i: 7).getImm())
          .getReg(Idx: 0);
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(i: 0).getReg();
  Register Src0 = MI.getOperand(i: 2).getReg();
  Register Src1, Src2;
  // Collect the extra register operands for the intrinsics that have them;
  // they need the same extension/splitting treatment as Src0.
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(i: 3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(i: 4).getReg();
    }
  }

  LLT Ty = MRI.getType(Reg: DstReg);
  unsigned Size = Ty.getSizeInBits();

  // update.dpp can work on 64-bit pieces when the subtarget has a DP ALU
  // with DPP support and the dpp control code is legal for it.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(ST, DC: MI.getOperand(i: 4).getImm()))
    SplitSize = 64;

  if (Size == SplitSize) {
    // Already legal
    return true;
  }

  if (Size < 32) {
    // Any-extend narrow sources to 32 bits, perform the op, truncate back.
    Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(Res: DstReg, Op: LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % SplitSize != 0)
    return false;

  // Choose the piece type for the split. Vector element types are kept when
  // they tile SplitSize evenly; otherwise fall back to scalar pieces and
  // bitcast the merged result back to the vector type at the end.
  LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
    } else {
      // Handle all other cases via S32/S64 pieces
      NeedsBitcast = true;
    }
  }

  // Unmerge each source into pieces, emit one lane op per piece, then merge
  // the partial results into the destination.
  SmallVector<Register, 4> PartialRes;
  unsigned NumParts = Size / SplitSize;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(Idx: i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(Idx: i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(Idx: i);

    PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  if (NeedsBitcast)
    B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
                                 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
  else
    B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);

  MI.eraseFromParent();
  return true;
}
6005
// Compute the implicit kernel argument pointer into \p DstReg: the kernarg
// segment pointer plus the offset of the first implicit parameter. Returns
// false if the kernarg segment pointer input could not be loaded.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
      ST.getTargetLowering()->getImplicitParameterOffset(
          MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(Reg: DstReg);
  // Offset type with the same width as the (pointer-typed) destination.
  LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
  if (!loadInputValue(DstReg: KernargPtrReg, B,
                      ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  B.buildObjectPtrOffset(Res: DstReg, Op0: KernargPtrReg,
                         Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
  return true;
}
6024
/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(i: 0).getReg();
  Register Pointer = MI.getOperand(i: 2).getReg();
  Register Stride = MI.getOperand(i: 3).getReg();
  Register NumRecords = MI.getOperand(i: 4).getReg();
  Register Flags = MI.getOperand(i: 5).getReg();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S64 = LLT::scalar(SizeInBits: 64);

  // Emit the expansion after MI rather than before it.
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    Register Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    LLT PtrIntTy = LLT::scalar(SizeInBits: MRI.getType(Reg: Pointer).getSizeInBits());
    auto PointerInt = B.buildPtrToInt(Dst: PtrIntTy, Src: Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(Res: S64, Op: PointerInt);
    auto NumRecordsLHS = B.buildShl(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 57));
    Register LowHalf = B.buildOr(Dst: S64, Src0: ExtPointer, Src1: NumRecordsLHS).getReg(Idx: 0);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    auto NumRecordsRHS = B.buildLShr(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 7));
    // Widen the 32-bit stride/flag fields to 64 bits pre-shifted into the
    // upper half by merging with a zero low word.
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: B.buildConstant(Res: S32, Val: 12));
    auto ExtShiftedStride =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedStride.getReg(Idx: 0)});
    auto ShiftedFlags = B.buildShl(Dst: S32, Src0: Flags, Src1: B.buildConstant(Res: S32, Val: 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedFlags.getReg(Idx: 0)});
    auto CombinedFields = B.buildOr(Dst: S64, Src0: NumRecordsRHS, Src1: ExtShiftedStride);
    Register HighHalf =
        B.buildOr(Dst: S64, Src0: CombinedFields, Src1: ExtShiftedFlags).getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, HighHalf});
  } else {
    // Classic v4i32 resource: keep the low 16 bits of the pointer's high
    // word, replace its upper 16 bits with the stride, and append the
    // 32-bit num_records and flags words.
    NumRecords = B.buildTrunc(Res: S32, Op: NumRecords).getReg(Idx: 0);
    auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
    auto LowHalf = Unmerge.getReg(Idx: 0);
    auto HighHalf = Unmerge.getReg(Idx: 1);

    auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
    auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
    auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
    auto NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  return true;
}
6085
6086bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6087 MachineRegisterInfo &MRI,
6088 MachineIRBuilder &B) const {
6089 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6090 if (!MFI->isEntryFunction()) {
6091 return legalizePreloadedArgIntrin(MI, MRI, B,
6092 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6093 }
6094
6095 Register DstReg = MI.getOperand(i: 0).getReg();
6096 if (!getImplicitArgPtr(DstReg, MRI, B))
6097 return false;
6098
6099 MI.eraseFromParent();
6100 return true;
6101}
6102
6103bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6104 MachineRegisterInfo &MRI,
6105 MachineIRBuilder &B) const {
6106 Function &F = B.getMF().getFunction();
6107 std::optional<uint32_t> KnownSize =
6108 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6109 if (KnownSize.has_value())
6110 B.buildConstant(Res: DstReg, Val: *KnownSize);
6111 return false;
6112}
6113
6114bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6115 MachineRegisterInfo &MRI,
6116 MachineIRBuilder &B) const {
6117
6118 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6119 if (!MFI->isEntryFunction()) {
6120 return legalizePreloadedArgIntrin(MI, MRI, B,
6121 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6122 }
6123
6124 Register DstReg = MI.getOperand(i: 0).getReg();
6125 if (!getLDSKernelId(DstReg, MRI, B))
6126 return false;
6127
6128 MI.eraseFromParent();
6129 return true;
6130}
6131
// Legalize an address-space test of a flat pointer: decide membership in
// \p AddrSpace by examining the high 32 bits of the 64-bit pointer.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  auto Unmerge = B.buildUnmerge(Res: S32, Op: MI.getOperand(i: 2).getReg());
  Register Hi32 = Unmerge.getReg(Idx: 1);

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
      ST.hasGloballyAddressableScratch()) {
    Register FlatScratchBaseHi =
        B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
                     SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(Idx: 0);
    MRI.setRegClass(Reg: FlatScratchBaseHi, RC: &AMDGPU::SReg_32RegClass);
    // Test bits 63..58 against the aperture address: xor the high words and
    // check that the top 6 bits cancelled, i.e. the result is < 2^26.
    Register XOR = B.buildXor(Dst: S32, Src0: Hi32, Src1: FlatScratchBaseHi).getReg(Idx: 0);
    B.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: MI.getOperand(i: 0), Op0: XOR,
                Op1: B.buildConstant(Res: S32, Val: 1u << 26));
  } else {
    // Otherwise the pointer belongs to the address space iff its high word
    // equals that space's aperture base.
    Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
    B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
  }
  MI.eraseFromParent();
  return true;
}
6158
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// Returns {voffset register, immediate offset}.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
  // being added, so we can only safely match a 32-bit addition with no unsigned
  // overflow.
  bool CheckNUW = ST.hasGFX1250Insts();
  std::tie(args&: BaseReg, args&: ImmOffset) = AMDGPU::getBaseWithConstantOffset(
      MRI, Reg: OrigOffset, /*KnownBits=*/ValueTracking: nullptr, CheckNUW);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(Reg: BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Negative constant: fold the entire offset into the vgpr part.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow into the voffset register, adding to the matched base
  // if there is one.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
    } else {
      auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
      BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
    }
  }

  // The instruction still needs a voffset register even when it is zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);

  return std::pair(BaseReg, ImmOffset);
}
6214
/// Handle register layout difference for f16 images for some subtargets.
/// Repacks a vector-of-s16 data value \p Reg into the layout the subtarget's
/// D16 memory instructions expect, returning the (possibly rewritten)
/// register.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each 16-bit element occupies its own 32-bit register.
    auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
        .getReg(Idx: 0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Workaround: widen the packed data with undef elements to the next
    // 32-bit-register boundary plus one, per element count.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
      PackedRegs.push_back(Elt: Reg);
      PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
      return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
          .getReg(Idx: 0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
      PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
      Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
      return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
      auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
      PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
      return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
          .getReg(Idx: 0);
    }

    llvm_unreachable("invalid data type");
  }

  // v3s16 is not a legal register layout; pad to v4s16 with undef.
  if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
    Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
              .getReg(Idx: 0);
  }
  return Reg;
}
6278
6279Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6280 Register VData, LLT MemTy,
6281 bool IsFormat) const {
6282 MachineRegisterInfo *MRI = B.getMRI();
6283 LLT Ty = MRI->getType(Reg: VData);
6284
6285 const LLT S16 = LLT::scalar(SizeInBits: 16);
6286
6287 // Fixup buffer resources themselves needing to be v4i128.
6288 if (hasBufferRsrcWorkaround(Ty))
6289 return castBufferRsrcToV4I32(Pointer: VData, B);
6290
6291 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6292 Ty = getBitcastRegisterType(Ty);
6293 VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
6294 }
6295 // Fixup illegal register types for i8 stores.
6296 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
6297 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
6298 return AnyExt;
6299 }
6300
6301 if (Ty.isVector()) {
6302 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6303 if (IsFormat)
6304 return handleD16VData(B, MRI&: *MRI, Reg: VData);
6305 }
6306 }
6307
6308 return VData;
6309}
6310
/// Legalize a raw/struct buffer store intrinsic into a generic
/// G_AMDGPU_BUFFER_STORE* / G_AMDGPU_TBUFFER_STORE* pseudo.
///
/// \p IsTyped selects the tbuffer (format-immediate) variants; \p IsFormat
/// selects the format (D16-capable) variants. Returns true on success.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              LegalizerHelper &Helper,
                                              bool IsTyped,
                                              bool IsFormat) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VData = MI.getOperand(i: 1).getReg();
  LLT Ty = MRI.getType(Reg: VData);
  LLT EltTy = Ty.getScalarType();
  // D16 applies only to the format variants with 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();
  LLT MemTy = MMO->getMemoryType();

  // Rewrite the stored value into the register layout the pseudo expects
  // (p8 workaround, bitcasts, sub-dword widening, D16 packing).
  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);

  castBufferRsrcArgToV4I32(MI, B, Idx: 2);
  Register RSrc = MI.getOperand(i: 2).getReg();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex operand; use a zero index for the pseudo.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();

  // Split the offset into a register part and an immediate part that fits
  // the instruction's immediate offset field.
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Plain (non-format) stores pick byte/short pseudos for sub-dword
    // memory sizes.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opcode: Opc)
    .addUse(RegNo: VData)              // vdata
    .addUse(RegNo: RSrc)               // rsrc
    .addUse(RegNo: VIndex)             // vindex
    .addUse(RegNo: VOffset)            // voffset
    .addUse(RegNo: SOffset)            // soffset
    .addImm(Val: ImmOffset);          // offset(imm)

  // Typed stores carry the format immediate between offset and aux data.
  if (IsTyped)
    MIB.addImm(Val: Format);

  MIB.addImm(Val: AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6401
6402static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6403 Register VIndex, Register VOffset, Register SOffset,
6404 unsigned ImmOffset, unsigned Format,
6405 unsigned AuxiliaryData, MachineMemOperand *MMO,
6406 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6407 auto MIB = B.buildInstr(Opcode: Opc)
6408 .addDef(RegNo: LoadDstReg) // vdata
6409 .addUse(RegNo: RSrc) // rsrc
6410 .addUse(RegNo: VIndex) // vindex
6411 .addUse(RegNo: VOffset) // voffset
6412 .addUse(RegNo: SOffset) // soffset
6413 .addImm(Val: ImmOffset); // offset(imm)
6414
6415 if (IsTyped)
6416 MIB.addImm(Val: Format);
6417
6418 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6419 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6420 .addMemOperand(MMO);
6421}
6422
/// Legalize a raw/struct buffer load intrinsic into a generic
/// G_AMDGPU_BUFFER_LOAD* / G_AMDGPU_TBUFFER_LOAD* pseudo, repacking the
/// result register(s) as the subtarget requires (TFE status splitting,
/// sub-dword truncation, unpacked D16 repacking).
///
/// Returns false for combinations that are not handled yet (TFE on typed or
/// D16 format loads).
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  Register Dst = MI.getOperand(i: 0).getReg();

  // TFE variants have a second def receiving the status dword.
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(i: 1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
  Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; use a zero index for the pseudo.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Reg: Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Observer.changingInstr(MI);
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    // The cast rewrote the def; refresh Dst and the insert point.
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Observer.changingInstr(MI);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    Observer.changedInstr(MI);
    // bitcastDst replaced the def; refresh Dst and the insert point.
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the offset into a register part and an immediate part that fits
  // the instruction's immediate offset field.
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    // Plain loads select a pseudo by the accessed memory size.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // The TFE pseudo produces the value dwords plus one trailing status
    // dword in a single vector; unmerge it into Dst and StatusDst.
    unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      // Sub-dword result: extract the value dword and truncate it to Dst.
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
      B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
      B.buildTrunc(Res: Dst, Op: ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
    } else {
      // Multi-dword value: unmerge everything, drop the status element, and
      // re-merge the value dwords into Dst.
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
      LoadElts.push_back(Elt: StatusDst);
      B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
      LoadElts.truncate(N: NumValueDWords);
      B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Sub-dword (or scalar d16) loads produce a full s32; truncate after
    // the load instruction.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: Dst, Op: LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked d16: each 16-bit element occupies a full dword; truncate
    // element-wise and repack into the requested vector type.
    LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
    B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
  } else {
    // No repacking needed; load directly into Dst.
    buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
6583
6584static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6585 switch (IntrID) {
6586 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6588 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6589 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6590 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6591 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6593 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6594 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6595 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6596 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6597 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6598 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6599 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6600 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6601 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6603 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6606 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6607 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6608 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6610 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6611 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6613 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6614 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6615 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6616 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6617 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6618 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6621 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6623 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6624 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6625 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6626 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6627 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6628 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6629 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6631 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6632 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6633 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6634 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6635 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6636 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6638 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6640 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6641 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6643 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6644 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6646 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6648 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6649 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6650 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6651 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6653 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6654 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6656 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6657 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6658 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6659 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6660 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6661 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6662 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6663 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6664 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6665 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6666 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6667 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6668 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6669 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6670 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6671 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6672 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6673 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6674 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6675 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6676 default:
6677 llvm_unreachable("unhandled atomic opcode");
6678 }
6679}
6680
/// Legalize a raw/struct buffer atomic intrinsic into the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo. Compare-and-swap carries one extra
/// value operand (the compare value) ahead of the rsrc.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(i: 0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself
  Register VData = MI.getOperand(i: 2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(i: 3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
  Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; use a zero index for the pseudo.
    VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the offset into a register part and an immediate part that fits
  // the instruction's immediate offset field.
  unsigned ImmOffset;
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
                 .addDef(RegNo: Dst)
                 .addUse(RegNo: VData); // vdata

  // The compare value goes directly after vdata.
  if (IsCmpSwap)
    MIB.addReg(RegNo: CmpVal);

  MIB.addUse(RegNo: RSrc)               // rsrc
      .addUse(RegNo: VIndex)            // vindex
      .addUse(RegNo: VOffset)           // voffset
      .addUse(RegNo: SOffset)           // soffset
      .addImm(Val: ImmOffset)          // offset(imm)
      .addImm(Val: AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
      .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6745
/// Pack the 16-bit address operands of the image intrinsic \p MI into
/// dword-sized V2S16 registers appended to \p PackedAddrs.
///
/// Operands are read from indices \p ArgOffset + Intr->VAddrStart up to
/// Intr->VAddrEnd. \p IsA16 / \p IsG16 indicate that the coordinate /
/// gradient operands respectively are 16-bit; operands outside those groups
/// (or when the corresponding flag is off) are passed through as-is
/// (bitcast to V2S16), while 16-bit operands are packed two per dword.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // This branch handles operands that remain full dwords: everything
    // before the gradients, gradients when !IsG16, and coordinates when
    // !IsA16.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(Reg: AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
                .getReg(Idx: 0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
        PackedAddrs.push_back(Elt: AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
                .getReg(Idx: 0));
      } else {
        // Pack this operand together with the next one, and skip the next.
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(
                   Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
                .getReg(Idx: 0));
        ++I;
      }
    }
  }
}
6806
6807/// Convert from separate vaddr components to a single vector address register,
6808/// and replace the remaining operands with $noreg.
6809static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6810 int DimIdx, int NumVAddrs) {
6811 const LLT S32 = LLT::scalar(SizeInBits: 32);
6812 (void)S32;
6813 SmallVector<Register, 8> AddrRegs;
6814 for (int I = 0; I != NumVAddrs; ++I) {
6815 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6816 if (SrcOp.isReg()) {
6817 AddrRegs.push_back(Elt: SrcOp.getReg());
6818 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6819 }
6820 }
6821
6822 int NumAddrRegs = AddrRegs.size();
6823 if (NumAddrRegs != 1) {
6824 auto VAddr =
6825 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6826 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6827 }
6828
6829 for (int I = 1; I != NumVAddrs; ++I) {
6830 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6831 if (SrcOp.isReg())
6832 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
6833 }
6834}
6835
6836/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6837///
6838/// Depending on the subtarget, load/store with 16-bit element data need to be
6839/// rewritten to use the low half of 32-bit registers, or directly use a packed
6840/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6841/// registers.
6842///
6843/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
6845/// want a selected instruction entering RegBankSelect. In order to avoid
6846/// defining a multitude of intermediate image instructions, directly hack on
6847/// the intrinsic's arguments. In cases like a16 addresses, this requires
6848/// padding now unnecessary arguments with $noreg.
6849bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6850 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6851 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6852
6853 const MachineFunction &MF = *MI.getMF();
6854 const unsigned NumDefs = MI.getNumExplicitDefs();
6855 const unsigned ArgOffset = NumDefs + 1;
6856 bool IsTFE = NumDefs == 2;
6857 // We are only processing the operands of d16 image operations on subtargets
6858 // that use the unpacked register layout, or need to repack the TFE result.
6859
6860 // TODO: Do we need to guard against already legalized intrinsics?
6861 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6862 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6863
6864 MachineRegisterInfo *MRI = B.getMRI();
6865 const LLT S32 = LLT::scalar(SizeInBits: 32);
6866 const LLT S16 = LLT::scalar(SizeInBits: 16);
6867 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6868
6869 unsigned DMask = 0;
6870 Register VData;
6871 LLT Ty;
6872
6873 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6874 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6875 Ty = MRI->getType(Reg: VData);
6876 }
6877
6878 const bool IsAtomicPacked16Bit =
6879 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6880 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6881
6882 // Check for 16 bit addresses and pack if true.
6883 LLT GradTy =
6884 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6885 LLT AddrTy =
6886 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6887 const bool IsG16 =
6888 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6889 const bool IsA16 = AddrTy == S16;
6890 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6891
6892 int DMaskLanes = 0;
6893 if (!BaseOpcode->Atomic) {
6894 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6895 if (BaseOpcode->Gather4) {
6896 DMaskLanes = 4;
6897 } else if (DMask != 0) {
6898 DMaskLanes = llvm::popcount(Value: DMask);
6899 } else if (!IsTFE && !BaseOpcode->Store) {
6900 // If dmask is 0, this is a no-op load. This can be eliminated.
6901 B.buildUndef(Res: MI.getOperand(i: 0));
6902 MI.eraseFromParent();
6903 return true;
6904 }
6905 }
6906
6907 Observer.changingInstr(MI);
6908 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
6909
6910 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6911 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6912 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6913 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6914 unsigned NewOpcode = LoadOpcode;
6915 if (BaseOpcode->Store)
6916 NewOpcode = StoreOpcode;
6917 else if (BaseOpcode->NoReturn)
6918 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6919
6920 // Track that we legalized this
6921 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6922
6923 // Expecting to get an error flag since TFC is on - and dmask is 0 Force
6924 // dmask to be at least 1 otherwise the instruction will fail
6925 if (IsTFE && DMask == 0) {
6926 DMask = 0x1;
6927 DMaskLanes = 1;
6928 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6929 }
6930
6931 if (BaseOpcode->Atomic) {
6932 Register VData0 = MI.getOperand(i: 2).getReg();
6933 LLT Ty = MRI->getType(Reg: VData0);
6934
6935 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6936 if (Ty.isVector() && !IsAtomicPacked16Bit)
6937 return false;
6938
6939 if (BaseOpcode->AtomicX2) {
6940 Register VData1 = MI.getOperand(i: 3).getReg();
6941 // The two values are packed in one register.
6942 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6943 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6944 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6945 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
6946 }
6947 }
6948
6949 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6950
6951 // Rewrite the addressing register layout before doing anything else.
6952 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6953 // 16 bit gradients are supported, but are tied to the A16 control
6954 // so both gradients and addresses must be 16 bit
6955 return false;
6956 }
6957
6958 if (IsA16 && !ST.hasA16()) {
6959 // A16 not supported
6960 return false;
6961 }
6962
6963 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6964 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6965
6966 if (IsA16 || IsG16) {
6967 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6968 // instructions expect VGPR_32
6969 SmallVector<Register, 4> PackedRegs;
6970
6971 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6972
6973 // See also below in the non-a16 branch
6974 const bool UseNSA = ST.hasNSAEncoding() &&
6975 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6976 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6977 const bool UsePartialNSA =
6978 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6979
6980 if (UsePartialNSA) {
6981 // Pack registers that would go over NSAMaxSize into last VAddr register
6982 LLT PackedAddrTy =
6983 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6984 auto Concat = B.buildConcatVectors(
6985 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6986 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6987 PackedRegs.resize(N: NSAMaxSize);
6988 } else if (!UseNSA && PackedRegs.size() > 1) {
6989 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6990 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6991 PackedRegs[0] = Concat.getReg(Idx: 0);
6992 PackedRegs.resize(N: 1);
6993 }
6994
6995 const unsigned NumPacked = PackedRegs.size();
6996 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6997 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6998 if (!SrcOp.isReg()) {
6999 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7000 continue;
7001 }
7002
7003 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7004
7005 if (I - Intr->VAddrStart < NumPacked)
7006 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7007 else
7008 SrcOp.setReg(AMDGPU::NoRegister);
7009 }
7010 } else {
7011 // If the register allocator cannot place the address registers contiguously
7012 // without introducing moves, then using the non-sequential address encoding
7013 // is always preferable, since it saves VALU instructions and is usually a
7014 // wash in terms of code size or even better.
7015 //
7016 // However, we currently have no way of hinting to the register allocator
7017 // that MIMG addresses should be placed contiguously when it is possible to
7018 // do so, so force non-NSA for the common 2-address case as a heuristic.
7019 //
7020 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7021 // allocation when possible.
7022 //
7023 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7024 // set of the remaining addresses.
7025 const bool UseNSA = ST.hasNSAEncoding() &&
7026 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7027 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7028 const bool UsePartialNSA =
7029 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7030
7031 if (UsePartialNSA) {
7032 convertImageAddrToPacked(B, MI,
7033 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7034 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
7035 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7036 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
7037 NumVAddrs: Intr->NumVAddrs);
7038 }
7039 }
7040
7041 int Flags = 0;
7042 if (IsA16)
7043 Flags |= 1;
7044 if (IsG16)
7045 Flags |= 2;
7046 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
7047
7048 if (BaseOpcode->NoReturn) { // No TFE for stores?
7049 // TODO: Handle dmask trim
7050 if (!Ty.isVector() || !IsD16)
7051 return true;
7052
7053 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
7054 if (RepackedReg != VData) {
7055 MI.getOperand(i: 1).setReg(RepackedReg);
7056 }
7057
7058 return true;
7059 }
7060
7061 Register DstReg = MI.getOperand(i: 0).getReg();
7062 const LLT EltTy = Ty.getScalarType();
7063 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7064
7065 // Confirm that the return type is large enough for the dmask specified
7066 if (NumElts < DMaskLanes)
7067 return false;
7068
7069 if (NumElts > 4 || DMaskLanes > 4)
7070 return false;
7071
7072 // Image atomic instructions are using DMask to specify how many bits
7073 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7074 // DMaskLanes for image atomic has default value '0'.
7075 // We must be sure that atomic variants (especially packed) will not be
7076 // truncated from v2s16 or v4s16 to s16 type.
7077 //
7078 // ChangeElementCount will be needed for image load where Ty is always scalar.
7079 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7080 const LLT AdjustedTy =
7081 DMaskLanes == 0
7082 ? Ty
7083 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
7084
7085 // The raw dword aligned data component of the load. The only legal cases
7086 // where this matters should be when using the packed D16 format, for
7087 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7088 LLT RoundedTy;
7089
7090 // S32 vector to cover all data, plus TFE result element.
7091 LLT TFETy;
7092
7093 // Register type to use for each loaded component. Will be S32 or V2S16.
7094 LLT RegTy;
7095
7096 if (IsD16 && ST.hasUnpackedD16VMem()) {
7097 RoundedTy =
7098 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
7099 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
7100 RegTy = S32;
7101 } else {
7102 unsigned EltSize = EltTy.getSizeInBits();
7103 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7104 unsigned RoundedSize = 32 * RoundedElts;
7105 RoundedTy = LLT::scalarOrVector(
7106 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
7107 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
7108 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7109 }
7110
7111 // The return type does not need adjustment.
7112 // TODO: Should we change s16 case to s32 or <2 x s16>?
7113 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7114 return true;
7115
7116 Register Dst1Reg;
7117
7118 // Insert after the instruction.
7119 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
7120
7121 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7122 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7123 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7124 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7125
7126 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
7127
7128 MI.getOperand(i: 0).setReg(NewResultReg);
7129
7130 // In the IR, TFE is supposed to be used with a 2 element struct return
7131 // type. The instruction really returns these two values in one contiguous
7132 // register, with one additional dword beyond the loaded data. Rewrite the
7133 // return type to use a single register result.
7134
7135 if (IsTFE) {
7136 Dst1Reg = MI.getOperand(i: 1).getReg();
7137 if (MRI->getType(Reg: Dst1Reg) != S32)
7138 return false;
7139
7140 // TODO: Make sure the TFE operand bit is set.
7141 MI.removeOperand(OpNo: 1);
7142
7143 // Handle the easy case that requires no repack instructions.
7144 if (Ty == S32) {
7145 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
7146 return true;
7147 }
7148 }
7149
7150 // Now figure out how to copy the new result register back into the old
7151 // result.
7152 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7153
7154 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7155
7156 if (ResultNumRegs == 1) {
7157 assert(!IsTFE);
7158 ResultRegs[0] = NewResultReg;
7159 } else {
7160 // We have to repack into a new vector of some kind.
7161 for (int I = 0; I != NumDataRegs; ++I)
7162 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
7163 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
7164
7165 // Drop the final TFE element to get the data part. The TFE result is
7166 // directly written to the right place already.
7167 if (IsTFE)
7168 ResultRegs.resize(N: NumDataRegs);
7169 }
7170
7171 // For an s16 scalar result, we form an s32 result with a truncate regardless
7172 // of packed vs. unpacked.
7173 if (IsD16 && !Ty.isVector()) {
7174 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
7175 return true;
7176 }
7177
7178 // Avoid a build/concat_vector of 1 entry.
7179 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7180 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
7181 return true;
7182 }
7183
7184 assert(Ty.isVector());
7185
7186 if (IsD16) {
7187 // For packed D16 results with TFE enabled, all the data components are
7188 // S32. Cast back to the expected type.
7189 //
7190 // TODO: We don't really need to use load s32 elements. We would only need one
7191 // cast for the TFE result if a multiple of v2s16 was used.
7192 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7193 for (Register &Reg : ResultRegs)
7194 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
7195 } else if (ST.hasUnpackedD16VMem()) {
7196 for (Register &Reg : ResultRegs)
7197 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
7198 }
7199 }
7200
7201 auto padWithUndef = [&](LLT Ty, int NumElts) {
7202 if (NumElts == 0)
7203 return;
7204 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
7205 for (int I = 0; I != NumElts; ++I)
7206 ResultRegs.push_back(Elt: Undef);
7207 };
7208
7209 // Pad out any elements eliminated due to the dmask.
7210 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
7211 if (!ResTy.isVector()) {
7212 padWithUndef(ResTy, NumElts - ResultRegs.size());
7213 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
7214 return true;
7215 }
7216
7217 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7218 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7219
7220 // Deal with the one annoying legal case.
7221 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
7222 if (Ty == V3S16) {
7223 if (IsTFE) {
7224 if (ResultRegs.size() == 1) {
7225 NewResultReg = ResultRegs[0];
7226 } else if (ResultRegs.size() == 2) {
7227 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
7228 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
7229 } else {
7230 return false;
7231 }
7232 }
7233
7234 if (MRI->getType(Reg: DstReg).getNumElements() <
7235 MRI->getType(Reg: NewResultReg).getNumElements()) {
7236 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
7237 } else {
7238 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
7239 }
7240 return true;
7241 }
7242
7243 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7244 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
7245 return true;
7246}
7247
// Legalize an s.buffer.load intrinsic into a G_AMDGPU_S_BUFFER_LOAD* pseudo:
// pick the byte/short/dword variant, apply the buffer-resource pointer
// workaround cast where required, attach the memory operand the readnone
// intrinsic form lacks, and widen non-power-of-2 result sizes.
bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(i: 0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(Reg: OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  // Sub-dword results use the dedicated ubyte/ushort opcodes when the
  // subtarget supports scalar subword loads; otherwise the generic dword
  // pseudo is used with the original destination.
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
    // destination register.
    Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    // The bitcast moved the insert point; restore it to the instruction.
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opcode: Opc));
  MI.removeOperand(OpNo: 1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  const unsigned MemSize = (Size + 7) / 8; // Round bit size up to whole bytes.
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: MemSize, BaseAlignment: MemAlign);
  MI.addMemOperand(MF, MO: MMO);
  // For the subword case the pseudo produced a 32-bit temporary; truncate it
  // back down to the original narrow destination after the load.
  if (Dst != OrigDst) {
    MI.getOperand(i: 0).setReg(Dst);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: OrigDst, Op: Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
    else
      Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
  }

  Observer.changedInstr(MI);
  return true;
}
7319
7320bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7321 MachineInstr &MI) const {
7322 MachineIRBuilder &B = Helper.MIRBuilder;
7323 GISelChangeObserver &Observer = Helper.Observer;
7324 Observer.changingInstr(MI);
7325 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7326 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
7327 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
7328 Observer.changedInstr(MI);
7329 return true;
7330}
7331
7332// TODO: Move to selection
7333bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7334 MachineRegisterInfo &MRI,
7335 MachineIRBuilder &B) const {
7336 if (!ST.hasTrapHandler() ||
7337 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7338 return legalizeTrapEndpgm(MI, MRI, B);
7339
7340 return ST.supportsGetDoorbellID() ?
7341 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7342}
7343
// Lower G_TRAP when no trap handler is usable by terminating the program with
// S_ENDPGM, splitting the block if the trap is not already at a function exit.
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  // Easy case: the trap is the last instruction of a block with no
  // successors, so S_ENDPGM can directly replace it.
  if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
    BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
      .addImm(Val: 0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the
  // end of the block.
  BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(MBB: TrapBB);
  // Put S_ENDPGM in its own block and branch to it with S_CBRANCH_EXECNZ, so
  // the terminator requirement is satisfied without rewriting successor phis.
  BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
    .addImm(Val: 0);
  BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(MBB: TrapBB);

  BB.addSuccessor(Succ: TrapBB);
  MI.eraseFromParent();
  return true;
}
7372
// Lower G_TRAP for the HSA ABI on targets without s_sendmsg doorbell support:
// the trap handler expects the queue pointer in SGPR0_SGPR1, so load it
// (either from the COV5 implicit kernarg or the preloaded queue-ptr input)
// and then issue S_TRAP with the LLVM AMDHSA trap ID.
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

    // Requires the kernarg segment pointer input to be available.
    if (!loadInputValue(DstReg: KernargPtrReg, B,
                        ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo: PtrInfo.getWithOffset(O: Offset),
        f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
    B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
                           Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
    // Load address
    Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
    // Copy the loaded queue pointer into the fixed SGPR pair the trap handler
    // reads, and mark it implicitly used by S_TRAP to keep the copy alive.
    B.buildCopy(Res: SGPR01, Op: Temp);
    B.buildInstr(Opcode: AMDGPU::S_TRAP)
        .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(Res: SGPR01, Op: LiveIn);
  B.buildInstr(Opcode: AMDGPU::S_TRAP)
      .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(RegNo: SGPR01, Flags: RegState::Implicit);

  MI.eraseFromParent();
  return true;
}
7432
7433bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7434 MachineRegisterInfo &MRI,
7435 MachineIRBuilder &B) const {
7436 // We need to simulate the 's_trap 2' instruction on targets that run in
7437 // PRIV=1 (where it is treated as a nop).
7438 if (ST.hasPrivEnabledTrap2NopBug()) {
7439 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7440 DL: MI.getDebugLoc());
7441 MI.eraseFromParent();
7442 return true;
7443 }
7444
7445 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7446 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7447 MI.eraseFromParent();
7448 return true;
7449}
7450
7451bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7452 MachineRegisterInfo &MRI,
7453 MachineIRBuilder &B) const {
7454 // Is non-HSA path or trap-handler disabled? Then, report a warning
7455 // accordingly
7456 if (!ST.hasTrapHandler() ||
7457 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7458 Function &Fn = B.getMF().getFunction();
7459 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7460 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7461 } else {
7462 // Insert debug-trap instruction
7463 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7464 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7465 }
7466
7467 MI.eraseFromParent();
7468 return true;
7469}
7470
// Lower amdgcn.image.bvh.intersect.ray to G_AMDGPU_BVH_INTERSECT_RAY.
// Selects the MIMG opcode variant from the subtarget generation, NSA
// availability, node-pointer width (32 vs 64 bit) and whether the ray
// direction/inverse-direction use 16-bit components (a16), then packs the
// scalar and vector arguments into the address-operand layout that variant
// expects.
bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);

  // Intrinsic operands: result, (intrinsic ID), node pointer, ray extent,
  // ray origin, ray direction, inverse ray direction, texture descriptor.
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register NodePtr = MI.getOperand(i: 2).getReg();
  Register RayExtent = MI.getOperand(i: 3).getReg();
  Register RayOrigin = MI.getOperand(i: 4).getReg();
  Register RayDir = MI.getOperand(i: 5).getReg();
  Register RayInvDir = MI.getOperand(i: 6).getReg();
  Register TDescr = MI.getOperand(i: 7).getReg();

  // The instruction only exists with the GFX10_A encoding feature.
  if (!ST.hasGFX10_AEncoding()) {
    Function &Fn = B.getMF().getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
  // a16: direction vectors carry 16-bit elements; Is64: 64-bit node pointer.
  const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  // On GFX11+ the address operands are grouped into registers rather than
  // passed as individual dwords.
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  // Indexed by [Is64][IsA16].
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                   MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11    ? AMDGPU::MIMGEncGfx11NSA
                                                : AMDGPU::MIMGEncGfx10NSA,
                                   VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                   MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    // GFX11+ NSA form: each xyz vector is re-merged into a single v3s32
    // address register.
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
      auto Merged = B.buildMergeLikeInstr(
          Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
      Ops.push_back(Elt: Merged.getReg(Idx: 0));
    };

    Ops.push_back(Elt: NodePtr);
    Ops.push_back(Elt: RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      // Interleave the s16 components of inv-dir and dir pairwise into v2s16
      // lanes, bitcast to s32, and merge into one v3s32 address register.
      auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          Res: V3S32,
          Ops: {B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
                                                UnmergeRayDir.getReg(Idx: 0)}))
               .getReg(Idx: 0),
           B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
                                                UnmergeRayDir.getReg(Idx: 1)}))
               .getReg(Idx: 0),
           B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
                                                UnmergeRayDir.getReg(Idx: 2)}))
               .getReg(Idx: 0)});
      Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    // Pre-GFX11 form: flatten everything into individual s32 address dwords.
    if (Is64) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
      Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
    } else {
      Ops.push_back(Elt: NodePtr);
    }
    Ops.push_back(Elt: RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
      Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      // Pack the six s16 direction/inverse-direction components into three
      // s32 dwords: (dir0,dir1), (dir2,inv0), (inv1,inv2).
      auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
      Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
      Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
      B.buildMergeLikeInstr(Res: R1,
                            Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
      B.buildMergeLikeInstr(
          Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
      B.buildMergeLikeInstr(
          Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
      Ops.push_back(Elt: R1);
      Ops.push_back(Elt: R2);
      Ops.push_back(Elt: R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
    Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
    Ops.clear();
    Ops.push_back(Elt: MergedOps);
  }

  // Emit the pseudo: result, selected MIMG opcode, packed address operands,
  // descriptor, a16 flag, plus the original memory references.
  auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
    .addDef(RegNo: DstReg)
    .addImm(Val: Opcode);

  for (Register R : Ops) {
    MIB.addUse(RegNo: R);
  }

  MIB.addUse(RegNo: TDescr)
     .addImm(Val: IsA16 ? 1 : 0)
     .cloneMemRefs(OtherMI: MI);

  MI.eraseFromParent();
  return true;
}
7622
// Lower amdgcn.image.bvh8.intersect.ray and
// amdgcn.image.bvh.dual.intersect.ray to the corresponding G_AMDGPU_BVH*
// pseudo. Both variants define three results (hit data plus updated ray
// origin and direction) and take the ray extent packed together with the
// any-extended instance mask as a single v2s32 operand.
bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);

  Register DstReg = MI.getOperand(i: 0).getReg();
  Register DstOrigin = MI.getOperand(i: 1).getReg();
  Register DstDir = MI.getOperand(i: 2).getReg();
  Register NodePtr = MI.getOperand(i: 4).getReg();
  Register RayExtent = MI.getOperand(i: 5).getReg();
  Register InstanceMask = MI.getOperand(i: 6).getReg();
  Register RayOrigin = MI.getOperand(i: 7).getReg();
  Register RayDir = MI.getOperand(i: 8).getReg();
  Register Offsets = MI.getOperand(i: 9).getReg();
  Register TDescr = MI.getOperand(i: 10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {
    Function &Fn = B.getMF().getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    return false;
  }

  // Distinguish the bvh8 variant from the dual variant by intrinsic ID; they
  // differ only in base opcode and address-dword count.
  bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
                Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
  int Opcode = AMDGPU::getMIMGOpcode(
      BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  assert(Opcode != -1);

  // Pack (ray extent, zext/anyext instance mask) into one v2s32 operand.
  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});

  B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
      .addDef(RegNo: DstReg)
      .addDef(RegNo: DstOrigin)
      .addDef(RegNo: DstDir)
      .addImm(Val: Opcode)
      .addUse(RegNo: NodePtr)
      .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
      .addUse(RegNo: RayOrigin)
      .addUse(RegNo: RayDir)
      .addUse(RegNo: Offsets)
      .addUse(RegNo: TDescr)
      .cloneMemRefs(OtherMI: MI);

  MI.eraseFromParent();
  return true;
}
7676
7677bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7678 MachineIRBuilder &B) const {
7679 const SITargetLowering *TLI = ST.getTargetLowering();
7680 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7681 Register DstReg = MI.getOperand(i: 0).getReg();
7682 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7683 MI.eraseFromParent();
7684 return true;
7685}
7686
7687bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7688 MachineIRBuilder &B) const {
7689 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7690 if (!ST.hasArchitectedSGPRs())
7691 return false;
7692 LLT S32 = LLT::scalar(SizeInBits: 32);
7693 Register DstReg = MI.getOperand(i: 0).getReg();
7694 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7695 auto LSB = B.buildConstant(Res: S32, Val: 25);
7696 auto Width = B.buildConstant(Res: S32, Val: 5);
7697 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7698 MI.eraseFromParent();
7699 return true;
7700}
7701
// Lower an intrinsic that reads a constant hardware-register field by emitting
// S_GETREG_B32_const with the encoded (register, low bit, width) immediate.
// \p HwReg selects the hardware register, \p LowBit/\p Width select the
// bitfield within it.
bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
                                                 MachineIRBuilder &B,
                                                 AMDGPU::Hwreg::Id HwReg,
                                                 unsigned LowBit,
                                                 unsigned Width) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(i: 0).getReg();
  // S_GETREG writes a 32-bit SGPR; pin the register class if nothing has
  // constrained the result yet.
  if (!MRI.getRegClassOrNull(Reg: DstReg))
    MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32RegClass);
  B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
      .addDef(RegNo: DstReg)
      .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width));
  MI.eraseFromParent();
  return true;
}
7717
// Encoded hwreg immediate selecting bits [22:0] of the MODE register — the
// mode component of the FP environment read/written by get.fpenv/set.fpenv.
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);

// Encoded hwreg immediate selecting the low 5 bits of TRAPSTS — the
// trap-status component of the FP environment.
static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
7723
// Lower get.fpenv for an s64 result by reading the MODE and TRAPSTS hardware
// register fields with two s_getreg intrinsics and merging the s32 halves
// (mode in the low half, trap status in the high half).
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // NOTE(review): operand 0 is the intrinsic's *result* despite the local
  // name "Src".
  Register Src = MI.getOperand(i: 0).getReg();
  if (MRI.getType(Reg: Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(Val: FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(Val: FPEnvTrapBitField);
  B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}
7743
// Lower set.fpenv for an s64 input: split the value into two s32 halves and
// write them back to the MODE and TRAPSTS hardware register fields with two
// s_setreg intrinsics (low half -> MODE, high half -> TRAPSTS), mirroring the
// layout produced by legalizeGetFPEnv.
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(i: 0).getReg();
  if (MRI.getType(Reg: Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
  B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
      .addReg(RegNo: Unmerge.getReg(Idx: 0));
  B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(RegNo: Unmerge.getReg(Idx: 1));
  MI.eraseFromParent();
  return true;
}
7763
7764bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7765 MachineInstr &MI) const {
7766 MachineIRBuilder &B = Helper.MIRBuilder;
7767 MachineRegisterInfo &MRI = *B.getMRI();
7768
7769 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7770 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
7771 switch (IntrID) {
7772 case Intrinsic::sponentry:
7773 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
7774 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
7775 // that we can remove this cast.
7776 const LLT S32 = LLT::scalar(SizeInBits: 32);
7777 Register TmpReg = MRI.createGenericVirtualRegister(Ty: S32);
7778 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_SPONENTRY).addDef(RegNo: TmpReg);
7779
7780 Register DstReg = MI.getOperand(i: 0).getReg();
7781 B.buildIntToPtr(Dst: DstReg, Src: TmpReg);
7782 MI.eraseFromParent();
7783 } else {
7784 int FI = B.getMF().getFrameInfo().CreateFixedObject(
7785 Size: 1, SPOffset: 0, /*IsImmutable=*/false);
7786 B.buildFrameIndex(Res: MI.getOperand(i: 0), Idx: FI);
7787 MI.eraseFromParent();
7788 }
7789 return true;
7790 case Intrinsic::amdgcn_if:
7791 case Intrinsic::amdgcn_else: {
7792 MachineInstr *Br = nullptr;
7793 MachineBasicBlock *UncondBrTarget = nullptr;
7794 bool Negated = false;
7795 if (MachineInstr *BrCond =
7796 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7797 const SIRegisterInfo *TRI
7798 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7799
7800 Register Def = MI.getOperand(i: 1).getReg();
7801 Register Use = MI.getOperand(i: 3).getReg();
7802
7803 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7804
7805 if (Negated)
7806 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7807
7808 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7809 if (IntrID == Intrinsic::amdgcn_if) {
7810 B.buildInstr(Opcode: AMDGPU::SI_IF)
7811 .addDef(RegNo: Def)
7812 .addUse(RegNo: Use)
7813 .addMBB(MBB: UncondBrTarget);
7814 } else {
7815 B.buildInstr(Opcode: AMDGPU::SI_ELSE)
7816 .addDef(RegNo: Def)
7817 .addUse(RegNo: Use)
7818 .addMBB(MBB: UncondBrTarget);
7819 }
7820
7821 if (Br) {
7822 Br->getOperand(i: 0).setMBB(CondBrTarget);
7823 } else {
7824 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7825 // since we're swapping branch targets it needs to be reinserted.
7826 // FIXME: IRTranslator should probably not do this
7827 B.buildBr(Dest&: *CondBrTarget);
7828 }
7829
7830 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
7831 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
7832 MI.eraseFromParent();
7833 BrCond->eraseFromParent();
7834 return true;
7835 }
7836
7837 return false;
7838 }
7839 case Intrinsic::amdgcn_loop: {
7840 MachineInstr *Br = nullptr;
7841 MachineBasicBlock *UncondBrTarget = nullptr;
7842 bool Negated = false;
7843 if (MachineInstr *BrCond =
7844 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7845 const SIRegisterInfo *TRI
7846 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7847
7848 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7849 Register Reg = MI.getOperand(i: 2).getReg();
7850
7851 if (Negated)
7852 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7853
7854 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7855 B.buildInstr(Opcode: AMDGPU::SI_LOOP)
7856 .addUse(RegNo: Reg)
7857 .addMBB(MBB: UncondBrTarget);
7858
7859 if (Br)
7860 Br->getOperand(i: 0).setMBB(CondBrTarget);
7861 else
7862 B.buildBr(Dest&: *CondBrTarget);
7863
7864 MI.eraseFromParent();
7865 BrCond->eraseFromParent();
7866 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
7867 return true;
7868 }
7869
7870 return false;
7871 }
7872 case Intrinsic::amdgcn_addrspacecast_nonnull:
7873 return legalizeAddrSpaceCast(MI, MRI, B);
7874 case Intrinsic::amdgcn_make_buffer_rsrc:
7875 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7876 case Intrinsic::amdgcn_kernarg_segment_ptr:
7877 if (!AMDGPU::isKernel(F: B.getMF().getFunction())) {
7878 // This only makes sense to call in a kernel, so just lower to null.
7879 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
7880 MI.eraseFromParent();
7881 return true;
7882 }
7883
7884 return legalizePreloadedArgIntrin(
7885 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7886 case Intrinsic::amdgcn_implicitarg_ptr:
7887 return legalizeImplicitArgPtr(MI, MRI, B);
7888 case Intrinsic::amdgcn_workitem_id_x:
7889 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
7890 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7891 case Intrinsic::amdgcn_workitem_id_y:
7892 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
7893 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7894 case Intrinsic::amdgcn_workitem_id_z:
7895 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
7896 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7897 case Intrinsic::amdgcn_workgroup_id_x:
7898 return legalizeWorkGroupId(
7899 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
7900 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
7901 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
7902 case Intrinsic::amdgcn_workgroup_id_y:
7903 return legalizeWorkGroupId(
7904 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
7905 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
7906 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
7907 case Intrinsic::amdgcn_workgroup_id_z:
7908 return legalizeWorkGroupId(
7909 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
7910 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
7911 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
7912 case Intrinsic::amdgcn_cluster_id_x:
7913 return ST.hasClusters() &&
7914 legalizePreloadedArgIntrin(MI, MRI, B,
7915 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7916 case Intrinsic::amdgcn_cluster_id_y:
7917 return ST.hasClusters() &&
7918 legalizePreloadedArgIntrin(MI, MRI, B,
7919 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7920 case Intrinsic::amdgcn_cluster_id_z:
7921 return ST.hasClusters() &&
7922 legalizePreloadedArgIntrin(MI, MRI, B,
7923 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7924 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7925 return ST.hasClusters() &&
7926 legalizePreloadedArgIntrin(
7927 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
7928 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7929 return ST.hasClusters() &&
7930 legalizePreloadedArgIntrin(
7931 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
7932 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7933 return ST.hasClusters() &&
7934 legalizePreloadedArgIntrin(
7935 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
7936 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7937 return ST.hasClusters() &&
7938 legalizeConstHwRegRead(MI, B, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4);
7939 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7940 return ST.hasClusters() &&
7941 legalizePreloadedArgIntrin(
7942 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
7943 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7944 return ST.hasClusters() &&
7945 legalizePreloadedArgIntrin(
7946 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
7947 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7948 return ST.hasClusters() &&
7949 legalizePreloadedArgIntrin(
7950 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
7951 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7952 return ST.hasClusters() &&
7953 legalizePreloadedArgIntrin(
7954 MI, MRI, B,
7955 ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
7956 case Intrinsic::amdgcn_wave_id:
7957 return legalizeWaveID(MI, B);
7958 case Intrinsic::amdgcn_lds_kernel_id:
7959 return legalizePreloadedArgIntrin(MI, MRI, B,
7960 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7961 case Intrinsic::amdgcn_dispatch_ptr:
7962 return legalizePreloadedArgIntrin(MI, MRI, B,
7963 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
7964 case Intrinsic::amdgcn_queue_ptr:
7965 return legalizePreloadedArgIntrin(MI, MRI, B,
7966 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
7967 case Intrinsic::amdgcn_implicit_buffer_ptr:
7968 return legalizePreloadedArgIntrin(
7969 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7970 case Intrinsic::amdgcn_dispatch_id:
7971 return legalizePreloadedArgIntrin(MI, MRI, B,
7972 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
7973 case Intrinsic::r600_read_ngroups_x:
7974 // TODO: Emit error for hsa
7975 return legalizeKernargMemParameter(MI, B,
7976 Offset: SI::KernelInputOffsets::NGROUPS_X);
7977 case Intrinsic::r600_read_ngroups_y:
7978 return legalizeKernargMemParameter(MI, B,
7979 Offset: SI::KernelInputOffsets::NGROUPS_Y);
7980 case Intrinsic::r600_read_ngroups_z:
7981 return legalizeKernargMemParameter(MI, B,
7982 Offset: SI::KernelInputOffsets::NGROUPS_Z);
7983 case Intrinsic::r600_read_local_size_x:
7984 // TODO: Could insert G_ASSERT_ZEXT from s16
7985 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
7986 case Intrinsic::r600_read_local_size_y:
7987 // TODO: Could insert G_ASSERT_ZEXT from s16
7988 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
7989 // TODO: Could insert G_ASSERT_ZEXT from s16
7990 case Intrinsic::r600_read_local_size_z:
7991 return legalizeKernargMemParameter(MI, B,
7992 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
7993 case Intrinsic::amdgcn_fdiv_fast:
7994 return legalizeFDIVFastIntrin(MI, MRI, B);
7995 case Intrinsic::amdgcn_is_shared:
7996 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
7997 case Intrinsic::amdgcn_is_private:
7998 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
7999 case Intrinsic::amdgcn_wavefrontsize: {
8000 B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
8001 MI.eraseFromParent();
8002 return true;
8003 }
8004 case Intrinsic::amdgcn_s_buffer_load:
8005 return legalizeSBufferLoad(Helper, MI);
8006 case Intrinsic::amdgcn_raw_buffer_store:
8007 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8008 case Intrinsic::amdgcn_struct_buffer_store:
8009 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8010 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
8011 case Intrinsic::amdgcn_raw_buffer_store_format:
8012 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8013 case Intrinsic::amdgcn_struct_buffer_store_format:
8014 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8015 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
8016 case Intrinsic::amdgcn_raw_tbuffer_store:
8017 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8018 case Intrinsic::amdgcn_struct_tbuffer_store:
8019 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8020 return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
8021 case Intrinsic::amdgcn_raw_buffer_load:
8022 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8023 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8024 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8025 case Intrinsic::amdgcn_struct_buffer_load:
8026 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8027 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8028 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8029 return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
8030 case Intrinsic::amdgcn_raw_buffer_load_format:
8031 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8032 case Intrinsic::amdgcn_struct_buffer_load_format:
8033 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8034 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
8035 case Intrinsic::amdgcn_raw_tbuffer_load:
8036 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8037 case Intrinsic::amdgcn_struct_tbuffer_load:
8038 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8039 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
8040 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8041 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8042 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8043 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8044 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8045 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8046 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8047 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8048 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8049 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8050 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8052 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8054 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8055 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8056 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8057 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8058 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8060 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8062 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8064 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8065 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8066 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8067 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8068 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8069 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8070 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8071 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8072 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8073 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8074 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8076 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8077 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8078 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8080 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8082 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8084 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8085 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8086 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8087 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8088 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8089 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8090 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8091 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8092 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8093 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8094 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8095 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8096 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8097 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8098 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8099 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8100 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8102 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8104 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8105 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8106 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8107 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8108 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8110 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8111 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8112 return legalizeBufferAtomic(MI, B, IID: IntrID);
8113 case Intrinsic::amdgcn_rsq_clamp:
8114 return legalizeRsqClampIntrinsic(MI, MRI, B);
8115 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8116 return legalizeBVHIntersectRayIntrinsic(MI, B);
8117 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8118 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8119 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
8120 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8121 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8122 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8123 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8124 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8125 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8126 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8127 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8128 Register Index = MI.getOperand(i: 5).getReg();
8129 LLT S64 = LLT::scalar(SizeInBits: 64);
8130 LLT IndexArgTy = MRI.getType(Reg: Index);
8131 if (IndexArgTy != S64) {
8132 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: S64, Src: Index)
8133 : B.buildAnyExt(Res: S64, Op: Index);
8134 MI.getOperand(i: 5).setReg(NewIndex.getReg(Idx: 0));
8135 }
8136 return true;
8137 }
8138 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8139 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8140 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8141 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8142 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8143 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8144 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8145 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8146 Register Index = MI.getOperand(i: 5).getReg();
8147 LLT S32 = LLT::scalar(SizeInBits: 32);
8148 if (MRI.getType(Reg: Index) != S32)
8149 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
8150 return true;
8151 }
8152 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8153 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8154 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8155 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8156 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8157 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8158 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8159 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8160 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8161 Register Index = MI.getOperand(i: 7).getReg();
8162 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8163 ? LLT::scalar(SizeInBits: 64)
8164 : LLT::scalar(SizeInBits: 32);
8165 LLT IndexArgTy = MRI.getType(Reg: Index);
8166 if (IndexArgTy != IdxTy) {
8167 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: IdxTy, Src: Index)
8168 : B.buildAnyExt(Res: IdxTy, Op: Index);
8169 MI.getOperand(i: 7).setReg(NewIndex.getReg(Idx: 0));
8170 }
8171 return true;
8172 }
8173
8174 case Intrinsic::amdgcn_fmed3: {
8175 GISelChangeObserver &Observer = Helper.Observer;
8176
8177 // FIXME: This is to workaround the inability of tablegen match combiners to
8178 // match intrinsics in patterns.
8179 Observer.changingInstr(MI);
8180 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
8181 MI.removeOperand(OpNo: 1);
8182 Observer.changedInstr(MI);
8183 return true;
8184 }
8185 case Intrinsic::amdgcn_readlane:
8186 case Intrinsic::amdgcn_writelane:
8187 case Intrinsic::amdgcn_readfirstlane:
8188 case Intrinsic::amdgcn_permlane16:
8189 case Intrinsic::amdgcn_permlanex16:
8190 case Intrinsic::amdgcn_permlane64:
8191 case Intrinsic::amdgcn_set_inactive:
8192 case Intrinsic::amdgcn_set_inactive_chain_arg:
8193 case Intrinsic::amdgcn_mov_dpp8:
8194 case Intrinsic::amdgcn_update_dpp:
8195 return legalizeLaneOp(Helper, MI, IID: IntrID);
8196 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8197 return legalizeSBufferPrefetch(Helper, MI);
8198 case Intrinsic::amdgcn_dead: {
8199 // TODO: Use poison instead of undef
8200 for (const MachineOperand &Def : MI.defs())
8201 B.buildUndef(Res: Def);
8202 MI.eraseFromParent();
8203 return true;
8204 }
8205 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8206 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8207 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8208 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8209 B.buildLoad(Res: MI.getOperand(i: 0), Addr: MI.getOperand(i: 2), MMO&: **MI.memoperands_begin());
8210 MI.eraseFromParent();
8211 return true;
8212 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8213 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8214 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8215 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8216 B.buildStore(Val: MI.getOperand(i: 2), Addr: MI.getOperand(i: 1), MMO&: **MI.memoperands_begin());
8217 MI.eraseFromParent();
8218 return true;
8219 case Intrinsic::amdgcn_flat_load_monitor_b32:
8220 case Intrinsic::amdgcn_flat_load_monitor_b64:
8221 case Intrinsic::amdgcn_flat_load_monitor_b128:
8222 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8223 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8224 .add(MO: MI.getOperand(i: 0))
8225 .add(MO: MI.getOperand(i: 2))
8226 .addMemOperand(MMO: *MI.memoperands_begin());
8227 MI.eraseFromParent();
8228 return true;
8229 case Intrinsic::amdgcn_global_load_monitor_b32:
8230 case Intrinsic::amdgcn_global_load_monitor_b64:
8231 case Intrinsic::amdgcn_global_load_monitor_b128:
8232 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8233 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8234 .add(MO: MI.getOperand(i: 0))
8235 .add(MO: MI.getOperand(i: 2))
8236 .addMemOperand(MMO: *MI.memoperands_begin());
8237 MI.eraseFromParent();
8238 return true;
8239 default: {
8240 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8241 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
8242 return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
8243 return true;
8244 }
8245 }
8246
8247 return true;
8248}
8249