1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
30#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
32#include "llvm/CodeGen/GlobalISel/Utils.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/PseudoSourceValueManager.h"
35#include "llvm/CodeGen/TargetOpcodes.h"
36#include "llvm/IR/DiagnosticInfo.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
49static cl::opt<bool> EnableNewLegality(
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(Val: false),
54 cl::ReallyHidden);
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
59static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
62 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
69 return LLT::scalar(SizeInBits: Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
110 };
111}
112
113static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
144static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(x: TypeIdx, y: LLT::scalar(SizeInBits: MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
152static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: Ty.getElementType()));
170 };
171}
172
173static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(SizeInBits: 128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
178}
179
180static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
185}
186
187static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(SizeInBits: Size);
194 }
195
196 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
206static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Size: Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up the maximum register size, and
266// multiples of v2s16.
267static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Ty: Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
// Named LLT constants used throughout the legality rules below.

// Scalar types, by bit width. F32/F64 are the explicit float forms.
constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
constexpr LLT S8 = LLT::scalar(SizeInBits: 8);
constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
constexpr LLT F32 = LLT::float32();
constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
constexpr LLT F64 = LLT::float64();
constexpr LLT S96 = LLT::scalar(SizeInBits: 96);
constexpr LLT S128 = LLT::scalar(SizeInBits: 128);
constexpr LLT S160 = LLT::scalar(SizeInBits: 160);
constexpr LLT S192 = LLT::scalar(SizeInBits: 192);
constexpr LLT S224 = LLT::scalar(SizeInBits: 224);
constexpr LLT S256 = LLT::scalar(SizeInBits: 256);
constexpr LLT S512 = LLT::scalar(SizeInBits: 512);
constexpr LLT S1024 = LLT::scalar(SizeInBits: 1024);
// Widest scalar that fits in a register tuple (see MaxRegisterSize).
constexpr LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);

// Vectors of 8- and 16-bit elements.
constexpr LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
constexpr LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
constexpr LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
constexpr LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
constexpr LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
constexpr LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
constexpr LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
constexpr LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);

constexpr LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::float16());
// bf16 currently reuses the f16 vector type.
constexpr LLT V2BF16 = V2F16; // FIXME

// Vectors of 32-bit elements.
constexpr LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
constexpr LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
constexpr LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
constexpr LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
constexpr LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
constexpr LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
constexpr LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
constexpr LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
constexpr LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
constexpr LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
constexpr LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
constexpr LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
constexpr LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);

// Vectors of 64-bit elements.
constexpr LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
constexpr LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
constexpr LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
constexpr LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
constexpr LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
constexpr LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
constexpr LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
constexpr LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);

// Vectors of 128-bit elements.
constexpr LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
constexpr LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);

// Type groups consumed by isRegisterClassType and the legality rules.
constexpr std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};

// Note: includes V2S128/V4S128 alongside the s16 vectors.
constexpr std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

constexpr std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

constexpr std::initializer_list<LLT> AllS64Vectors = {
    V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

// Union of the vector groups above.
constexpr std::initializer_list<LLT> AllVectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
    V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
    V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
369
370// Checks whether a type is in the list of legal register types.
371static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
372 if (Ty.isPointerOrPointerVector())
373 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
374
375 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
376 is_contained(Set: AllScalarTypes, Element: Ty) ||
377 (ST.useRealTrue16Insts() && Ty == S16) ||
378 is_contained(Set: AllS16Vectors, Element: Ty);
379}
380
381static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
382 unsigned TypeIdx) {
383 return [&ST, TypeIdx](const LegalityQuery &Query) {
384 return isRegisterClassType(ST, Ty: Query.Types[TypeIdx]);
385 };
386}
387
388// If we have a truncating store or an extending load with a data size larger
389// than 32-bits, we need to reduce to a 32-bit type.
390static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
391 return [=](const LegalityQuery &Query) {
392 const LLT Ty = Query.Types[TypeIdx];
393 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
394 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
395 };
396}
397
398// If we have a truncating store or an extending load with a data size larger
399// than 32-bits and mem location is a power of 2
400static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
401 return [=](const LegalityQuery &Query) {
402 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
403 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
404 isPowerOf2_64(Value: MemSize);
405 };
406}
407
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
/// \returns the maximum memory access size (in bits) that is legal for
/// address space \p AS on subtarget \p ST. \p IsLoad selects the load limit
/// (loads may be wider than stores for global/constant); \p IsAtomic allows
/// wider accesses in the default (flat) case.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.hasFlatScratchEnabled() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    // LDS: 128-bit only when ds_read/write_b128 is usable.
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
437
// Return true if the load/store described by \p Query has a size and
// alignment that is legal for its address space on subtarget \p ST.
// Type index 0 is the value, type index 1 is the pointer.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space permits; atomics may be
  // wider in the flat/default case.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    IsAtomic: Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  // Whitelist of supported access sizes.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // dwordx3 accesses need hardware support.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates the
  // misalignment for this size and address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
                                                 Alignment: Align(AlignBits / 8)))
      return false;
  }

  return true;
}
505
506// The newer buffer intrinsic forms take their resource arguments as
507// pointers in address space 8, aka s128 values. However, in order to not break
508// SelectionDAG, the underlying operations have to continue to take v4i32
509// arguments. Therefore, we convert resource pointers - or vectors of them
510// to integer values here.
511static bool hasBufferRsrcWorkaround(const LLT Ty) {
512 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
513 return true;
514 if (Ty.isVector()) {
515 const LLT ElemTy = Ty.getElementType();
516 return hasBufferRsrcWorkaround(Ty: ElemTy);
517 }
518 return false;
519}
520
521// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
522// workaround this. Eventually it should ignore the type for loads and only care
523// about the size. Return true in cases where we will workaround this for now by
524// bitcasting.
525static bool loadStoreBitcastWorkaround(const LLT Ty) {
526 if (EnableNewLegality)
527 return false;
528
529 const unsigned Size = Ty.getSizeInBits();
530 if (Ty.isPointerVector())
531 return true;
532 if (Size <= 64)
533 return false;
534 // Address space 8 pointers get their own workaround.
535 if (hasBufferRsrcWorkaround(Ty))
536 return false;
537 if (!Ty.isVector())
538 return true;
539
540 unsigned EltSize = Ty.getScalarSizeInBits();
541 return EltSize != 32 && EltSize != 64;
542}
543
544static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
545 const LLT Ty = Query.Types[0];
546 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
547 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
548}
549
550/// Return true if a load or store of the type should be lowered with a bitcast
551/// to a different type.
552static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
553 const LLT MemTy) {
554 const unsigned MemSizeInBits = MemTy.getSizeInBits();
555 const unsigned Size = Ty.getSizeInBits();
556 if (Size != MemSizeInBits)
557 return Size <= 32 && Ty.isVector();
558
559 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
560 return true;
561
562 // Don't try to handle bitcasting vector ext loads for now.
563 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
564 (Size <= 32 || isRegisterSize(ST, Size)) &&
565 !isRegisterVectorElementType(EltTy: Ty.getElementType());
566}
567
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(Value: SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): `Opcode` (a nonzero opcode value) is passed for the bool
  // `IsLoad` parameter, so this always queries the load-side limit — looks
  // intentional since only loads are widened, but confirm.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
             Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
         Fast;
}
604
605static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
606 unsigned Opcode) {
607 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
608 return false;
609
610 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
611 AlignInBits: Query.MMODescrs[0].AlignInBits,
612 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
613}
614
/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(i: Idx);

  const LLT PointerTy = MRI.getType(Reg: MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(Ty: PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(SizeInBits: 32);

    Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
    std::array<Register, 4> VectorElems;
    // Emit the conversion just past the current insert point (after MI).
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
    // Rebuild the original p8 value from the extracted dwords.
    B.buildMergeValues(Res: MO, Ops: VectorElems);
    // MI now operates on the raw <4 x s32> register instead of the pointer.
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector-of-p8 case: bitcast the dwords to s128 lanes, then inttoptr back.
  Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
  auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
  B.buildIntToPtr(Dst: MO, Src: Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
653
654/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
655/// the form in which the value must be in order to be passed to the low-level
656/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
657/// needed in order to account for the fact that we can't define a register
658/// class for s128 without breaking SelectionDAG.
659static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
660 MachineRegisterInfo &MRI = *B.getMRI();
661 const LLT PointerTy = MRI.getType(Reg: Pointer);
662 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
663 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
664
665 if (!PointerTy.isVector()) {
666 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
667 SmallVector<Register, 4> PointerParts;
668 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
669 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
670 for (unsigned I = 0; I < NumParts; ++I)
671 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
672 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
673 }
674 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
675 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
676}
677
678static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
679 unsigned Idx) {
680 MachineOperand &MO = MI.getOperand(i: Idx);
681
682 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
683 // Paranoidly prevent us from doing this multiple times.
684 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
685 return;
686 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
687}
688
689AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
690 const GCNTargetMachine &TM)
691 : ST(ST_) {
692 using namespace TargetOpcode;
693
694 auto GetAddrSpacePtr = [&TM](unsigned AS) {
695 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
696 };
697
698 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
699 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
700 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
701 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
702 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
703 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
704 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
705 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
706 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
707 const LLT BufferStridedPtr =
708 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
709
710 const LLT CodePtr = FlatPtr;
711
712 const std::initializer_list<LLT> AddrSpaces64 = {
713 GlobalPtr, ConstantPtr, FlatPtr
714 };
715
716 const std::initializer_list<LLT> AddrSpaces32 = {
717 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
718 };
719
720 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
721
722 const std::initializer_list<LLT> FPTypesBase = {
723 S32, S64
724 };
725
726 const std::initializer_list<LLT> FPTypes16 = {
727 S32, S64, S16
728 };
729
730 const std::initializer_list<LLT> FPTypesPK16 = {
731 S32, S64, S16, V2S16
732 };
733
734 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
735
736 // s1 for VCC branches, s32 for SCC branches.
737 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
738
739 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
740 // elements for v3s16
741 getActionDefinitionsBuilder(Opcode: G_PHI)
742 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
743 .legalFor(Types: AllS32Vectors)
744 .legalFor(Types: AllS64Vectors)
745 .legalFor(Types: AddrSpaces64)
746 .legalFor(Types: AddrSpaces32)
747 .legalFor(Types: AddrSpaces128)
748 .legalIf(Predicate: isPointer(TypeIdx: 0))
749 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
750 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
751 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
752 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
753 .scalarize(TypeIdx: 0);
754
755 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
756 // Full set of gfx9 features.
757 if (ST.hasScalarAddSub64()) {
758 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
759 .legalFor(Types: {S64, S32, S16, V2S16})
760 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
761 .scalarize(TypeIdx: 0)
762 .minScalar(TypeIdx: 0, Ty: S16)
763 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
764 .maxScalar(TypeIdx: 0, Ty: S32);
765 } else {
766 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
767 .legalFor(Types: {S32, S16, V2S16})
768 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
769 .scalarize(TypeIdx: 0)
770 .minScalar(TypeIdx: 0, Ty: S16)
771 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
772 .maxScalar(TypeIdx: 0, Ty: S32);
773 }
774
775 if (ST.hasScalarSMulU64()) {
776 getActionDefinitionsBuilder(Opcode: G_MUL)
777 .legalFor(Types: {S64, S32, S16, V2S16})
778 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
779 .scalarize(TypeIdx: 0)
780 .minScalar(TypeIdx: 0, Ty: S16)
781 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
782 .custom();
783 } else {
784 getActionDefinitionsBuilder(Opcode: G_MUL)
785 .legalFor(Types: {S32, S16, V2S16})
786 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
787 .scalarize(TypeIdx: 0)
788 .minScalar(TypeIdx: 0, Ty: S16)
789 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
790 .custom();
791 }
792 assert(ST.hasMad64_32());
793
794 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
795 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
796 .minScalarOrElt(TypeIdx: 0, Ty: S16)
797 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
798 .scalarize(TypeIdx: 0)
799 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
800 .lower();
801 } else if (ST.has16BitInsts()) {
802 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
803 .legalFor(Types: {S32, S16})
804 .minScalar(TypeIdx: 0, Ty: S16)
805 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
806 .maxScalar(TypeIdx: 0, Ty: S32)
807 .scalarize(TypeIdx: 0);
808
809 getActionDefinitionsBuilder(Opcode: G_MUL)
810 .legalFor(Types: {S32, S16})
811 .scalarize(TypeIdx: 0)
812 .minScalar(TypeIdx: 0, Ty: S16)
813 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
814 .custom();
815 assert(ST.hasMad64_32());
816
817 // Technically the saturating operations require clamp bit support, but this
818 // was introduced at the same time as 16-bit operations.
819 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
820 .legalFor(Types: {S32, S16}) // Clamp modifier
821 .minScalar(TypeIdx: 0, Ty: S16)
822 .scalarize(TypeIdx: 0)
823 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
824 .lower();
825
826 // We're just lowering this, but it helps get a better result to try to
827 // coerce to the desired type first.
828 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
829 .minScalar(TypeIdx: 0, Ty: S16)
830 .scalarize(TypeIdx: 0)
831 .lower();
832 } else {
833 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
834 .legalFor(Types: {S32})
835 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
836 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
837 .scalarize(TypeIdx: 0);
838
839 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
840 .legalFor(Types: {S32})
841 .scalarize(TypeIdx: 0)
842 .minScalar(TypeIdx: 0, Ty: S32)
843 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
844
845 if (ST.hasMad64_32())
846 Mul.custom();
847 else
848 Mul.maxScalar(TypeIdx: 0, Ty: S32);
849
850 if (ST.hasIntClamp()) {
851 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
852 .legalFor(Types: {S32}) // Clamp modifier.
853 .scalarize(TypeIdx: 0)
854 .minScalarOrElt(TypeIdx: 0, Ty: S32)
855 .lower();
856 } else {
857 // Clamp bit support was added in VI, along with 16-bit operations.
858 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
859 .minScalar(TypeIdx: 0, Ty: S32)
860 .scalarize(TypeIdx: 0)
861 .lower();
862 }
863
864 // FIXME: DAG expansion gets better results. The widening uses the smaller
865 // range values and goes for the min/max lowering directly.
866 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
867 .minScalar(TypeIdx: 0, Ty: S32)
868 .scalarize(TypeIdx: 0)
869 .lower();
870 }
871
872 getActionDefinitionsBuilder(
873 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
874 .customFor(Types: {S32, S64})
875 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
876 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
877 .scalarize(TypeIdx: 0);
878
879 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
880 .legalFor(Types: {S32})
881 .maxScalar(TypeIdx: 0, Ty: S32);
882
883 if (ST.hasVOP3PInsts()) {
884 Mulh
885 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
886 .lowerFor(Types: {V2S8});
887 }
888
889 Mulh
890 .scalarize(TypeIdx: 0)
891 .lower();
892
893 // Report legal for any types we can handle anywhere. For the cases only legal
894 // on the SALU, RegBankSelect will be able to re-legalize.
895 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
896 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
897 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
898 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
899 .fewerElementsIf(
900 Predicate: all(P0: vectorWiderThan(TypeIdx: 0, Size: 64), P1: scalarOrEltNarrowerThan(TypeIdx: 0, Size: 64)),
901 Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
902 .widenScalarToNextPow2(TypeIdx: 0)
903 .scalarize(TypeIdx: 0);
904
905 getActionDefinitionsBuilder(
906 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
907 .legalFor(Types: {{S32, S1}, {S32, S32}})
908 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
909 .scalarize(TypeIdx: 0);
910
911 getActionDefinitionsBuilder(Opcode: G_BITCAST)
912 // Don't worry about the size constraint.
913 .legalIf(Predicate: all(P0: isRegisterClassType(ST, TypeIdx: 0), P1: isRegisterClassType(ST, TypeIdx: 1)))
914 .lower();
915
916 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
917 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
918 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
919 .legalIf(Predicate: isPointer(TypeIdx: 0))
920 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
921 .widenScalarToNextPow2(TypeIdx: 0);
922
923 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
924 .legalFor(Types: {S32, S64, S16})
925 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
926
927 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
928 .legalIf(Predicate: isRegisterClassType(ST, TypeIdx: 0))
929 // s1 and s16 are special cases because they have legal operations on
930 // them, but don't really occupy registers in the normal way.
931 .legalFor(Types: {S1, S16})
932 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
933 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
934 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
935 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
936 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
937
938 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
939
940 // If the amount is divergent, we have to do a wave reduction to get the
941 // maximum value, so this is expanded during RegBankSelect.
942 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
943 .legalFor(Types: {{PrivatePtr, S32}});
944
945 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
946 .customFor(Types: {PrivatePtr});
947 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
948 .legalFor(Types: {PrivatePtr});
949
950 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
951
952 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
953 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
954
955 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
956
957 auto &FPOpActions = getActionDefinitionsBuilder(
958 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
959 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
960 .legalFor(Types: {S32, S64});
961 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
962 .customFor(Types: {S32, S64});
963 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
964 .customFor(Types: {S32, S64});
965
966 if (ST.has16BitInsts()) {
967 if (ST.hasVOP3PInsts())
968 FPOpActions.legalFor(Types: {S16, V2S16});
969 else
970 FPOpActions.legalFor(Types: {S16});
971
972 TrigActions.customFor(Types: {S16});
973 FDIVActions.customFor(Types: {S16});
974 }
975
976 if (ST.hasPackedFP32Ops()) {
977 FPOpActions.legalFor(Types: {V2S32});
978 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
979 }
980
981 auto &MinNumMaxNumIeee =
982 getActionDefinitionsBuilder(Opcodes: {G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
983
984 if (ST.hasVOP3PInsts()) {
985 MinNumMaxNumIeee.legalFor(Types: FPTypesPK16)
986 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
987 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
988 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
989 .scalarize(TypeIdx: 0);
990 } else if (ST.has16BitInsts()) {
991 MinNumMaxNumIeee.legalFor(Types: FPTypes16).clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64).scalarize(TypeIdx: 0);
992 } else {
993 MinNumMaxNumIeee.legalFor(Types: FPTypesBase)
994 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
995 .scalarize(TypeIdx: 0);
996 }
997
998 auto &MinNumMaxNum = getActionDefinitionsBuilder(
999 Opcodes: {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1000
1001 if (ST.hasVOP3PInsts()) {
1002 MinNumMaxNum.customFor(Types: FPTypesPK16)
1003 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1004 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1005 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1006 .scalarize(TypeIdx: 0);
1007 } else if (ST.has16BitInsts()) {
1008 MinNumMaxNum.customFor(Types: FPTypes16)
1009 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1010 .scalarize(TypeIdx: 0);
1011 } else {
1012 MinNumMaxNum.customFor(Types: FPTypesBase)
1013 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1014 .scalarize(TypeIdx: 0);
1015 }
1016
1017 if (ST.hasVOP3PInsts())
1018 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1019
1020 FPOpActions
1021 .scalarize(TypeIdx: 0)
1022 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1023
1024 TrigActions
1025 .scalarize(TypeIdx: 0)
1026 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1027
1028 FDIVActions
1029 .scalarize(TypeIdx: 0)
1030 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1031
1032 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
1033 .legalFor(Types: FPTypesPK16)
1034 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1035 .scalarize(TypeIdx: 0)
1036 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1037
1038 if (ST.has16BitInsts()) {
1039 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1040 .legalFor(Types: {S16})
1041 .customFor(Types: {S32, S64})
1042 .scalarize(TypeIdx: 0)
1043 .unsupported();
1044 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1045 .legalFor(Types: {S32, S64, S16})
1046 .scalarize(TypeIdx: 0)
1047 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1048
1049 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1050 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
1051 .scalarize(TypeIdx: 0)
1052 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
1053 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1054 .lower();
1055
1056 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1057 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1058 .scalarize(TypeIdx: 0)
1059 .lower();
1060
1061 getActionDefinitionsBuilder(Opcode: G_FMODF)
1062 .lowerFor(Types: {S16, S32, S64})
1063 .scalarize(TypeIdx: 0)
1064 .lower();
1065 } else {
1066 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1067 .customFor(Types: {S32, S64, S16})
1068 .scalarize(TypeIdx: 0)
1069 .unsupported();
1070
1071
1072 if (ST.hasFractBug()) {
1073 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1074 .customFor(Types: {S64})
1075 .legalFor(Types: {S32, S64})
1076 .scalarize(TypeIdx: 0)
1077 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1078 } else {
1079 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1080 .legalFor(Types: {S32, S64})
1081 .scalarize(TypeIdx: 0)
1082 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1083 }
1084
1085 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1086 .legalFor(Types: {{S32, S32}, {S64, S32}})
1087 .scalarize(TypeIdx: 0)
1088 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1089 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1090 .lower();
1091
1092 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1093 .customFor(Types: {{S32, S32}, {S64, S32}})
1094 .scalarize(TypeIdx: 0)
1095 .minScalar(TypeIdx: 0, Ty: S32)
1096 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1097 .lower();
1098
1099 getActionDefinitionsBuilder(Opcode: G_FMODF)
1100 .lowerFor(Types: {S32, S64})
1101 .scalarize(TypeIdx: 0)
1102 .lower();
1103 }
1104
1105 auto &FPTruncActions = getActionDefinitionsBuilder(Opcode: G_FPTRUNC);
1106 if (ST.hasCvtPkF16F32Inst()) {
1107 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1108 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1109 } else {
1110 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}});
1111 }
1112 FPTruncActions.scalarize(TypeIdx: 0).lower();
1113
1114 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1115 .legalFor(Types: {{S64, S32}, {S32, S16}})
1116 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1117 .scalarize(TypeIdx: 0);
1118
1119 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1120 if (ST.has16BitInsts()) {
1121 FSubActions
1122 // Use actual fsub instruction
1123 .legalFor(Types: {S32, S16})
1124 // Must use fadd + fneg
1125 .lowerFor(Types: {S64, V2S16});
1126 } else {
1127 FSubActions
1128 // Use actual fsub instruction
1129 .legalFor(Types: {S32})
1130 // Must use fadd + fneg
1131 .lowerFor(Types: {S64, S16, V2S16});
1132 }
1133
1134 FSubActions
1135 .scalarize(TypeIdx: 0)
1136 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1137
1138 // Whether this is legal depends on the floating point mode for the function.
1139 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1140 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1141 FMad.customFor(Types: {S32, S16});
1142 else if (ST.hasMadMacF32Insts())
1143 FMad.customFor(Types: {S32});
1144 else if (ST.hasMadF16())
1145 FMad.customFor(Types: {S16});
1146 FMad.scalarize(TypeIdx: 0)
1147 .lower();
1148
1149 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1150 if (ST.has16BitInsts()) {
1151 FRem.customFor(Types: {S16, S32, S64});
1152 } else {
1153 FRem.minScalar(TypeIdx: 0, Ty: S32)
1154 .customFor(Types: {S32, S64});
1155 }
1156 FRem.scalarize(TypeIdx: 0);
1157
1158 // TODO: Do we need to clamp maximum bitwidth?
1159 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1160 .legalIf(Predicate: isScalar(TypeIdx: 0))
1161 .legalFor(Types: {{V2S16, V2S32}})
1162 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1163 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1164 // situations (like an invalid implicit use), we don't want to infinite loop
1165 // in the legalizer.
1166 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1167 .alwaysLegal();
1168
1169 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1170 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1171 {S32, S1}, {S64, S1}, {S16, S1}})
1172 .scalarize(TypeIdx: 0)
1173 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1174 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1175
1176 // TODO: Split s1->s64 during regbankselect for VALU.
1177 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1178 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1179 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1180 .customFor(Types: {{S32, S64}, {S64, S64}});
1181 if (ST.has16BitInsts())
1182 IToFP.legalFor(Types: {{S16, S16}});
1183 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1184 .minScalar(TypeIdx: 0, Ty: S32)
1185 .scalarize(TypeIdx: 0)
1186 .widenScalarToNextPow2(TypeIdx: 1);
1187
1188 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1189 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1190 .customFor(Types: {{S64, S32}, {S64, S64}})
1191 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1192 if (ST.has16BitInsts())
1193 FPToI.legalFor(Types: {{S16, S16}});
1194 else
1195 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1196
1197 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1198 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1199 .scalarize(TypeIdx: 0)
1200 .lower();
1201
1202 // clang-format off
1203 auto &FPToISat = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI_SAT, G_FPTOUI_SAT})
1204 .legalFor(Types: {{S32, S32}, {S32, S64}})
1205 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1206 FPToISat.minScalar(TypeIdx: 1, Ty: S32);
1207 FPToISat.minScalar(TypeIdx: 0, Ty: S32)
1208 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1209 .scalarize(TypeIdx: 0)
1210 .lower();
1211 // clang-format on
1212
1213 getActionDefinitionsBuilder(Opcodes: {G_LROUND, G_LLROUND})
1214 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1215 .scalarize(TypeIdx: 0)
1216 .lower();
1217
1218 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1219 .legalFor(Types: {S16, S32})
1220 .scalarize(TypeIdx: 0)
1221 .lower();
1222
1223 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1224 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1225 .scalarize(TypeIdx: 0)
1226 .lower();
1227
1228 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1229 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1230 .scalarize(TypeIdx: 0)
1231 .lower();
1232
1233 if (ST.has16BitInsts()) {
1234 getActionDefinitionsBuilder(
1235 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1236 .legalFor(Types: {S16, S32, S64})
1237 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1238 .scalarize(TypeIdx: 0);
1239 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1240 getActionDefinitionsBuilder(
1241 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1242 .legalFor(Types: {S32, S64})
1243 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1244 .scalarize(TypeIdx: 0);
1245 } else {
1246 getActionDefinitionsBuilder(
1247 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1248 .legalFor(Types: {S32})
1249 .customFor(Types: {S64})
1250 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1251 .scalarize(TypeIdx: 0);
1252 }
1253
1254 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1255 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1256 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1257 .scalarize(TypeIdx: 0)
1258 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1259
1260 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1261 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1262 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1263 .scalarize(TypeIdx: 0);
1264
1265 auto &CmpBuilder =
1266 getActionDefinitionsBuilder(Opcode: G_ICMP)
1267 // The compare output type differs based on the register bank of the output,
1268 // so make both s1 and s32 legal.
1269 //
1270 // Scalar compares producing output in scc will be promoted to s32, as that
1271 // is the allocatable register type that will be needed for the copy from
1272 // scc. This will be promoted during RegBankSelect, and we assume something
1273 // before that won't try to use s32 result types.
1274 //
1275 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1276 // bank.
1277 .legalForCartesianProduct(
1278 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1279 .legalForCartesianProduct(
1280 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1281 if (ST.has16BitInsts()) {
1282 CmpBuilder.legalFor(Types: {{S1, S16}});
1283 }
1284
1285 CmpBuilder
1286 .widenScalarToNextPow2(TypeIdx: 1)
1287 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1288 .scalarize(TypeIdx: 0)
1289 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1290
1291 auto &FCmpBuilder =
1292 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1293 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1294
1295 if (ST.hasSALUFloatInsts())
1296 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1297
1298 FCmpBuilder
1299 .widenScalarToNextPow2(TypeIdx: 1)
1300 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1301 .scalarize(TypeIdx: 0);
1302
1303 // FIXME: fpow has a selection pattern that should move to custom lowering.
1304 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1305 if (ST.has16BitInsts())
1306 ExpOps.customFor(Types: {{S32}, {S16}});
1307 else
1308 ExpOps.customFor(Types: {S32});
1309 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1310 .scalarize(TypeIdx: 0);
1311
1312 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1313 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1314 .lower();
1315
1316 getActionDefinitionsBuilder(Opcode: G_FLOG2)
1317 .legalFor(Pred: ST.has16BitInsts(), Types: {S16})
1318 .customFor(Types: {S32, S16})
1319 .scalarize(TypeIdx: 0)
1320 .lower();
1321
1322 getActionDefinitionsBuilder(Opcode: G_FEXP2)
1323 .legalFor(Pred: ST.has16BitInsts(), Types: {S16})
1324 .customFor(Types: {S32, S64, S16})
1325 .scalarize(TypeIdx: 0)
1326 .lower();
1327
1328 auto &LogOps =
1329 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1330 LogOps.customFor(Types: {S32, S16, S64});
1331 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1332 .scalarize(TypeIdx: 0);
1333
1334 // The 64-bit versions produce 32-bit results, but only on the SALU.
1335 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1336 .legalFor(Types: {{S32, S32}, {S32, S64}})
1337 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1338 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1339 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1340 .scalarize(TypeIdx: 0)
1341 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1342
1343 // If no 16 bit instr is available, lower into different instructions.
1344 if (ST.has16BitInsts())
1345 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1346 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1347 .widenScalarToNextPow2(TypeIdx: 1)
1348 .scalarize(TypeIdx: 0)
1349 .lower();
1350 else
1351 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1352 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1353 .lowerFor(Types: {S1, S16})
1354 .widenScalarToNextPow2(TypeIdx: 1)
1355 .scalarize(TypeIdx: 0)
1356 .lower();
1357
1358 // The hardware instructions return a different result on 0 than the generic
1359 // instructions expect. The hardware produces -1, but these produce the
1360 // bitwidth.
1361 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1362 .scalarize(TypeIdx: 0)
1363 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1364 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1365 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1366 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1367 .custom();
1368
1369 // The 64-bit versions produce 32-bit results, but only on the SALU.
1370 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1371 .legalFor(Types: {{S32, S32}, {S32, S64}})
1372 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1373 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1374 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1375 .scalarize(TypeIdx: 0)
1376 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1377 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1378
1379 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1380 .legalFor(Types: {{S32, S32}, {S32, S64}})
1381 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1382 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1383 .scalarize(TypeIdx: 0)
1384 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1385 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1386
1387 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1388 // RegBankSelect.
1389 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1390 .legalFor(Types: {S32, S64})
1391 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1392 .scalarize(TypeIdx: 0)
1393 .widenScalarToNextPow2(TypeIdx: 0);
1394
1395 if (ST.has16BitInsts()) {
1396 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1397 .legalFor(Types: {S16, S32, V2S16})
1398 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1399 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1400 // narrowScalar limitation.
1401 .widenScalarToNextPow2(TypeIdx: 0)
1402 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1403 .scalarize(TypeIdx: 0);
1404
1405 if (ST.hasVOP3PInsts()) {
1406 getActionDefinitionsBuilder(Opcode: G_ABS)
1407 .legalFor(Types: {S32, S16, V2S16})
1408 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1409 .minScalar(TypeIdx: 0, Ty: S16)
1410 .widenScalarToNextPow2(TypeIdx: 0)
1411 .scalarize(TypeIdx: 0)
1412 .lower();
1413 if (ST.hasIntMinMax64()) {
1414 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1415 .legalFor(Types: {S32, S16, S64, V2S16})
1416 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1417 .minScalar(TypeIdx: 0, Ty: S16)
1418 .widenScalarToNextPow2(TypeIdx: 0)
1419 .scalarize(TypeIdx: 0)
1420 .lower();
1421 } else {
1422 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1423 .legalFor(Types: {S32, S16, V2S16})
1424 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1425 .minScalar(TypeIdx: 0, Ty: S16)
1426 .widenScalarToNextPow2(TypeIdx: 0)
1427 .scalarize(TypeIdx: 0)
1428 .lower();
1429 }
1430 } else {
1431 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1432 .legalFor(Types: {S32, S16})
1433 .widenScalarToNextPow2(TypeIdx: 0)
1434 .minScalar(TypeIdx: 0, Ty: S16)
1435 .scalarize(TypeIdx: 0)
1436 .lower();
1437 }
1438 } else {
1439 // TODO: Should have same legality without v_perm_b32
1440 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1441 .legalFor(Types: {S32})
1442 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1443 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1444 // narrowScalar limitation.
1445 .widenScalarToNextPow2(TypeIdx: 0)
1446 .maxScalar(TypeIdx: 0, Ty: S32)
1447 .scalarize(TypeIdx: 0)
1448 .lower();
1449
1450 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1451 .legalFor(Types: {S32})
1452 .minScalar(TypeIdx: 0, Ty: S32)
1453 .widenScalarToNextPow2(TypeIdx: 0)
1454 .scalarize(TypeIdx: 0)
1455 .lower();
1456 }
1457
1458 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1459 // List the common cases
1460 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1461 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1462 .scalarize(TypeIdx: 0)
1463 // Accept any address space as long as the size matches
1464 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1465 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1466 Mutation: [](const LegalityQuery &Query) {
1467 return std::pair(
1468 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1469 })
1470 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1471 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1472 });
1473
1474 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1475 // List the common cases
1476 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1477 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1478 .scalarize(TypeIdx: 0)
1479 // Accept any address space as long as the size matches
1480 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1481 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1482 Mutation: [](const LegalityQuery &Query) {
1483 return std::pair(
1484 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1485 })
1486 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1487 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1488 });
1489
1490 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1491 .scalarize(TypeIdx: 0)
1492 .custom();
1493
// Predicate deciding whether a scalar G_LOAD/G_STORE must be split
// (narrowed) into multiple memory operations. Used below as the
// narrowScalarIf predicate in the {G_LOAD, G_STORE} legalization loop.
// Captures the enclosing state by value ([=]), including the subtarget ST.
1494 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1495 bool IsLoad) -> bool {
1496 const LLT DstTy = Query.Types[0];
1497
1498 // Split vector extloads.
// A vector result wider than the memory type is a vector extload; those
// are always split.
1499 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1500
1501 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1502 return true;
1503
// Split anything wider than the per-address-space maximum access size
// (an atomic ordering other than NotAtomic can lower that limit).
1504 const LLT PtrTy = Query.Types[1];
1505 unsigned AS = PtrTy.getAddressSpace();
1506 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1507 IsAtomic: Query.MMODescrs[0].Ordering !=
1508 AtomicOrdering::NotAtomic))
1509 return true;
1510
1511 // Catch weird sized loads that don't evenly divide into the access sizes
1512 // TODO: May be able to widen depending on alignment etc.
// Number of 32-bit registers the access covers, rounding up.
1513 unsigned NumRegs = (MemSize + 31) / 32;
1514 if (NumRegs == 3) {
// A 3-dword (96-bit) access is only a single instruction when the
// subtarget has dwordx3 load/store support; otherwise split it.
1515 if (!ST.hasDwordx3LoadStores())
1516 return true;
1517 } else {
1518 // If the alignment allows, these should have been widened.
1519 if (!isPowerOf2_32(Value: NumRegs))
1520 return true;
1521 }
1522
// Access fits in one legal memory operation; no split needed.
1523 return false;
1524 };
1525
1526 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1527 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1528 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1529
1530 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1531 // LDS
1532 // TODO: Unsupported flat for SI.
1533
1534 for (unsigned Op : {G_LOAD, G_STORE}) {
1535 const bool IsStore = Op == G_STORE;
1536
1537 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1538 // Explicitly list some common cases.
1539 // TODO: Does this help compile time at all?
1540 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1541 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1542 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1543 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1544 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1545 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1546 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1547 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1548
1549 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1550 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1551 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1552 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1553 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1554 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1555
1556 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1557 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1558 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1559 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1560
1561 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1562 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1563 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1564 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1565 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1566 Actions.legalIf(
1567 Predicate: [=](const LegalityQuery &Query) -> bool {
1568 return isLoadStoreLegal(ST, Query);
1569 });
1570
1571 // The custom pointers (fat pointers, buffer resources) don't work with load
1572 // and store at this level. Fat pointers should have been lowered to
1573 // intrinsics before the translation to MIR.
1574 Actions.unsupportedIf(
1575 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1576
1577 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1578 // ptrtoint. This is needed to account for the fact that we can't have i128
1579 // as a register class for SelectionDAG reasons.
1580 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1581 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1582 });
1583
1584 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1585 // 64-bits.
1586 //
1587 // TODO: Should generalize bitcast action into coerce, which will also cover
1588 // inserting addrspacecasts.
1589 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1590
1591 // Turn any illegal element vectors into something easier to deal
1592 // with. These will ultimately produce 32-bit scalar shifts to extract the
1593 // parts anyway.
1594 //
1595 // For odd 16-bit element vectors, prefer to split those into pieces with
1596 // 16-bit vector parts.
1597 Actions.bitcastIf(
1598 Predicate: [=](const LegalityQuery &Query) -> bool {
1599 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1600 MemTy: Query.MMODescrs[0].MemoryTy);
1601 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1602
1603 if (!IsStore) {
1604 // Widen suitably aligned loads by loading extra bytes. The standard
1605 // legalization actions can't properly express widening memory operands.
1606 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1607 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1608 });
1609 }
1610
1611 // FIXME: load/store narrowing should be moved to lower action
1612 Actions
1613 .narrowScalarIf(
1614 Predicate: [=](const LegalityQuery &Query) -> bool {
1615 return !Query.Types[0].isVector() &&
1616 needToSplitMemOp(Query, Op == G_LOAD);
1617 },
1618 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1619 const LLT DstTy = Query.Types[0];
1620 const LLT PtrTy = Query.Types[1];
1621
1622 const unsigned DstSize = DstTy.getSizeInBits();
1623 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1624
1625 // Split extloads.
1626 if (DstSize > MemSize)
1627 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1628
1629 unsigned MaxSize = maxSizeForAddrSpace(
1630 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1631 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1632 if (MemSize > MaxSize)
1633 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1634
1635 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1636 return std::pair(0, LLT::scalar(SizeInBits: Align));
1637 })
1638 .fewerElementsIf(
1639 Predicate: [=](const LegalityQuery &Query) -> bool {
1640 return Query.Types[0].isVector() &&
1641 needToSplitMemOp(Query, Op == G_LOAD);
1642 },
1643 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1644 const LLT DstTy = Query.Types[0];
1645 const LLT PtrTy = Query.Types[1];
1646
1647 LLT EltTy = DstTy.getElementType();
1648 unsigned MaxSize = maxSizeForAddrSpace(
1649 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1650 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1651
1652 // FIXME: Handle widened to power of 2 results better. This ends
1653 // up scalarizing.
1654 // FIXME: 3 element stores scalarized on SI
1655
1656 // Split if it's too large for the address space.
1657 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1658 if (MemSize > MaxSize) {
1659 unsigned NumElts = DstTy.getNumElements();
1660 unsigned EltSize = EltTy.getSizeInBits();
1661
1662 if (MaxSize % EltSize == 0) {
1663 return std::pair(
1664 0, LLT::scalarOrVector(
1665 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1666 }
1667
1668 unsigned NumPieces = MemSize / MaxSize;
1669
1670 // FIXME: Refine when odd breakdowns handled
1671 // The scalars will need to be re-legalized.
1672 if (NumPieces == 1 || NumPieces >= NumElts ||
1673 NumElts % NumPieces != 0)
1674 return std::pair(0, EltTy);
1675
1676 return std::pair(0,
1677 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1678 }
1679
1680 // FIXME: We could probably handle weird extending loads better.
1681 if (DstTy.getSizeInBits() > MemSize)
1682 return std::pair(0, EltTy);
1683
1684 unsigned EltSize = EltTy.getSizeInBits();
1685 unsigned DstSize = DstTy.getSizeInBits();
1686 if (!isPowerOf2_32(Value: DstSize)) {
1687 // We're probably decomposing an odd sized store. Try to split
1688 // to the widest type. TODO: Account for alignment. As-is it
1689 // should be OK, since the new parts will be further legalized.
1690 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1691 return std::pair(
1692 0, LLT::scalarOrVector(
1693 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1694 }
1695
1696 // May need relegalization for the scalars.
1697 return std::pair(0, EltTy);
1698 })
1699 .minScalar(TypeIdx: 0, Ty: S32)
1700 .narrowScalarIf(Predicate: isTruncStoreToSizePowerOf2(TypeIdx: 0),
1701 Mutation: getScalarTypeFromMemDesc(TypeIdx: 0))
1702 .widenScalarToNextPow2(TypeIdx: 0)
1703 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1704 .lower();
1705 }
1706
1707 // FIXME: Unaligned accesses not lowered.
1708 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1709 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1710 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1711 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1712 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1713 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1714 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1715 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1716 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1717 .legalIf(
1718 Predicate: [=](const LegalityQuery &Query) -> bool {
1719 return isLoadStoreLegal(ST, Query);
1720 });
1721
1722 if (ST.hasFlatAddressSpace()) {
1723 ExtLoads.legalForTypesWithMemDesc(
1724 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1725 }
1726
1727 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1728 // 64-bits.
1729 //
1730 // TODO: Should generalize bitcast action into coerce, which will also cover
1731 // inserting addrspacecasts.
1732 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1733
1734 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1735 .widenScalarToNextPow2(TypeIdx: 0)
1736 .lower();
1737
1738 auto &Atomics = getActionDefinitionsBuilder(
1739 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1740 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1741 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1742 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1743 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1744 {S64, GlobalPtr}, {S64, LocalPtr},
1745 {S32, RegionPtr}, {S64, RegionPtr}});
1746 if (ST.hasFlatAddressSpace()) {
1747 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1748 }
1749
1750 auto &Atomics32 =
1751 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1752 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1753 if (ST.hasFlatAddressSpace()) {
1754 Atomics32.legalFor(Types: {{S32, FlatPtr}});
1755 }
1756
1757 // TODO: v2bf16 operations, and fat buffer pointer support.
1758 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1759 if (ST.hasLDSFPAtomicAddF32()) {
1760 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1761 if (ST.hasLdsAtomicAddF64())
1762 Atomic.legalFor(Types: {{S64, LocalPtr}});
1763 if (ST.hasAtomicDsPkAdd16Insts())
1764 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1765 }
1766 if (ST.hasAtomicFaddInsts())
1767 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1768 if (ST.hasFlatAtomicFaddF32Inst())
1769 Atomic.legalFor(Types: {{S32, FlatPtr}});
1770
1771 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1772 // These are legal with some caveats, and should have undergone expansion in
1773 // the IR in most situations
1774 // TODO: Move atomic expansion into legalizer
1775 Atomic.legalFor(Types: {
1776 {S32, GlobalPtr},
1777 {S64, GlobalPtr},
1778 {S64, FlatPtr}
1779 });
1780 }
1781
1782 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1783 ST.hasAtomicBufferGlobalPkAddF16Insts())
1784 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1785 if (ST.hasAtomicGlobalPkAddBF16Inst())
1786 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1787 if (ST.hasAtomicFlatPkAdd16Insts())
1788 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1789
1790
1791 // Most of the legalization work here is done by AtomicExpand. We could
1792 // probably use a simpler legality rule that just assumes anything is OK.
1793 auto &AtomicFMinFMax =
1794 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1795 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1796
1797 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1798 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1799 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1800 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1801 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1802 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1803 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1804 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1805
1806 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1807 // demarshalling
1808 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1809 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1810 {S32, FlatPtr}, {S64, FlatPtr}})
1811 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1812 {S32, RegionPtr}, {S64, RegionPtr}});
1813 // TODO: Pointer types, any 32-bit or 64-bit vector
1814
1815 // Condition should be s32 for scalar, s1 for vector.
1816 getActionDefinitionsBuilder(Opcode: G_SELECT)
1817 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1818 LocalPtr, FlatPtr, PrivatePtr,
1819 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1820 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1821 Types1: {S1, S32})
1822 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1823 .scalarize(TypeIdx: 1)
1824 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1825 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1826 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1827 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1828 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1829 .scalarize(TypeIdx: 0)
1830 .widenScalarToNextPow2(TypeIdx: 0)
1831 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1832
1833 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1834 // be more flexible with the shift amount type.
1835 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1836 .legalFor(Types: {{S32, S32}, {S64, S32}});
1837 if (ST.has16BitInsts()) {
1838 if (ST.hasVOP3PInsts()) {
1839 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1840 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1841 } else
1842 Shifts.legalFor(Types: {{S16, S16}});
1843
1844 // TODO: Support 16-bit shift amounts for all types
1845 Shifts.widenScalarIf(
1846 Predicate: [=](const LegalityQuery &Query) {
1847 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1848 // 32-bit amount.
1849 const LLT ValTy = Query.Types[0];
1850 const LLT AmountTy = Query.Types[1];
1851 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1852 AmountTy.getSizeInBits() < 16;
1853 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1854 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1855 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1856 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1857 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1858
1859 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1860 .minScalar(TypeIdx: 0, Ty: S16)
1861 .scalarize(TypeIdx: 0)
1862 .lower();
1863 } else {
1864 // Make sure we legalize the shift amount type first, as the general
1865 // expansion for the shifted type will produce much worse code if it hasn't
1866 // been truncated already.
1867 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1868 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1869 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1870
1871 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1872 .minScalar(TypeIdx: 0, Ty: S32)
1873 .scalarize(TypeIdx: 0)
1874 .lower();
1875 }
1876 Shifts.scalarize(TypeIdx: 0);
1877
1878 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1879 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1880 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1881 unsigned IdxTypeIdx = 2;
1882
1883 getActionDefinitionsBuilder(Opcode: Op)
1884 .customIf(Predicate: [=](const LegalityQuery &Query) {
1885 const LLT EltTy = Query.Types[EltTypeIdx];
1886 const LLT VecTy = Query.Types[VecTypeIdx];
1887 const LLT IdxTy = Query.Types[IdxTypeIdx];
1888 const unsigned EltSize = EltTy.getSizeInBits();
1889 const bool isLegalVecType =
1890 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1891 // Address space 8 pointers are 128-bit wide values, but the logic
1892 // below will try to bitcast them to 2N x s64, which will fail.
1893 // Therefore, as an intermediate step, wrap extracts/insertions from a
1894 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1895 // extraction result) in order to produce a vector operation that can
1896 // be handled by the logic below.
1897 if (EltTy.isPointer() && EltSize > 64)
1898 return true;
1899 return (EltSize == 32 || EltSize == 64) &&
1900 VecTy.getSizeInBits() % 32 == 0 &&
1901 VecTy.getSizeInBits() <= MaxRegisterSize &&
1902 IdxTy.getSizeInBits() == 32 &&
1903 isLegalVecType;
1904 })
1905 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1906 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1907 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1908 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1909 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1910 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1911 Mutation: [=](const LegalityQuery &Query) {
1912 // For > 64-bit element types, try to turn this into a
1913 // 64-bit element vector since we may be able to do better
1914 // indexing if this is scalar. If not, fall back to 32.
1915 const LLT EltTy = Query.Types[EltTypeIdx];
1916 const LLT VecTy = Query.Types[VecTypeIdx];
1917 const unsigned DstEltSize = EltTy.getSizeInBits();
1918 const unsigned VecSize = VecTy.getSizeInBits();
1919
1920 const unsigned TargetEltSize =
1921 DstEltSize % 64 == 0 ? 64 : 32;
1922 return std::pair(VecTypeIdx,
1923 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
1924 ScalarSizeInBits: TargetEltSize));
1925 })
1926 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1927 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1928 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1929 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1930 // TODO: Clamp elements for 64-bit vectors?
1931 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
1932 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1933 // It should only be necessary with variable indexes.
1934 // As a last resort, lower to the stack
1935 .lower();
1936 }
1937
1938 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1939 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1940 const LLT &EltTy = Query.Types[1].getElementType();
1941 return Query.Types[0] != EltTy;
1942 });
1943
1944 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1945 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1946 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1947 getActionDefinitionsBuilder(Opcode: Op)
1948 .widenScalarIf(
1949 Predicate: [=](const LegalityQuery &Query) {
1950 const LLT BigTy = Query.Types[BigTyIdx];
1951 return (BigTy.getScalarSizeInBits() < 16);
1952 },
1953 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1954 .widenScalarIf(
1955 Predicate: [=](const LegalityQuery &Query) {
1956 const LLT LitTy = Query.Types[LitTyIdx];
1957 return (LitTy.getScalarSizeInBits() < 16);
1958 },
1959 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1960 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1961 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32)
1962 .customIf(Predicate: [=](const LegalityQuery &Query) {
1963 // Generic lower operates on the full-width value, producing
1964 // shift+trunc/mask sequences. For simple cases where extract/insert
1965 // values are 32-bit aligned, we can instead unmerge/merge and work on
1966 // the 32-bit components. However, we can't check the offset here so
1967 // custom lower function will have to call generic lowering if offset
1968 // is not 32-bit aligned.
1969 const LLT BigTy = Query.Types[BigTyIdx];
1970 const LLT LitTy = Query.Types[LitTyIdx];
1971 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
1972 LitTy.getSizeInBits() % 32 == 0;
1973 })
1974 .lower();
1975 }
1976
1977 auto &BuildVector =
1978 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1979 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1980 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1981 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1982 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1983 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1984 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
1985 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1986
1987 if (ST.hasScalarPackInsts()) {
1988 BuildVector
1989 // FIXME: Should probably widen s1 vectors straight to s32
1990 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1991 .minScalar(TypeIdx: 1, Ty: S16);
1992
1993 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1994 .legalFor(Types: {V2S16, S32})
1995 .lower();
1996 } else {
1997 BuildVector.customFor(Types: {V2S16, S16});
1998 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1999
2000 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
2001 .customFor(Types: {V2S16, S32})
2002 .lower();
2003 }
2004
2005 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
2006
2007 // FIXME: Clamp maximum size
2008 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
2009 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2010 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
2011 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
2012 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
2013
2014 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
2015
2016 // Merge/Unmerge
2017 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2018 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2019 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2020
2021 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2022 const LLT Ty = Query.Types[TypeIdx];
2023 if (Ty.isVector()) {
2024 const LLT &EltTy = Ty.getElementType();
2025 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2026 return true;
2027 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
2028 return true;
2029 }
2030 return false;
2031 };
2032
2033 auto &Builder =
2034 getActionDefinitionsBuilder(Opcode: Op)
2035 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2036 .lowerFor(Types: {{S16, V2S16}})
2037 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
2038 const LLT BigTy = Query.Types[BigTyIdx];
2039 return BigTy.getSizeInBits() == 32;
2040 })
2041 // Try to widen to s16 first for small types.
2042 // TODO: Only do this on targets with legal s16 shifts
2043 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
2044 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
2045 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
2046 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
2047 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
2048 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
2049 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
2050 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2051 // not worth considering the multiples of 64 since 2*192 and 2*384
2052 // are not valid.
2053 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
2054 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
2055 // Break up vectors with weird elements into scalars
2056 .fewerElementsIf(
2057 Predicate: [=](const LegalityQuery &Query) {
2058 return notValidElt(Query, LitTyIdx);
2059 },
2060 Mutation: scalarize(TypeIdx: 0))
2061 .fewerElementsIf(
2062 Predicate: [=](const LegalityQuery &Query) {
2063 return notValidElt(Query, BigTyIdx);
2064 },
2065 Mutation: scalarize(TypeIdx: 1))
2066 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
2067
2068 if (Op == G_MERGE_VALUES) {
2069 Builder.widenScalarIf(
2070 // TODO: Use 16-bit shifts if legal for 8-bit values?
2071 Predicate: [=](const LegalityQuery &Query) {
2072 const LLT Ty = Query.Types[LitTyIdx];
2073 return Ty.getSizeInBits() < 32;
2074 },
2075 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
2076 }
2077
2078 Builder.widenScalarIf(
2079 Predicate: [=](const LegalityQuery &Query) {
2080 const LLT Ty = Query.Types[BigTyIdx];
2081 return Ty.getSizeInBits() % 16 != 0;
2082 },
2083 Mutation: [=](const LegalityQuery &Query) {
2084 // Pick the next power of 2, or a multiple of 64 over 128.
2085 // Whichever is smaller.
2086 const LLT &Ty = Query.Types[BigTyIdx];
2087 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2088 if (NewSizeInBits >= 256) {
2089 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2090 if (RoundedTo < NewSizeInBits)
2091 NewSizeInBits = RoundedTo;
2092 }
2093 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2094 })
2095 // Any vectors left are the wrong size. Scalarize them.
2096 .scalarize(TypeIdx: 0)
2097 .scalarize(TypeIdx: 1);
2098 }
2099
2100 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2101 // RegBankSelect.
2102 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2103 .legalFor(Types: {{S32}, {S64}})
2104 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2105
2106 if (ST.hasVOP3PInsts()) {
2107 SextInReg.lowerFor(Types: {{V2S16}})
2108 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2109 // get more vector shift opportunities, since we'll get those when
2110 // expanded.
2111 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2112 } else if (ST.has16BitInsts()) {
2113 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2114 } else {
2115 // Prefer to promote to s32 before lowering if we don't have 16-bit
2116 // shifts. This avoid a lot of intermediate truncate and extend operations.
2117 SextInReg.lowerFor(Types: {{S32}, {S64}});
2118 }
2119
2120 SextInReg
2121 .scalarize(TypeIdx: 0)
2122 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2123 .lower();
2124
2125 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2126 .scalarize(TypeIdx: 0)
2127 .lower();
2128
2129 auto &FSHRActionDefs = getActionDefinitionsBuilder(Opcode: G_FSHR);
2130 FSHRActionDefs.legalFor(Types: {{S32, S32}})
2131 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2132 if (ST.hasVOP3PInsts())
2133 FSHRActionDefs.lowerFor(Types: {{V2S16, V2S16}});
2134 FSHRActionDefs.scalarize(TypeIdx: 0).lower();
2135
2136 if (ST.hasVOP3PInsts()) {
2137 getActionDefinitionsBuilder(Opcode: G_FSHL)
2138 .lowerFor(Types: {{V2S16, V2S16}})
2139 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2140 .scalarize(TypeIdx: 0)
2141 .lower();
2142 } else {
2143 getActionDefinitionsBuilder(Opcode: G_FSHL)
2144 .scalarize(TypeIdx: 0)
2145 .lower();
2146 }
2147
2148 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2149 .legalFor(Types: {S64});
2150
2151 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2152
2153 getActionDefinitionsBuilder(Opcode: G_FENCE)
2154 .alwaysLegal();
2155
2156 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2157 .scalarize(TypeIdx: 0)
2158 .minScalar(TypeIdx: 0, Ty: S32)
2159 .lower();
2160
2161 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2162 .legalFor(Types: {{S32, S32}, {S64, S32}})
2163 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2164 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2165 .widenScalarToNextPow2(TypeIdx: 0)
2166 .scalarize(TypeIdx: 0);
2167
2168 getActionDefinitionsBuilder(
2169 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2170 G_FCOPYSIGN,
2171
2172 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2173 G_READ_REGISTER, G_WRITE_REGISTER,
2174
2175 G_SADDO, G_SSUBO})
2176 .lower();
2177
2178 if (ST.hasIEEEMinimumMaximumInsts()) {
2179 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2180 .legalFor(Types: FPTypesPK16)
2181 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2182 .scalarize(TypeIdx: 0);
2183 } else if (ST.hasVOP3PInsts()) {
2184 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2185 .lowerFor(Types: {V2S16})
2186 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2187 .scalarize(TypeIdx: 0)
2188 .lower();
2189 } else {
2190 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2191 .scalarize(TypeIdx: 0)
2192 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2193 .lower();
2194 }
2195
2196 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2197 .lower();
2198
2199 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2200
2201 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2202 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2203 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2204 .unsupported();
2205
2206 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2207
2208 getActionDefinitionsBuilder(
2209 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2210 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2211 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2212 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2213 .legalFor(Types: AllVectors)
2214 .scalarize(TypeIdx: 1)
2215 .lower();
2216
2217 getLegacyLegalizerInfo().computeTables();
2218 verify(MII: *ST.getInstrInfo());
2219}
2220
2221bool AMDGPULegalizerInfo::legalizeCustom(
2222 LegalizerHelper &Helper, MachineInstr &MI,
2223 LostDebugLocObserver &LocObserver) const {
2224 MachineIRBuilder &B = Helper.MIRBuilder;
2225 MachineRegisterInfo &MRI = *B.getMRI();
2226
2227 switch (MI.getOpcode()) {
2228 case TargetOpcode::G_ADDRSPACE_CAST:
2229 return legalizeAddrSpaceCast(MI, MRI, B);
2230 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2231 return legalizeFroundeven(MI, MRI, B);
2232 case TargetOpcode::G_FCEIL:
2233 return legalizeFceil(MI, MRI, B);
2234 case TargetOpcode::G_FREM:
2235 return legalizeFrem(MI, MRI, B);
2236 case TargetOpcode::G_INTRINSIC_TRUNC:
2237 return legalizeIntrinsicTrunc(MI, MRI, B);
2238 case TargetOpcode::G_SITOFP:
2239 return legalizeITOFP(MI, MRI, B, Signed: true);
2240 case TargetOpcode::G_UITOFP:
2241 return legalizeITOFP(MI, MRI, B, Signed: false);
2242 case TargetOpcode::G_FPTOSI:
2243 return legalizeFPTOI(MI, MRI, B, Signed: true);
2244 case TargetOpcode::G_FPTOUI:
2245 return legalizeFPTOI(MI, MRI, B, Signed: false);
2246 case TargetOpcode::G_FMINNUM:
2247 case TargetOpcode::G_FMAXNUM:
2248 case TargetOpcode::G_FMINIMUMNUM:
2249 case TargetOpcode::G_FMAXIMUMNUM:
2250 return legalizeMinNumMaxNum(Helper, MI);
2251 case TargetOpcode::G_EXTRACT:
2252 return legalizeExtract(Helper, MI);
2253 case TargetOpcode::G_INSERT:
2254 return legalizeInsert(Helper, MI);
2255 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2256 return legalizeExtractVectorElt(MI, MRI, B);
2257 case TargetOpcode::G_INSERT_VECTOR_ELT:
2258 return legalizeInsertVectorElt(MI, MRI, B);
2259 case TargetOpcode::G_FSIN:
2260 case TargetOpcode::G_FCOS:
2261 return legalizeSinCos(MI, MRI, B);
2262 case TargetOpcode::G_GLOBAL_VALUE:
2263 return legalizeGlobalValue(MI, MRI, B);
2264 case TargetOpcode::G_LOAD:
2265 case TargetOpcode::G_SEXTLOAD:
2266 case TargetOpcode::G_ZEXTLOAD:
2267 return legalizeLoad(Helper, MI);
2268 case TargetOpcode::G_STORE:
2269 return legalizeStore(Helper, MI);
2270 case TargetOpcode::G_FMAD:
2271 return legalizeFMad(MI, MRI, B);
2272 case TargetOpcode::G_FDIV:
2273 return legalizeFDIV(MI, MRI, B);
2274 case TargetOpcode::G_FFREXP:
2275 return legalizeFFREXP(MI, MRI, B);
2276 case TargetOpcode::G_FSQRT:
2277 return legalizeFSQRT(MI, MRI, B);
2278 case TargetOpcode::G_UDIV:
2279 case TargetOpcode::G_UREM:
2280 case TargetOpcode::G_UDIVREM:
2281 return legalizeUnsignedDIV_REM(MI, MRI, B);
2282 case TargetOpcode::G_SDIV:
2283 case TargetOpcode::G_SREM:
2284 case TargetOpcode::G_SDIVREM:
2285 return legalizeSignedDIV_REM(MI, MRI, B);
2286 case TargetOpcode::G_ATOMIC_CMPXCHG:
2287 return legalizeAtomicCmpXChg(MI, MRI, B);
2288 case TargetOpcode::G_FLOG2:
2289 return legalizeFlog2(MI, B);
2290 case TargetOpcode::G_FLOG:
2291 case TargetOpcode::G_FLOG10:
2292 return legalizeFlogCommon(MI, B);
2293 case TargetOpcode::G_FEXP2:
2294 return legalizeFExp2(MI, B);
2295 case TargetOpcode::G_FEXP:
2296 case TargetOpcode::G_FEXP10:
2297 return legalizeFExp(MI, B);
2298 case TargetOpcode::G_FPOW:
2299 return legalizeFPow(MI, B);
2300 case TargetOpcode::G_FFLOOR:
2301 return legalizeFFloor(MI, MRI, B);
2302 case TargetOpcode::G_BUILD_VECTOR:
2303 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2304 return legalizeBuildVector(MI, MRI, B);
2305 case TargetOpcode::G_MUL:
2306 return legalizeMul(Helper, MI);
2307 case TargetOpcode::G_CTLZ:
2308 case TargetOpcode::G_CTTZ:
2309 return legalizeCTLZ_CTTZ(MI, MRI, B);
2310 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2311 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2312 case TargetOpcode::G_STACKSAVE:
2313 return legalizeStackSave(MI, B);
2314 case TargetOpcode::G_GET_FPENV:
2315 return legalizeGetFPEnv(MI, MRI, B);
2316 case TargetOpcode::G_SET_FPENV:
2317 return legalizeSetFPEnv(MI, MRI, B);
2318 case TargetOpcode::G_TRAP:
2319 return legalizeTrap(MI, MRI, B);
2320 case TargetOpcode::G_DEBUGTRAP:
2321 return legalizeDebugTrap(MI, MRI, B);
2322 default:
2323 return false;
2324 }
2325
2326 llvm_unreachable("expected switch to return");
2327}
2328
// Return a 32-bit (s32) register holding the high half of the aperture base
// for the given segment address space (LOCAL or PRIVATE). The value is used to
// form the high word of a 64-bit flat pointer. Returns an invalid Register if
// the required input argument could not be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  // Fast path: subtargets with dedicated aperture registers can read the
  // aperture directly, without any memory access.
  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must emit extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Copy the full 64-bit register, then take the high word.
    Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
    MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
    B.buildCopy(Res: {Dst}, Op: {Register(ApertureRegNo)});
    return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
  }

  Register LoadAddr = MRI.createGenericVirtualRegister(
    Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());

    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
      Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

    if (!loadInputValue(DstReg: KernargPtrReg, B,
                        ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // The kernarg segment is constant and dereferenceable, so the load can be
    // freely rematerialized/hoisted.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: PtrInfo.getWithOffset(O: Offset),
      f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));

    // Pointer address
    B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
                           Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
    // Load address
    return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
  }

  // Pre-COV5 fallback: read the aperture base from the amd_queue_t structure
  // reachable through the queue pointer argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

  if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // TODO: Use custom PseudoSourceValue
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));

  B.buildObjectPtrOffset(
    Res: LoadAddr, Op0: QueuePtr,
    Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
  return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
}
2414
2415/// Return true if the value is a known valid address, such that a null check is
2416/// not necessary.
2417static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2418 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2419 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2420 switch (Def->getOpcode()) {
2421 case AMDGPU::G_FRAME_INDEX:
2422 case AMDGPU::G_GLOBAL_VALUE:
2423 case AMDGPU::G_BLOCK_ADDR:
2424 return true;
2425 case AMDGPU::G_CONSTANT: {
2426 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2427 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AS: AddrSpace);
2428 }
2429 default:
2430 return false;
2431 }
2432
2433 return false;
2434}
2435
// Lower an address space cast (or the nonnull-asserting intrinsic form) into
// target instructions. Flat <-> segment (local/private) casts build or strip
// the 64-bit aperture-based representation; casts known to be non-null skip
// the null-preserving select.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  // MI can either be a G_ADDRSPACE_CAST or a
  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  const LLT S32 = LLT::scalar(SizeInBits: 32);
  Register Dst = MI.getOperand(i: 0).getReg();
  // For the intrinsic form the source pointer is operand 2 (operand 1 is the
  // intrinsic ID).
  Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
                                       : MI.getOperand(i: 1).getReg();
  LLT DstTy = MRI.getType(Reg: Dst);
  LLT SrcTy = MRI.getType(Reg: Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts between address spaces with identical representation are just
  // bitcasts; rewrite in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
    return true;
  }

  // flat -> local/private: the segment pointer is the low 32 bits of the flat
  // pointer (or, with globally addressable scratch, the low word minus the
  // flat scratch base).
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          ST.hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        const LLT S32 = LLT::scalar(SizeInBits: 32);
        Register SrcLo = B.buildExtract(Res: S32, Src, Index: 0).getReg(Idx: 0);
        Register FlatScratchBaseLo =
            B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
                         SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
                .getReg(Idx: 0);
        MRI.setRegClass(Reg: FlatScratchBaseLo, RC: &AMDGPU::SReg_32RegClass);
        Register Sub = B.buildSub(Dst: S32, Src0: SrcLo, Src1: FlatScratchBaseLo).getReg(Idx: 0);
        return B.buildIntToPtr(Dst, Src: Sub).getReg(Idx: 0);
      }

      // Extract low 32-bits of the pointer.
      return B.buildExtract(Res: Dst, Src, Index: 0).getReg(Idx: 0);
    };

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
      castFlatToLocalOrPrivate(Dst);
      MI.eraseFromParent();
      return true;
    }

    // Null must map to null: select the segment null value when the flat
    // source compares equal to flat null (0).
    unsigned NullVal = AMDGPU::getNullPointerValue(AS: DestAS);

    auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
    auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

    auto CmpRes =
        B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
    B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));

    MI.eraseFromParent();
    return true;
  }

  // local/private -> flat: merge the 32-bit segment offset with the high half
  // of the aperture base (or add the full flat scratch base for globally
  // addressable scratch).
  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
      // Coerce the type of the low half of the result so we can use
      // merge_values.
      Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);

      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          ST.hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        Register AllOnes = B.buildConstant(Res: S32, Val: -1).getReg(Idx: 0);
        Register ThreadID = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
        // mbcnt_lo (then mbcnt_hi on wave64) computes the lane index.
        ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Res: {S32})
                       .addUse(RegNo: AllOnes)
                       .addUse(RegNo: ThreadID)
                       .getReg(Idx: 0);
        if (ST.isWave64()) {
          ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_hi, Res: {S32})
                         .addUse(RegNo: AllOnes)
                         .addUse(RegNo: ThreadID)
                         .getReg(Idx: 0);
        }
        // Place the thread ID into the high word at bit (57 - wavesize_log2),
        // i.e. shift by that amount minus 32 within the high half.
        Register ShAmt =
            B.buildConstant(Res: S32, Val: 57 - 32 - ST.getWavefrontSizeLog2()).getReg(Idx: 0);
        Register SrcHi = B.buildShl(Dst: S32, Src0: ThreadID, Src1: ShAmt).getReg(Idx: 0);
        Register CvtPtr =
            B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, SrcHi}).getReg(Idx: 0);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        Register FlatScratchBase =
            B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {S64},
                         SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
                .getReg(Idx: 0);
        MRI.setRegClass(Reg: FlatScratchBase, RC: &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Res: Dst, Op0: CvtPtr, Op1: FlatScratchBase).getReg(Idx: 0);
      }

      Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
      if (!ApertureReg.isValid())
        return false;

      // TODO: Should we allow mismatched types but matching sizes in merges to
      // avoid the ptrtoint?
      return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
    };

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
      castLocalOrPrivateToFlat(Dst);
      MI.eraseFromParent();
      return true;
    }

    // Null must map to null: select flat null when the segment source equals
    // its address space's null value.
    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

    auto SegmentNull =
        B.buildConstant(Res: SrcTy, Val: AMDGPU::getNullPointerValue(AS: SrcAS));
    auto FlatNull = B.buildConstant(Res: DstTy, Val: AMDGPU::getNullPointerValue(AS: DestAS));

    auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
                              Op1: SegmentNull.getReg(Idx: 0));

    B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);

    MI.eraseFromParent();
    return true;
  }

  // 64-bit -> 32-bit constant address space: keep the low word.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Res: Dst, Src, Index: 0);
    MI.eraseFromParent();
    return true;
  }

  // 32-bit constant address space -> 64-bit: widen with the function's known
  // high address bits (zero-extend when they are zero).
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
    if (AddrHiVal == 0) {
      auto Zext = B.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: PtrLo);
      B.buildIntToPtr(Dst, Src: Zext);
    } else {
      auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
      B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
    }

    MI.eraseFromParent();
    return true;
  }

  // Invalid casts are poison.
  // TODO: Should return poison
  B.buildUndef(Res: Dst);
  MI.eraseFromParent();
  return true;
}
2617
2618bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2619 MachineRegisterInfo &MRI,
2620 MachineIRBuilder &B) const {
2621 Register Src = MI.getOperand(i: 1).getReg();
2622 LLT Ty = MRI.getType(Reg: Src);
2623 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2624
2625 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2626 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2627
2628 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2629 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2630
2631 // TODO: Should this propagate fast-math-flags?
2632 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2633 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2634
2635 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2636 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2637
2638 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2639 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2640 MI.eraseFromParent();
2641 return true;
2642}
2643
2644bool AMDGPULegalizerInfo::legalizeFceil(
2645 MachineInstr &MI, MachineRegisterInfo &MRI,
2646 MachineIRBuilder &B) const {
2647
2648 const LLT S1 = LLT::scalar(SizeInBits: 1);
2649 const LLT S64 = LLT::scalar(SizeInBits: 64);
2650
2651 Register Src = MI.getOperand(i: 1).getReg();
2652 assert(MRI.getType(Src) == S64);
2653
2654 // result = trunc(src)
2655 // if (src > 0.0 && src != result)
2656 // result += 1.0
2657
2658 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2659
2660 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2661 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2662 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2663 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2664 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2665 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2666
2667 // TODO: Should this propagate fast-math-flags?
2668 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2669 MI.eraseFromParent();
2670 return true;
2671}
2672
2673bool AMDGPULegalizerInfo::legalizeFrem(
2674 MachineInstr &MI, MachineRegisterInfo &MRI,
2675 MachineIRBuilder &B) const {
2676 Register DstReg = MI.getOperand(i: 0).getReg();
2677 Register Src0Reg = MI.getOperand(i: 1).getReg();
2678 Register Src1Reg = MI.getOperand(i: 2).getReg();
2679 auto Flags = MI.getFlags();
2680 LLT Ty = MRI.getType(Reg: DstReg);
2681
2682 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2683 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2684 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2685 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2686 MI.eraseFromParent();
2687 return true;
2688}
2689
2690static MachineInstrBuilder extractF64Exponent(Register Hi,
2691 MachineIRBuilder &B) {
2692 const unsigned FractBits = 52;
2693 const unsigned ExpBits = 11;
2694 LLT S32 = LLT::scalar(SizeInBits: 32);
2695
2696 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2697 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2698
2699 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2700 .addUse(RegNo: Hi)
2701 .addUse(RegNo: Const0.getReg(Idx: 0))
2702 .addUse(RegNo: Const1.getReg(Idx: 0));
2703
2704 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2705}
2706
// Expand G_INTRINSIC_TRUNC for f64 by directly masking off fraction bits
// below the exponent, since there is no native f64 trunc on these targets.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  Register Src = MI.getOperand(i: 1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
  Register Hi = Unmerge.getReg(Idx: 1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);

  // Mask covering all 52 fraction bits of the f64.
  const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(Res: S32, Val: 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});

  // Arithmetic shift of the fraction mask by the exponent selects the bits
  // that are below the integer part; clearing them truncates the value.
  auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
  auto Not = B.buildNot(Dst: S64, Src0: Shr);
  auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
  auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);

  // Exponent < 0: |x| < 1, so the result is just the signed zero.
  // Exponent > 51: no fraction bits exist, so x is already integral.
  auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
  auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);

  auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
  B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
  MI.eraseFromParent();
  return true;
}
2751
// Lower s64 -> f64/f32 integer-to-FP conversion. \p Signed selects
// G_SITOFP vs G_UITOFP semantics.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();

  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
  auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);

  // s64 -> f64: convert each half separately and combine as
  // (fp(hi) * 2^32) + fp(lo). The high half carries the sign; the low half
  // is always treated as unsigned.
  if (MRI.getType(Reg: Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
                        : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));

    auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
    auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  // s64 -> f32: normalize the value so its significant bits fit in 32 bits,
  // convert that, then scale back with ldexp.
  auto One = B.buildConstant(Res: S32, Val: 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    // For the signed case, shift so that exactly one sign bit remains,
    // using ffbh (find-first-set-bit from high) on the high half, clamped
    // so we never shift past the opposite-signed boundary.
    auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
    auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
    auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
    auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
    auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
                  .addUse(RegNo: Unmerge.getReg(Idx: 1));
    auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
    ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
  auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
  auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
  // Fold any bits shifted out of the low word into the sticky bit (bit 0) so
  // rounding of the f32 result is still correct.
  auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
  auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
  auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
  auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
  B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
  MI.eraseFromParent();
  return true;
}
2806
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.

// Lower f32/f64 -> s64 FP-to-integer conversion by splitting the value into
// high and low 32-bit integer halves. \p Signed selects G_FPTOSI vs G_FPTOUI
// semantics.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();

  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  const LLT SrcLT = MRI.getType(Reg: Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
    Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
  }
  // K0 = 2^-32 splits off the high part; K1 = -2^32 recovers the low part via
  // fma(hif, -2^32, tf).
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
      Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
      Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
      Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
      Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
  auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
  auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);

  // Only the s64-signed case converts the high half as signed; the
  // s32-signed case handled the sign separately above.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
                                     : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
  auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
               Src1: Sign);
  } else
    B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2878
2879bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2880 MachineInstr &MI) const {
2881 MachineFunction &MF = Helper.MIRBuilder.getMF();
2882 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2883
2884 // With ieee_mode disabled, the instructions have the correct behavior.
2885 if (!MFI->getMode().IEEE)
2886 return true;
2887
2888 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2889}
2890
2891bool AMDGPULegalizerInfo::legalizeExtract(LegalizerHelper &Helper,
2892 MachineInstr &MI) const {
2893 MachineIRBuilder &B = Helper.MIRBuilder;
2894 MachineRegisterInfo &MRI = *B.getMRI();
2895 Register DstReg = MI.getOperand(i: 0).getReg();
2896 Register SrcReg = MI.getOperand(i: 1).getReg();
2897 uint64_t Offset = MI.getOperand(i: 2).getImm();
2898
2899 // Fall back to generic lowering for offset 0 (trivial trunc) and
2900 // non-32-bit-aligned cases which require shift+trunc sequences
2901 // that generic code handles correctly.
2902 if (Offset == 0 || Offset % 32 != 0)
2903 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2904
2905 const LLT DstTy = MRI.getType(Reg: DstReg);
2906 unsigned StartIdx = Offset / 32;
2907 unsigned DstCount = DstTy.getSizeInBits() / 32;
2908 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: SrcReg);
2909
2910 if (DstCount == 1) {
2911 if (DstTy.isPointer())
2912 B.buildIntToPtr(Dst: DstReg, Src: Unmerge.getReg(Idx: StartIdx));
2913 else
2914 MRI.replaceRegWith(FromReg: DstReg, ToReg: Unmerge.getReg(Idx: StartIdx));
2915 } else {
2916 SmallVector<Register, 8> MergeVec;
2917 for (unsigned I = 0; I < DstCount; ++I)
2918 MergeVec.push_back(Elt: Unmerge.getReg(Idx: StartIdx + I));
2919 B.buildMergeLikeInstr(Res: DstReg, Ops: MergeVec);
2920 }
2921
2922 MI.eraseFromParent();
2923 return true;
2924}
2925
2926bool AMDGPULegalizerInfo::legalizeInsert(LegalizerHelper &Helper,
2927 MachineInstr &MI) const {
2928 MachineIRBuilder &B = Helper.MIRBuilder;
2929 MachineRegisterInfo &MRI = *B.getMRI();
2930 Register DstReg = MI.getOperand(i: 0).getReg();
2931 Register SrcReg = MI.getOperand(i: 1).getReg();
2932 Register InsertSrc = MI.getOperand(i: 2).getReg();
2933 uint64_t Offset = MI.getOperand(i: 3).getImm();
2934
2935 unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
2936 const LLT InsertTy = MRI.getType(Reg: InsertSrc);
2937 unsigned InsertSize = InsertTy.getSizeInBits();
2938
2939 // Fall back to generic lowering for non-32-bit-aligned cases which
2940 // require shift+mask sequences that generic code handles correctly.
2941 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2942 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2943
2944 const LLT S32 = LLT::scalar(SizeInBits: 32);
2945 unsigned DstCount = DstSize / 32;
2946 unsigned InsertCount = InsertSize / 32;
2947 unsigned StartIdx = Offset / 32;
2948
2949 auto SrcUnmerge = B.buildUnmerge(Res: S32, Op: SrcReg);
2950
2951 SmallVector<Register, 8> MergeVec;
2952 for (unsigned I = 0; I < StartIdx; ++I)
2953 MergeVec.push_back(Elt: SrcUnmerge.getReg(Idx: I));
2954
2955 if (InsertCount == 1) {
2956 // Merge-like instructions require same source types. Convert pointer
2957 // to scalar when inserting a pointer value into a scalar.
2958 if (InsertTy.isPointer())
2959 InsertSrc = B.buildPtrToInt(Dst: S32, Src: InsertSrc).getReg(Idx: 0);
2960 MergeVec.push_back(Elt: InsertSrc);
2961 } else {
2962 auto InsertUnmerge = B.buildUnmerge(Res: S32, Op: InsertSrc);
2963 for (unsigned I = 0; I < InsertCount; ++I)
2964 MergeVec.push_back(Elt: InsertUnmerge.getReg(Idx: I));
2965 }
2966
2967 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
2968 MergeVec.push_back(Elt: SrcUnmerge.getReg(Idx: I));
2969
2970 B.buildMergeLikeInstr(Res: DstReg, Ops: MergeVec);
2971
2972 MI.eraseFromParent();
2973 return true;
2974}
2975
// Lower G_EXTRACT_VECTOR_ELT: constant indices become an unmerge + copy
// (or undef when out of bounds); wide pointer elements are routed through an
// integer vector; dynamic indices are left for instruction selection.
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Vec = MI.getOperand(i: 1).getReg();

  LLT VecTy = MRI.getType(Reg: Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic becasue you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, introduce an intermediate
  // vector of integers using ptrtoint (and inttoptr on the output) in order to
  // drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);

    auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
    auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
    B.buildIntToPtr(Dst, Src: IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  // In-bounds constant index: split the vector and copy out the element.
  // Out-of-bounds: the result is undefined.
  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
    B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
  } else {
    B.buildUndef(Res: Dst);
  }

  MI.eraseFromParent();
  return true;
}
3026
// Lower G_INSERT_VECTOR_ELT: constant indices become unmerge + remerge with
// the element replaced (or undef when out of bounds); wide pointer elements
// are routed through an integer vector; dynamic indices are left for
// instruction selection.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Vec = MI.getOperand(i: 1).getReg();
  Register Ins = MI.getOperand(i: 2).getReg();

  LLT VecTy = MRI.getType(Reg: Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic becasue you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, make the pointer vector
  // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
  // new value, and then inttoptr the result vector back. This will then allow
  // the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);

    auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
    auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
    auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
                                                 Idx: MI.getOperand(i: 3));
    B.buildIntToPtr(Dst, Src: IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  // In-bounds constant index: split into elements, replace the one at IdxVal,
  // and rebuild. Out-of-bounds: the result is undefined.
  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
    B.buildUnmerge(Res: SrcRegs, Op: Vec);

    SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
    B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
  } else {
    B.buildUndef(Res: Dst);
  }

  MI.eraseFromParent();
  return true;
}
3087
3088bool AMDGPULegalizerInfo::legalizeSinCos(
3089 MachineInstr &MI, MachineRegisterInfo &MRI,
3090 MachineIRBuilder &B) const {
3091
3092 Register DstReg = MI.getOperand(i: 0).getReg();
3093 Register SrcReg = MI.getOperand(i: 1).getReg();
3094 LLT Ty = MRI.getType(Reg: DstReg);
3095 unsigned Flags = MI.getFlags();
3096
3097 Register TrigVal;
3098 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
3099 if (ST.hasTrigReducedRange()) {
3100 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
3101 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
3102 .addUse(RegNo: MulVal.getReg(Idx: 0))
3103 .setMIFlags(Flags)
3104 .getReg(Idx: 0);
3105 } else
3106 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
3107
3108 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3109 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3110 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
3111 .addUse(RegNo: TrigVal)
3112 .setMIFlags(Flags);
3113 MI.eraseFromParent();
3114 return true;
3115}
3116
// Materialize a pc-relative address for \p GV (+ \p Offset) into \p DstReg.
// \p GAFlags selects the relocation variant (MO_NONE for a plain constant
// offset); 32-bit pointer results go through a 64-bit temporary and an
// extract of the low half.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.

  LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);

  // The pseudo always produces a 64-bit value; for a 32-bit result pointer,
  // build into a scratch 64-bit register and extract the low half below.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);

  if (ST.has64BitLiterals()) {
    assert(GAFlags != SIInstrInfo::MO_NONE);

    // Single 64-bit literal form. Note(review): GAFlags + 2 presumably maps
    // the LO-relocation flag to its 64-bit counterpart — confirm against the
    // SIInstrInfo::MO_* flag ordering.
    MachineInstrBuilder MIB =
        B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(RegNo: PCReg);
    MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 2);
  } else {
    MachineInstrBuilder MIB =
        B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(RegNo: PCReg);

    // With relocations, the second operand is the matching HI flag
    // (GAFlags + 1); without, the high half is a literal 0.
    MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
    if (GAFlags == SIInstrInfo::MO_NONE)
      MIB.addImm(Val: 0);
    else
      MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
  }

  if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
    B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
  return true;
}
3176
// Emit a ABS32_LO / ABS32_HI relocation stub.
//
// Materializes the absolute address of \p GV into \p DstReg with S_MOV
// relocations: one S_MOV_B64 (MO_ABS64) when 64-bit literals are available,
// otherwise separate lo/hi 32-bit moves merged into the 64-bit result.
// 32-bit pointer types only get the lo half.
void AMDGPULegalizerInfo::buildAbsGlobalAddress(
    Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
    MachineRegisterInfo &MRI) const {
  bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    // Single 64-bit move with a full 64-bit absolute relocation.
    if (!MRI.getRegClassOrNull(Reg: DstReg))
      MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_64RegClass);
    B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
        .addDef(RegNo: DstReg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS64);
    return;
  }

  LLT S32 = LLT::scalar(SizeInBits: 32);

  // Use the destination directly, if and only if we store the lower address
  // part only and we don't have a register class being set.
  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
                        ? DstReg
                        : MRI.createGenericVirtualRegister(Ty: S32);

  if (!MRI.getRegClassOrNull(Reg: AddrLo))
    MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);

  // Write the lower half.
  B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
      .addDef(RegNo: AddrLo)
      .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);

  // If required, write the upper half as well.
  if (RequiresHighHalf) {
    assert(PtrTy.getSizeInBits() == 64 &&
           "Must provide a 64-bit pointer type!");

    Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
    MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);

    B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
        .addDef(RegNo: AddrHi)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);

    // Use the destination directly, if and only if we don't have a register
    // class being set.
    Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
                           ? DstReg
                           : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));

    if (!MRI.getRegClassOrNull(Reg: AddrDst))
      MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});

    // If we created a new register for the destination, cast the result into
    // the final output.
    if (AddrDst != DstReg)
      B.buildCast(Dst: DstReg, Src: AddrDst);
  } else if (AddrLo != DstReg) {
    // If we created a new register for the destination, cast the result into
    // the final output.
    B.buildCast(Dst: DstReg, Src: AddrLo);
  }
}
3241
// Legalize G_GLOBAL_VALUE.
//
// LDS (local/region) globals resolve to a statically allocated offset (or to
// the groupstaticsize intrinsic for dynamic LDS). Other globals become
// absolute-relocation stubs (PAL/Mesa), pc-relative fixups, pc-relative
// relocations, or GOT loads, depending on what the target lowering requests.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS globals are only directly addressable from (module entry) kernels;
    // module-LDS and named barriers are handled by dedicated lowering.
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      const Function &Fn = MF.getFunction();
      Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildTrap();
      B.buildUndef(Res: DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      // Keep the G_GLOBAL_VALUE and mark it for an absolute-lo relocation.
      MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(F: MF.getFunction(), GV: GVar);
        LLT S32 = LLT::scalar(SizeInBits: 32);
        // The dynamic LDS block starts right after all static LDS, i.e. at
        // the total static group size.
        auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
        B.buildIntToPtr(Dst: DstReg, Src: Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Static LDS: the address is the compile-time-assigned offset.
    B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(), GV: GVar));
    MI.eraseFromParent();
    return true;
  }

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address pc-relatively,
  // then load the actual address from it.
  LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo::getGOT(MF),
      f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemTy: LoadTy, base_alignment: Align(8));

  buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
    B.buildExtract(Res: DstReg, Src: Load, Index: 0);
  } else
    B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);

  MI.eraseFromParent();
  return true;
}
3347
3348static LLT widenToNextPowerOf2(LLT Ty) {
3349 if (Ty.isVector())
3350 return Ty.changeElementCount(
3351 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3352 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3353}
3354
// Custom legalization for loads:
//  * 32-bit constant-address-space pointers are cast to 64-bit constant
//    pointers.
//  * Buffer-resource (rsrc) results are rewritten via the v4i32 workaround.
//  * Non-power-of-2 memory sizes are widened to the next power of 2 when the
//    alignment allows, with the requested value truncated / extracted back.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(i: 1).getReg();
  LLT PtrTy = MRI.getType(Reg: PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Rewrite the pointer operand in place; the load itself stays.
    LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
    auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
    Observer.changedInstr(MI);
    return true;
  }

  // The remaining cases only apply to plain G_LOAD (not extending loads etc.).
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(i: 0).getReg();
  LLT ValTy = MRI.getType(Reg: ValReg);

  if (hasBufferRsrcWorkaround(Ty: ValTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    return true;
  }

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, MemRefs: {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(Ty: ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate down to the requested width.
      WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
      B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ST, Ty: ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
        B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
        B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
3443
3444bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3445 MachineInstr &MI) const {
3446 MachineIRBuilder &B = Helper.MIRBuilder;
3447 MachineRegisterInfo &MRI = *B.getMRI();
3448 GISelChangeObserver &Observer = Helper.Observer;
3449
3450 Register DataReg = MI.getOperand(i: 0).getReg();
3451 LLT DataTy = MRI.getType(Reg: DataReg);
3452
3453 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3454 Observer.changingInstr(MI);
3455 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3456 Observer.changedInstr(MI);
3457 return true;
3458 }
3459 return false;
3460}
3461
3462bool AMDGPULegalizerInfo::legalizeFMad(
3463 MachineInstr &MI, MachineRegisterInfo &MRI,
3464 MachineIRBuilder &B) const {
3465 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3466 assert(Ty.isScalar());
3467
3468 MachineFunction &MF = B.getMF();
3469 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3470
3471 // TODO: Always legal with future ftz flag.
3472 // FIXME: Do we need just output?
3473 if (Ty == LLT::float32() &&
3474 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3475 return true;
3476 if (Ty == LLT::float16() &&
3477 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3478 return true;
3479
3480 MachineIRBuilder HelperBuilder(MI);
3481 GISelObserverWrapper DummyObserver;
3482 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3483 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3484}
3485
3486bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3487 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3488 Register DstReg = MI.getOperand(i: 0).getReg();
3489 Register PtrReg = MI.getOperand(i: 1).getReg();
3490 Register CmpVal = MI.getOperand(i: 2).getReg();
3491 Register NewVal = MI.getOperand(i: 3).getReg();
3492
3493 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3494 "this should not have been custom lowered");
3495
3496 LLT ValTy = MRI.getType(Reg: CmpVal);
3497 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3498
3499 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3500
3501 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3502 .addDef(RegNo: DstReg)
3503 .addUse(RegNo: PtrReg)
3504 .addUse(RegNo: PackedVal)
3505 .setMemRefs(MI.memoperands());
3506
3507 MI.eraseFromParent();
3508 return true;
3509}
3510
3511/// Return true if it's known that \p Src can never be an f32 denormal value.
3512static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3513 Register Src) {
3514 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3515 switch (DefMI->getOpcode()) {
3516 case TargetOpcode::G_INTRINSIC: {
3517 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3518 case Intrinsic::amdgcn_frexp_mant:
3519 case Intrinsic::amdgcn_log:
3520 case Intrinsic::amdgcn_log_clamp:
3521 case Intrinsic::amdgcn_exp2:
3522 case Intrinsic::amdgcn_sqrt:
3523 return true;
3524 default:
3525 break;
3526 }
3527
3528 break;
3529 }
3530 case TargetOpcode::G_FSQRT:
3531 return true;
3532 case TargetOpcode::G_FFREXP: {
3533 if (DefMI->getOperand(i: 0).getReg() == Src)
3534 return true;
3535 break;
3536 }
3537 case TargetOpcode::G_FPEXT: {
3538 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3539 }
3540 default:
3541 return false;
3542 }
3543
3544 return false;
3545}
3546
3547static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3548 return Flags & MachineInstr::FmAfn;
3549}
3550
3551static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3552 unsigned Flags) {
3553 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3554 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3555 DenormalMode::PreserveSign;
3556}
3557
// If f32 denormal handling is needed for \p Src, pre-scale it into the normal
// range for the hardware log: values below the smallest normalized f32 are
// multiplied by 2^32 (others by 1.0). Returns {scaled input, s1 condition};
// the condition is true when the 2^32 scale was applied, so the caller can
// compensate in the result. Returns a pair of null Registers when no denormal
// handling is required.
std::pair<Register, Register>
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
                                       unsigned Flags) const {
  if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
    return {};

  const LLT F32 = LLT::scalar(SizeInBits: 32);
  auto SmallestNormal = B.buildFConstant(
      Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
  auto IsLtSmallestNormal =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);

  // Scale by 2^32 only for (possibly) denormal inputs.
  auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
  auto One = B.buildFConstant(Res: F32, Val: 1.0);
  auto ScaleFactor =
      B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
  auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);

  return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
}
3578
// Legalize G_FLOG2 via the amdgcn_log intrinsic, with an f32 denormal
// pre-scale/compensate when required. f16 is promoted to f32 first.
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(SizeInBits: 16)) {
    const LLT F32 = LLT::scalar(SizeInBits: 32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
    auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
                    .addUse(RegNo: Ext.getReg(Idx: 0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    // No denormal handling needed: use the hardware log directly.
    B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
        .addUse(RegNo: Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
                  .addUse(RegNo: ScaledInput)
                  .setMIFlags(Flags);

  // log2(x * 2^32) = log2(x) + 32, so subtract 32 exactly when we scaled.
  auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto ResultOffset =
      B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
  B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}
3628
3629static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3630 Register Z, unsigned Flags) {
3631 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3632 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3633}
3634
// Expand G_FLOG / G_FLOG10 for f32 (f16 promotes or takes the fast path).
//
// The accurate path multiplies the hardware log2 result by ln(2) (or
// ln(2)/ln(10)) represented as a two-part constant, with an error-correction
// step, then patches up non-finite inputs and undoes any denormal pre-scale.
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(Reg: X);

  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT F16 = LLT::scalar(SizeInBits: 16);

  if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn)) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.
    bool PromoteToF32 =
        Ty == F16 && (!MI.getFlag(Flag: MachineInstr::FmAfn) || !ST.has16BitInsts());
    if (PromoteToF32) {
      Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
      auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
      legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
      B.buildFPTrunc(Res: Dst, Op: LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  // Pre-scale possibly-denormal inputs by 2^32; compensated at the end.
  auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);

  Register R;
  if (ST.hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    auto NewFlags = Flags & ~(MachineInstr::FmContract);
    R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags: NewFlags).getReg(Idx: 0);
    auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags: NewFlags);
    auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags: NewFlags);
    auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags: NewFlags);
    R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags: NewFlags).getReg(Idx: 0);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);

    // Split Y into a high part (low mantissa bits masked off) and the
    // remainder so the two-part constant multiply stays accurate.
    auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
    auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
    auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    auto NewFlags = Flags & ~(MachineInstr::FmContract);
    auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags: NewFlags);

    Register Mad0 =
        getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags: NewFlags);
    Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags: NewFlags);
    R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags: NewFlags);
  }

  const bool IsFiniteOnly =
      MI.getFlag(Flag: MachineInstr::FmNoNans) && MI.getFlag(Flag: MachineInstr::FmNoInfs);

  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    // Pass non-finite hardware-log results (Y) straight through.
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
    auto IsFinite =
        B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
    R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
  }

  if (ScaledInput) {
    // Undo the 2^32 input scale: subtract 32*log(2) (resp. 32*log10(2)).
    auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
    auto ShiftK =
        B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
    B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
  } else {
    B.buildCopy(Res: Dst, Op: R);
  }

  MI.eraseFromParent();
  return true;
}
3745
3746bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3747 Register Src, bool IsLog10,
3748 unsigned Flags) const {
3749 const double Log2BaseInverted =
3750 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3751
3752 LLT Ty = B.getMRI()->getType(Reg: Dst);
3753
3754 if (Ty == LLT::scalar(SizeInBits: 32)) {
3755 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3756 if (ScaledInput) {
3757 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3758 .addUse(RegNo: Src)
3759 .setMIFlags(Flags);
3760 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3761 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3762 auto ResultOffset =
3763 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3764 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3765
3766 if (ST.hasFastFMAF32())
3767 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3768 else {
3769 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3770 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3771 }
3772
3773 return true;
3774 }
3775 }
3776
3777 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3778 ? B.buildFLog2(Dst: Ty, Src, Flags)
3779 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3780 .addUse(RegNo: Src)
3781 .setMIFlags(Flags);
3782 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3783 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3784 return true;
3785}
3786
// Legalize G_FEXP2. f16 is promoted to f32 (no f16 value becomes a denormal
// in f32); f64 goes to the dedicated expansion. For f32, when denormal
// results matter, very negative inputs get +64 added before the hardware
// exp2 and the result is multiplied by 2^-64 to compensate.
bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  const LLT F16 = LLT::scalar(SizeInBits: 16);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT F64 = LLT::scalar(SizeInBits: 64);

  if (Ty == F64)
    return legalizeFEXPF64(MI, B);

  if (Ty == F16) {
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
    auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
                    .addUse(RegNo: Ext.getReg(Idx: 0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
    // No denormal concerns: the hardware exp2 is sufficient by itself.
    B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
        .addUse(RegNo: Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
  auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
                                  Op1: RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
  auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: AddInput.getReg(Idx: 0))
                  .setMIFlags(Flags);

  // exp2(x + 64) * 2^-64 == exp2(x), computed without losing denormals.
  auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
  auto One = B.buildFConstant(Res: Ty, Val: 1.0);
  auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
  B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}
3848
3849static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3850 const SrcOp &Src, unsigned Flags) {
3851 LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
3852
3853 if (Ty == LLT::scalar(SizeInBits: 32)) {
3854 return B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Dst})
3855 .addUse(RegNo: Src.getReg())
3856 .setMIFlags(Flags);
3857 }
3858 return B.buildFExp2(Dst, Src, Flags);
3859}
3860
3861bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3862 Register Dst, Register X,
3863 unsigned Flags,
3864 bool IsExp10) const {
3865 LLT Ty = B.getMRI()->getType(Reg: X);
3866
3867 // exp(x) -> exp2(M_LOG2E_F * x);
3868 // exp10(x) -> exp2(log2(10) * x);
3869 auto Const = B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3870 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Const, Flags);
3871 buildExp(B, Dst, Src: Mul, Flags);
3872 return true;
3873}
3874
// Approximate lowering of exp(x). Without f32 denormal concerns this is just
// exp2(log2(e) * x); otherwise inputs below the threshold (~ -87.3, where the
// f32 result approaches the denormal range) get +64 added first, and the
// result is multiplied back by exp(-64) so denormal outputs survive.
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  LLT F32 = LLT::scalar(SizeInBits: 32);

  if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
    return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
  }

  // Threshold is -0x1.5d58a0p+6f (~ -87.3).
  auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
  auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);

  auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
  auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: ExpInput.getReg(Idx: 0))
                  .setMIFlags(Flags);

  // 0x1.969d48p-93f ~= exp(-64): undo the +64 offset applied above.
  auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
  B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
  return true;
}
3903
// Approximate lowering of exp10(x) as a product of two exp2 terms, using
// log2(10) split into a high part K0 and a low correction K1 for accuracy.
// When f32 denormal results matter, very negative inputs get +32 added first
// and the result is multiplied back by ~10^-32 to compensate.
bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
                                               Register Dst, Register X,
                                               unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  LLT F32 = LLT::scalar(SizeInBits: 32);

  if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Dst: Ty, Src0: X, Src1: K1, Flags);
    auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
    auto Mul0 = B.buildFMul(Dst: Ty, Src0: X, Src1: K0, Flags);
    auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
    B.buildFMul(Dst, Src0: Exp2_0, Src1: Exp2_1, Flags);
    return true;
  }

  // bool s = x < -0x1.2f7030p+5f;
  //  x += s ? 0x1.0p+5f : 0.0f;
  //  exp10 = exp2(x * 0x1.a92000p+1f) *
  //          exp2(x * 0x1.4f0978p-11f) *
  //          (s ? 0x1.9f623ep-107f : 1.0f);

  auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.2f7030p+5f);
  auto NeedsScaling =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold);

  auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X);

  // K0 + K1 ~= log2(10): coarse high part plus small correction term.
  auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K1, Flags);
  auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
  auto Mul0 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K0, Flags);
  auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);

  auto MulExps = B.buildFMul(Dst: Ty, Src0: Exp2_0, Src1: Exp2_1, Flags);
  // 0x1.9f623ep-107f ~= 10^-32: undo the +32 offset applied above.
  auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: MulExps, Src1: ResultScaleFactor, Flags);

  B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: MulExps);
  return true;
}
3952
// This expansion gives a result slightly better than 1ulp.
//
// Common scheme for G_FEXP / G_FEXP2 / G_FEXP10 on f64: split the input into
// an integral base-2 exponent Dn and a reduced argument, scale the reduced
// argument into natural-log space as T, evaluate a shared polynomial for e^T,
// and produce ldexp(e^T, (int)Dn). Explicit guards clamp the overflow and
// underflow extremes.
bool AMDGPULegalizerInfo::legalizeFEXPF64(MachineInstr &MI,
                                          MachineIRBuilder &B) const {

  Register X = MI.getOperand(i: 1).getReg();
  LLT S64 = LLT::scalar(SizeInBits: 64);
  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  // TODO: Check if reassoc is safe. There is an output change in exp2 and
  // exp10, which slightly increases ulp.
  unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;

  Register Dn, F, T;

  if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
    // Dn = rint(X)
    Dn = B.buildFRint(Dst: S64, Src0: X, Flags).getReg(Idx: 0);
    // F = X - Dn
    F = B.buildFSub(Dst: S64, Src0: X, Src1: Dn, Flags).getReg(Idx: 0);
    // T = F*C1 + F*C2
    // C1 + C2 is an extended-precision ln(2), so T = F * ln(2).
    auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.62e42fefa39efp-1));
    auto C2 = B.buildFConstant(Res: S64, Val: APFloat(0x1.abc9e3b39803fp-56));
    auto Mul2 = B.buildFMul(Dst: S64, Src0: F, Src1: C2, Flags).getReg(Idx: 0);
    T = B.buildFMA(Dst: S64, Src0: F, Src1: C1, Src2: Mul2, Flags).getReg(Idx: 0);

  } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
    // Dn = rint(X * log2(10)); C1 is log2(10).
    auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.a934f0979a371p+1));
    auto Mul = B.buildFMul(Dst: S64, Src0: X, Src1: C1, Flags).getReg(Idx: 0);
    Dn = B.buildFRint(Dst: S64, Src0: Mul, Flags).getReg(Idx: 0);

    // F = X - Dn * log10(2); C3 + C2 is an extended-precision log10(2).
    auto NegDn = B.buildFNeg(Dst: S64, Src0: Dn, Flags).getReg(Idx: 0);
    auto C2 = B.buildFConstant(Res: S64, Val: APFloat(-0x1.9dc1da994fd21p-59));
    auto C3 = B.buildFConstant(Res: S64, Val: APFloat(0x1.34413509f79ffp-2));
    auto Inner = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C3, Src2: X, Flags).getReg(Idx: 0);
    F = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C2, Src2: Inner, Flags).getReg(Idx: 0);

    // T = F * ln(10); C4 + C5 is an extended-precision ln(10).
    auto C4 = B.buildFConstant(Res: S64, Val: APFloat(0x1.26bb1bbb55516p+1));
    auto C5 = B.buildFConstant(Res: S64, Val: APFloat(-0x1.f48ad494ea3e9p-53));
    auto MulF = B.buildFMul(Dst: S64, Src0: F, Src1: C5, Flags).getReg(Idx: 0);
    T = B.buildFMA(Dst: S64, Src0: F, Src1: C4, Src2: MulF, Flags).getReg(Idx: 0);

  } else { // G_FEXP
    // Dn = rint(X * log2(e)); C1 is log2(e).
    auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.71547652b82fep+0));
    auto Mul = B.buildFMul(Dst: S64, Src0: X, Src1: C1, Flags).getReg(Idx: 0);
    Dn = B.buildFRint(Dst: S64, Src0: Mul, Flags).getReg(Idx: 0);

    // T = X - Dn * ln(2); C3 + C2 is an extended-precision ln(2). The result
    // is already in natural-log space, so no separate scale step is needed.
    auto NegDn = B.buildFNeg(Dst: S64, Src0: Dn, Flags).getReg(Idx: 0);
    auto C2 = B.buildFConstant(Res: S64, Val: APFloat(0x1.abc9e3b39803fp-56));
    auto C3 = B.buildFConstant(Res: S64, Val: APFloat(0x1.62e42fefa39efp-1));
    auto Inner = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C3, Src2: X, Flags).getReg(Idx: 0);
    T = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C2, Src2: Inner, Flags).getReg(Idx: 0);
  }

  // Polynomial chain for P: Horner evaluation of e^T. The coefficients run
  // from ~1/11! down to ~1/2! (slightly adjusted from exact factorials),
  // followed by the linear and constant 1.0 terms below.
  auto P = B.buildFConstant(Res: S64, Val: 0x1.ade156a5dcb37p-26);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.28af3fca7ab0cp-22),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.71dee623fde64p-19),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.a01997c89e6b0p-16),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.a01a014761f6ep-13),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.6c16c1852b7b0p-10),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.1111111122322p-7), Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.55555555502a1p-5), Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.5555555555511p-3), Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.000000000000bp-1), Flags);

  auto One = B.buildFConstant(Res: S64, Val: 1.0);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: One, Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: One, Flags);

  // Z = FLDEXP(P, (int)Dn)
  auto DnInt = B.buildFPTOSI(Dst: S32, Src0: Dn);
  auto Z = B.buildFLdexp(Dst: S64, Src0: P, Src1: DnInt, Flags);

  if (!(Flags & MachineInstr::FmNoInfs)) {
    // Overflow guard: if X <= 1024.0 then Z else +inf
    // The unordered <= keeps Z for NaN X, so the NaN propagates.
    auto CondHi = B.buildFCmp(Pred: CmpInst::FCMP_ULE, Res: S1, Op0: X,
                              Op1: B.buildFConstant(Res: S64, Val: APFloat(1024.0)));
    auto PInf = B.buildFConstant(Res: S64, Val: APFloat::getInf(Sem: APFloat::IEEEdouble()));
    Z = B.buildSelect(Res: S64, Tst: CondHi, Op0: Z, Op1: PInf, Flags);
  }

  // Underflow guard: if X >= -1075.0 then Z else 0.0
  // (unordered >= likewise keeps Z for NaN inputs).
  auto CondLo = B.buildFCmp(Pred: CmpInst::FCMP_UGE, Res: S1, Op0: X,
                            Op1: B.buildFConstant(Res: S64, Val: APFloat(-1075.0)));
  auto Zero = B.buildFConstant(Res: S64, Val: APFloat(0.0));
  B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: CondLo, Op0: Z, Op1: Zero, Flags);

  MI.eraseFromParent();
  return true;
}
4049
// Custom legalization of G_FEXP / G_FEXP10 for f16 and f32 (f64 is forwarded
// to legalizeFEXPF64). The precise f32 path computes x*log2(e) (or
// x*log2(10)) in extended precision as PH + PL, splits off the nearest
// integer E, feeds the fractional remainder to the hardware exp2, and scales
// by 2^E via ldexp, with explicit underflow/overflow clamps.
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  const unsigned Flags = MI.getFlags();
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT Ty = MRI.getType(Reg: Dst);

  const LLT F64 = LLT::scalar(SizeInBits: 64);

  if (Ty == F64)
    return legalizeFEXPF64(MI, B);

  const LLT F16 = LLT::scalar(SizeInBits: 16);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

  if (Ty == F16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(MF, Flags)) {
      // TODO: Does this really require fast?
      IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
              : legalizeFExpUnsafe(B, Dst, X, Flags);
      MI.eraseFromParent();
      return true;
    }

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
    legalizeFExpUnsafeImpl(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags, IsExp10);
    B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(MF, Flags)) {
    IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
            : legalizeFExpUnsafe(B, Dst, X, Flags);
    MI.eraseFromParent();
    return true;
  }

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
  Register PH, PL;

  if (ST.hasFastFMAF32()) {
    // With fast FMA, compute PH = x*c and recover the rounding error into PL
    // with FMAs, using a two-constant (c + cc) split of the log constant.
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
    auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
    auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);

    auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
  } else {
    // Without fast FMA, split x itself into a high part (low 12 mantissa bits
    // masked off) and a low remainder, so the x*c products are exact enough.
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
    auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
    auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);

    auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);

    auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);

    Register Mad0 =
        getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
    PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
  }

  // E = round-to-even integer part of PH; A = exact fractional remainder.
  auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
  auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
  auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);

  // R = ldexp(exp2(A), E) = 2^(A + E).
  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: A.getReg(Idx: 0))
                  .setMIFlags(Flags);
  auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);

  // Below the underflow threshold the result is flushed to +0.
  auto UnderflowCheckConst =
      B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto Underflow =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);

  R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);

  if (!(Flags & MachineInstr::FmNoInfs)) {
    // Above the overflow threshold the result is clamped to +inf.
    auto OverflowCheckConst =
        B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

    auto Overflow =
        B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
    R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
  }

  B.buildCopy(Res: Dst, Op: R);
  MI.eraseFromParent();
  return true;
}
4200
4201bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
4202 MachineIRBuilder &B) const {
4203 Register Dst = MI.getOperand(i: 0).getReg();
4204 Register Src0 = MI.getOperand(i: 1).getReg();
4205 Register Src1 = MI.getOperand(i: 2).getReg();
4206 unsigned Flags = MI.getFlags();
4207 LLT Ty = B.getMRI()->getType(Reg: Dst);
4208 const LLT F16 = LLT::float16();
4209 const LLT F32 = LLT::float32();
4210
4211 if (Ty == F32) {
4212 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
4213 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4214 .addUse(RegNo: Log.getReg(Idx: 0))
4215 .addUse(RegNo: Src1)
4216 .setMIFlags(Flags);
4217 B.buildFExp2(Dst, Src: Mul, Flags);
4218 } else if (Ty == F16) {
4219 // There's no f16 fmul_legacy, so we need to convert for it.
4220 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
4221 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
4222 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
4223 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4224 .addUse(RegNo: Ext0.getReg(Idx: 0))
4225 .addUse(RegNo: Ext1.getReg(Idx: 0))
4226 .setMIFlags(Flags);
4227 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
4228 } else
4229 return false;
4230
4231 MI.eraseFromParent();
4232 return true;
4233}
4234
4235// Find a source register, ignoring any possible source modifiers.
4236static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4237 Register ModSrc = OrigSrc;
4238 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
4239 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
4240 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4241 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4242 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4243 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4244 return ModSrc;
4245}
4246
4247bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4248 MachineRegisterInfo &MRI,
4249 MachineIRBuilder &B) const {
4250
4251 const LLT S1 = LLT::scalar(SizeInBits: 1);
4252 const LLT F64 = LLT::float64();
4253 Register Dst = MI.getOperand(i: 0).getReg();
4254 Register OrigSrc = MI.getOperand(i: 1).getReg();
4255 unsigned Flags = MI.getFlags();
4256 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4257 "this should not have been custom lowered");
4258
4259 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4260 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4261 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4262 // V_FRACT bug is:
4263 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4264 //
4265 // Convert floor(x) to (x - fract(x))
4266
4267 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
4268 .addUse(RegNo: OrigSrc)
4269 .setMIFlags(Flags);
4270
4271 // Give source modifier matching some assistance before obscuring a foldable
4272 // pattern.
4273
4274 // TODO: We can avoid the neg on the fract? The input sign to fract
4275 // shouldn't matter?
4276 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4277
4278 auto Const =
4279 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
4280
4281 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
4282
4283 // We don't need to concern ourselves with the snan handling difference, so
4284 // use the one which will directly select.
4285 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4286 if (MFI->getMode().IEEE)
4287 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
4288 else
4289 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
4290
4291 Register CorrectedFract = Min;
4292 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
4293 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
4294 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
4295 }
4296
4297 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
4298 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
4299
4300 MI.eraseFromParent();
4301 return true;
4302}
4303
4304// Turn an illegal packed v2s16 build vector into bit operations.
4305// TODO: This should probably be a bitcast action in LegalizerHelper.
4306bool AMDGPULegalizerInfo::legalizeBuildVector(
4307 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4308 Register Dst = MI.getOperand(i: 0).getReg();
4309 const LLT S32 = LLT::scalar(SizeInBits: 32);
4310 const LLT S16 = LLT::scalar(SizeInBits: 16);
4311 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4312
4313 Register Src0 = MI.getOperand(i: 1).getReg();
4314 Register Src1 = MI.getOperand(i: 2).getReg();
4315
4316 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4317 assert(MRI.getType(Src0) == S32);
4318 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
4319 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
4320 }
4321
4322 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
4323 B.buildBitcast(Dst, Src: Merge);
4324
4325 MI.eraseFromParent();
4326 return true;
4327}
4328
// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
//
// Source and accumulation registers must all be 32-bits.
//
// TODO: When the multiply is uniform, we should produce a code sequence
// that is better suited to instruction selection on the SALU. Instead of
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
                                        MutableArrayRef<Register> Accum,
                                        ArrayRef<Register> Src0,
                                        ArrayRef<Register> Src1,
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelValueTracking &VT = *Helper.getValueTracking();

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  Register Zero32;
  Register Zero64;

  // Lazily materialize the 32-bit / 64-bit zero constants on first use.
  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
    return Zero64;
  };

  // Precompute which 32-bit source parts are known zero, so the corresponding
  // partial products can be skipped entirely.
  SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
  for (unsigned i = 0; i < Src0.size(); ++i) {
    Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
    Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
  }

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          if (!LocalAccum) {
            // A single carry into an empty accumulator is just a zext.
            LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          // Fold all but the last carry into CarryAccum with a chain of adds.
          CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
                    .getReg(Idx: 0);
          }

          if (!LocalAccum) {
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        auto Add =
            B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
        LocalAccum = Add.getReg(Idx: 0);
        return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //              + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
          -> Carry {
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Skip multiplication if one of the operands is 0
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
            if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
              LocalAccum[0] = Mul.getReg(Idx: 0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
              } else {
                // Consume one pending carry "for free" with this add.
                LocalAccum[0] =
                    B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
                        .getReg(Idx: 0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          // Seed the 64-bit accumulator from whatever 32-bit state exists.
          // HaveSmallAccum records that the high half is known zero, so the
          // first MAD cannot carry out.
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
                                    SrcOps: {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(Idx: 0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
            HaveSmallAccum = false;

            ++j0;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
          LocalAccum[0] = Unmerge.getReg(Idx: 0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(Idx: 1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0
  //   ------
  //   Carries from previous iteration:     e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        // Compute the odd-aligned sum into fresh registers, then add it into
        // Accum with an explicit carry chain (SeparateOddCarry).
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(N: IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          if (!IsHighest)
            Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
          else
            Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
                            CarryIn: SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
                                 CarryIn: Lo->getOperand(i: 1).getReg());
          Accum[2 * i] = Hi.getReg(Idx: 0);
          SeparateOddCarry = Hi.getReg(Idx: 1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(Elt: CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(Elt: CarryOut);
      }
    }
  }
}
4586
4587// Custom narrowing of wide multiplies using wide multiply-add instructions.
4588//
4589// TODO: If the multiply is followed by an addition, we should attempt to
4590// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4591bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4592 MachineInstr &MI) const {
4593 assert(ST.hasMad64_32());
4594 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4595
4596 MachineIRBuilder &B = Helper.MIRBuilder;
4597 MachineRegisterInfo &MRI = *B.getMRI();
4598
4599 Register DstReg = MI.getOperand(i: 0).getReg();
4600 Register Src0 = MI.getOperand(i: 1).getReg();
4601 Register Src1 = MI.getOperand(i: 2).getReg();
4602
4603 LLT Ty = MRI.getType(Reg: DstReg);
4604 assert(Ty.isScalar());
4605
4606 unsigned Size = Ty.getSizeInBits();
4607 if (ST.hasVectorMulU64() && Size == 64)
4608 return true;
4609
4610 unsigned NumParts = Size / 32;
4611 assert((Size % 32) == 0);
4612 assert(NumParts >= 2);
4613
4614 // Whether to use MAD_64_32 for partial products whose high half is
4615 // discarded. This avoids some ADD instructions but risks false dependency
4616 // stalls on some subtargets in some cases.
4617 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4618
4619 // Whether to compute odd-aligned partial products separately. This is
4620 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4621 // in an even-aligned VGPR.
4622 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4623
4624 LLT S32 = LLT::scalar(SizeInBits: 32);
4625 SmallVector<Register, 2> Src0Parts, Src1Parts;
4626 for (unsigned i = 0; i < NumParts; ++i) {
4627 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4628 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4629 }
4630 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4631 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4632
4633 SmallVector<Register, 2> AccumRegs(NumParts);
4634 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4635 SeparateOddAlignedProducts);
4636
4637 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4638 MI.eraseFromParent();
4639 return true;
4640}
4641
4642// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4643// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4644// case with a single min instruction instead of a compare+select.
4645bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4646 MachineRegisterInfo &MRI,
4647 MachineIRBuilder &B) const {
4648 Register Dst = MI.getOperand(i: 0).getReg();
4649 Register Src = MI.getOperand(i: 1).getReg();
4650 LLT DstTy = MRI.getType(Reg: Dst);
4651 LLT SrcTy = MRI.getType(Reg: Src);
4652
4653 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4654 ? AMDGPU::G_AMDGPU_FFBH_U32
4655 : AMDGPU::G_AMDGPU_FFBL_B32;
4656 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4657 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4658
4659 MI.eraseFromParent();
4660 return true;
4661}
4662
4663bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4664 MachineRegisterInfo &MRI,
4665 MachineIRBuilder &B) const {
4666 Register Dst = MI.getOperand(i: 0).getReg();
4667 Register Src = MI.getOperand(i: 1).getReg();
4668 LLT SrcTy = MRI.getType(Reg: Src);
4669 TypeSize NumBits = SrcTy.getSizeInBits();
4670
4671 assert(NumBits < 32u);
4672
4673 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4674 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4675 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4676 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4677 B.buildTrunc(Res: Dst, Op: Ctlz);
4678 MI.eraseFromParent();
4679 return true;
4680}
4681
4682// Check that this is a G_XOR x, -1
4683static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4684 if (MI.getOpcode() != TargetOpcode::G_XOR)
4685 return false;
4686 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4687 return ConstVal == -1;
4688}
4689
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// On success, the G_BRCOND using MI's condition output is returned, and the
// out parameters are filled in:
//  - Br: the unconditional G_BR following the G_BRCOND, if one exists.
//  - UncondBrTarget: the G_BR's target, or the layout successor block when
//    the G_BRCOND is the last instruction in its block.
//  - Negated: set if the condition was routed through a (G_XOR cond, -1),
//    which this function deletes.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(i: 0).getReg();
  // The condition output must have exactly one non-debug use.
  if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);

  // Look through a single not (xor cond, -1) of the condition.
  if (isNot(MRI, MI: *UseMI)) {
    Register NegatedCond = UseMI->getOperand(i: 0).getReg();
    if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(MI&: *UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
    Negated = true;
  }

  // The (possibly negated) condition must feed a G_BRCOND in the same block.
  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
  if (Next == Parent->end()) {
    // Fallthrough case: use the next block in layout order.
    MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(i: 0).getMBB();
  }

  return UseMI;
}
4732
/// Copy the preloaded argument \p Arg from its physical register into
/// \p DstReg. If the argument is packed into a bitfield of the source
/// register (Arg->isMasked()), extract it with a logical shift right followed
/// by an AND of the shifted mask.
void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
                                              MachineIRBuilder &B,
                                              const ArgDescriptor *Arg,
                                              const TargetRegisterClass *ArgRC,
                                              LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  // Register the physical register as a function live-in and get a virtual
  // register copy of it to read from.
  Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
                                             RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(SizeInBits: 32);
    const unsigned Mask = Arg->getMask();
    // Shift amount is the position of the field's lowest set bit.
    const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);

    Register AndMaskSrc = LiveIn;

    // TODO: Avoid clearing the high bits if we know workitem id y/z are always
    // 0.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
      AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
    }

    // Mask with the field width (mask shifted down to bit 0).
    B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
  } else {
    B.buildCopy(Res: DstReg, Op: LiveIn);
  }
}
4764
/// Legalize a workgroup-id intrinsic.
///
/// Without cluster support this is a plain read of the preloaded workgroup
/// id. With clusters, the value behind \p WorkGroupIdPV is the cluster id,
/// so the global workgroup id is reconstructed as
///   ClusterId * ClusterSize + ClusterWorkGroupId
/// where ClusterSize = ClusterMaxId + 1. Whether clusters are actually in
/// use is decided from the function's cluster-dims attribute, or at run time
/// when the attribute is unknown.
bool AMDGPULegalizerInfo::legalizeWorkGroupId(
    MachineInstr &MI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!ST.hasClusters()) {
    // No clusters on this subtarget: the preloaded value already is the
    // workgroup id.
    if (!loadInputValue(DstReg, B, ArgType: WorkGroupIdPV))
      return false;
    MI.eraseFromParent();
    return true;
  }

  // Clusters are supported. Return the global position in the grid. If clusters
  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.

  // WorkGroupIdXYZ = ClusterId == 0 ?
  //                  ClusterIdXYZ :
  //                  ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  Register ClusterIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  if (!loadInputValue(DstReg: ClusterIdXYZ, B, ArgType: WorkGroupIdPV) ||
      !loadInputValue(DstReg: ClusterWorkGroupIdXYZ, B, ArgType: ClusterWorkGroupIdPV) ||
      !loadInputValue(DstReg: ClusterMaxIdXYZ, B, ArgType: ClusterMaxIdPV))
    return false;

  auto One = B.buildConstant(Res: S32, Val: 1);
  auto ClusterSizeXYZ = B.buildAdd(Dst: S32, Src0: ClusterMaxIdXYZ, Src1: One);
  auto GlobalIdXYZ = B.buildAdd(Dst: S32, Src0: ClusterWorkGroupIdXYZ,
                                Src1: B.buildMul(Dst: S32, Src0: ClusterIdXYZ, Src1: ClusterSizeXYZ));

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  switch (MFI->getClusterDims().getKind()) {
  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
  case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
    // Clusters are known to be in use: the reconstructed global id is correct.
    B.buildCopy(Res: DstReg, Op: GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
    // Known cluster-free launch: the preloaded "cluster id" is the
    // workgroup id itself.
    B.buildCopy(Res: DstReg, Op: ClusterIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
    // Cluster usage is only known at run time: read the cluster-id field
    // (offset 6, width 4) from the IB_STS2 hardware register and select
    // between the two computations.
    using namespace AMDGPU::Hwreg;
    unsigned ClusterIdField = HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4);
    Register ClusterId = MRI.createGenericVirtualRegister(Ty: S32);
    MRI.setRegClass(Reg: ClusterId, RC: &AMDGPU::SReg_32RegClass);
    B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
        .addDef(RegNo: ClusterId)
        .addImm(Val: ClusterIdField);
    auto Zero = B.buildConstant(Res: S32, Val: 0);
    auto NoClusters =
        B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: ClusterId, Op1: Zero);
    B.buildSelect(Res: DstReg, Tst: NoClusters, Op0: ClusterIdXYZ, Op1: GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  }

  llvm_unreachable("nothing should reach here");
}
4832
/// Materialize the preloaded value \p ArgType into \p DstReg.
///
/// On subtargets with architected SGPRs (for compute / amdgpu_gfx calling
/// conventions) the workgroup and cluster ids live at fixed bitfield
/// positions in TTMP registers and are described explicitly below; cluster
/// quantities that are compile-time known from the cluster-dims attribute
/// fold directly to constants. Everything else falls back to the function's
/// argument info. Returns false only for argument kinds that are present but
/// not register-based (TODO cases).
bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg = nullptr;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  // Fixed TTMP locations for the architected-SGPR path. X lives in TTMP9;
  // Y/Z share TTMP7 (low/high halves); the cluster fields are packed in
  // 4-bit nibbles of TTMP6.
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      Reg: AMDGPU::TTMP7,
      Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
  const ArgDescriptor ClusterWorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
  const ArgDescriptor ClusterWorkGroupIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
  const ArgDescriptor ClusterWorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
  const ArgDescriptor ClusterWorkGroupMaxIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
  const ArgDescriptor ClusterWorkGroupMaxIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);

  // Helper for values that fold to a compile-time constant.
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(Res: DstReg, Val: N);
    return true;
  };

  if (ST.hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
    bool HasFixedDims = ClusterDims.isFixedDims();

    switch (ArgType) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Arg = &WorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Arg = &WorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Arg = &WorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
      // A fixed cluster dimension of 1 means the id in that dimension is
      // always 0.
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
      // With fixed dims the maximum id is a known constant: dim - 1.
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Arg = &ClusterWorkGroupMaxIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Arg = &ClusterWorkGroupMaxIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Arg = &ClusterWorkGroupMaxIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
      Arg = &ClusterWorkGroupMaxFlatID;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    default:
      break;
    }
  }

  // Anything not resolved above comes from the standard argument info.
  if (!Arg)
    std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
      return LoadConstant(0);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(Res: DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
  return true;
}
4966
4967bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4968 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4969 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4970 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4971 return false;
4972
4973 MI.eraseFromParent();
4974 return true;
4975}
4976
4977static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4978 int64_t C) {
4979 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4980 MI.eraseFromParent();
4981 return true;
4982}
4983
/// Legalize a workitem-id intrinsic for dimension \p Dim.
///
/// If the maximum workitem id in that dimension is known to be 0, the
/// intrinsic folds to the constant 0. Otherwise the preloaded value is
/// loaded; for non-packed arguments an AssertZext with the id's known bit
/// width is added so later passes can exploit the range.
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, C: 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);

  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(Res: DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
    if (!loadInputValue(DstReg: TmpReg, B, ArgType))
      return false;
    // The id is known to fit in bit_width(MaxID) bits; assert that for
    // downstream optimizations.
    B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
  }

  MI.eraseFromParent();
  return true;
}
5023
5024MachinePointerInfo
5025AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
5026 // This isn't really a constant pool but close enough.
5027 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
5028 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
5029 return PtrInfo;
5030}
5031
/// Return a pointer into the kernarg segment, \p Offset bytes from its base.
/// The base comes from the KERNARG_SEGMENT_PTR preloaded value, which is
/// required to be available here.
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
                                                     int64_t Offset) const {
  LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);

  // TODO: If we passed in the base kernel offset we could have a better
  // alignment than 4, but we don't really need it.
  if (!loadInputValue(DstReg: KernArgReg, B,
                      ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    llvm_unreachable("failed to find kernarg segment ptr");

  auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
  return B.buildObjectPtrOffset(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
}
5046
/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
///
/// \param Offset    byte offset of the parameter within the kernarg segment.
/// \param Alignment requested alignment (currently the load is emitted with
///                  Align(4) regardless — see TODO in getKernargParameterPtr).
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  Register DstReg = MI.getOperand(i: 0).getReg();

  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
         "unexpected kernarg parameter type");

  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
  // The kernarg segment is read-only and dereferenceable for the kernel's
  // lifetime, so the load is invariant.
  B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment: Align(4),
              MMOFlags: MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}
5066
5067bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
5068 MachineRegisterInfo &MRI,
5069 MachineIRBuilder &B) const {
5070 Register Dst = MI.getOperand(i: 0).getReg();
5071 LLT DstTy = MRI.getType(Reg: Dst);
5072 LLT S16 = LLT::scalar(SizeInBits: 16);
5073 LLT S32 = LLT::scalar(SizeInBits: 32);
5074 LLT S64 = LLT::scalar(SizeInBits: 64);
5075
5076 if (DstTy == S16)
5077 return legalizeFDIV16(MI, MRI, B);
5078 if (DstTy == S32)
5079 return legalizeFDIV32(MI, MRI, B);
5080 if (DstTy == S64)
5081 return legalizeFDIV64(MI, MRI, B);
5082
5083 return false;
5084}
5085
/// Expand 32-bit unsigned division/remainder of \p X by \p Y.
///
/// Either destination register may be invalid, in which case that result is
/// not produced. The expansion computes a float-based reciprocal estimate,
/// refines it once, and then corrects the quotient/remainder at most twice.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
  auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
  // 0x4f7ffffe is the float just below 2^32, scaling the reciprocal into a
  // 32-bit fixed-point estimate.
  auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
  auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
  auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
  auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
  Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
  auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(Res: S32, Val: 1);
  auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
  if (DstDivReg)
    Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
  R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
  if (DstDivReg)
    B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);

  if (DstRemReg)
    B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
}
5128
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  // Split the 64-bit value into its low and high 32-bit halves.
  auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);

  auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
  auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));

  // Reassemble the value as a float: hi * 2^32 + lo (0x4f800000 == 2^32).
  auto Mad = B.buildFMAD(
      Dst: S32, Src0: CvtHi, // 2**32
      Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);

  auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
  // Scale by 0x5f7ffffc (just under 2^64) to form the fixed-point reciprocal.
  auto Mul1 = B.buildFMul(
      Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 = B.buildFMul(
      Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(
      Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
      Src2: Mul1);

  auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
  auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);

  return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
}
5173
/// Expand 64-bit unsigned division/remainder of \p Numer by \p Denom.
///
/// Either destination register may be invalid, in which case that result is
/// not produced. The expansion starts from a float-derived reciprocal
/// estimate (see emitReciprocalU64), refines it with two Newton-Raphson
/// style steps carried out in 32-bit halves with explicit carry chains, then
/// applies up to two final quotient/remainder corrections via selects.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  Register RcpLo, RcpHi;

  std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);

  auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(Res: S64, Val: 0);
  auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);

  // First refinement step of the reciprocal.
  auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
  auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);

  // 64-bit add performed as 32-bit add + add-with-carry.
  auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
  auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});

  // Second refinement step of the reciprocal.
  auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
  auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);

  auto Zero32 = B.buildConstant(Res: S32, Val: 0);
  auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
  auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
  Register NumerLo = UnmergeNumer.getReg(Idx: 0);
  Register NumerHi = UnmergeNumer.getReg(Idx: 1);

  // Quotient estimate and corresponding remainder Sub1 = Numer - Denom * Q.
  auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
  auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
  Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
  auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
  // "Mi" value: high half before the borrow is applied; feeds the next
  // correction's subtract chain.
  auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
  Register DenomLo = UnmergeDenom.getReg(Idx: 0);
  Register DenomHi = UnmergeDenom.getReg(Idx: 1);

  // C3 != 0 iff Sub1 >= Denom (64-bit compare built from 32-bit halves).
  auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
  auto C1 = B.buildSExt(Res: S32, Op: CmpHi);

  auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
  auto C2 = B.buildSExt(Res: S32, Op: CmpLo);

  auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
  auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
  auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
  auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
  auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(Res: S64, Val: 1);
  auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);

  // C6 != 0 iff Sub2 >= Denom, i.e. a second correction is needed.
  auto C4 =
      B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
  auto C5 =
      B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
  auto C6 = B.buildSelect(
      Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
  auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);

  auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
  auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
  auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
    B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
                  Op0: Sel1, Op1: MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
    B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
                  Op0: Sel2, Op1: Sub1);
  }
}
5285
5286bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5287 MachineRegisterInfo &MRI,
5288 MachineIRBuilder &B) const {
5289 Register DstDivReg, DstRemReg;
5290 switch (MI.getOpcode()) {
5291 default:
5292 llvm_unreachable("Unexpected opcode!");
5293 case AMDGPU::G_UDIV: {
5294 DstDivReg = MI.getOperand(i: 0).getReg();
5295 break;
5296 }
5297 case AMDGPU::G_UREM: {
5298 DstRemReg = MI.getOperand(i: 0).getReg();
5299 break;
5300 }
5301 case AMDGPU::G_UDIVREM: {
5302 DstDivReg = MI.getOperand(i: 0).getReg();
5303 DstRemReg = MI.getOperand(i: 1).getReg();
5304 break;
5305 }
5306 }
5307
5308 const LLT S64 = LLT::scalar(SizeInBits: 64);
5309 const LLT S32 = LLT::scalar(SizeInBits: 32);
5310 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5311 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
5312 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5313 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5314
5315 if (Ty == S32)
5316 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
5317 else if (Ty == S64)
5318 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
5319 else
5320 return false;
5321
5322 MI.eraseFromParent();
5323 return true;
5324}
5325
/// Expand G_SDIV / G_SREM / G_SDIVREM in terms of the unsigned expansion.
///
/// Operands are converted to their absolute values using the identity
/// |x| = (x + sign) ^ sign, where sign is x arithmetically shifted to all
/// sign bits. After the unsigned div/rem, the quotient's sign is restored
/// from LHSign ^ RHSign and the remainder's from LHSign, via the inverse
/// identity (v ^ sign) - sign.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();

  // Sign masks: all-ones if negative, all-zeros otherwise.
  auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
  auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);

  // |x| = (x + sign) ^ sign.
  LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
  RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);

  LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
  RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(i: 0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(i: 0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(i: 0).getReg();
    DstRemReg = MI.getOperand(i: 1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);

  if (DstDivReg) {
    // Quotient is negative iff exactly one operand was negative.
    auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
    auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
    B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
    B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
  }

  MI.eraseFromParent();
  return true;
}
5393
/// Try to lower an fdiv with relaxed-precision rcp, when fast-math flags
/// permit.
///
/// Handles the special cases 1.0 / x -> rcp(x) and -1.0 / x -> rcp(-x)
/// (always allowed for f16; for other types only with afn), and the general
/// x / y -> x * rcp(y) (f32/f64 require afn; f16 additionally accepts arcp).
/// Returns false if the flags do not allow the inaccurate lowering.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Reg: Res);

  bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);

  if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
    // f16 rcp is precise enough to use unconditionally for +/-1.0
    // numerators; other types need afn.
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
      return false;

    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    // the CI documentation has a worst case error of 1 ulp.
    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    // use it as long as we aren't trying to use denormals.
    //
    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(V: 1.0)) {
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
          .addUse(RegNo: RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(V: -1.0)) {
      auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
          .addUse(RegNo: FNeg.getReg(Idx: 0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
                              !MI.getFlag(Flag: MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
                 .addUse(RegNo: RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);

  MI.eraseFromParent();
  return true;
}
5453
/// Lower a 64-bit fdiv as x * rcp(y) with two Newton-Raphson refinements of
/// the reciprocal and one residual correction of the final product. Only
/// allowed when the afn fast-math flag is present, since rcp is inexact.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  Register Y = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Reg: Res);

  bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
  auto One = B.buildFConstant(Res: ResTy, Val: 1.0);

  // Initial reciprocal estimate R ~= 1/Y.
  auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
               .addUse(RegNo: Y)
               .setMIFlags(Flags);

  // Newton-Raphson step: R = R * (1 - Y*R) + R.
  auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
  R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);

  // Second refinement step.
  auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
  R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);

  // Ret = X * R, then correct with the residual X - Y*Ret.
  auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
  auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);

  B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
  MI.eraseFromParent();
  return true;
}
5488
/// Legalize an f16 fdiv. Tries the fast rcp-based path first; otherwise
/// computes the quotient in f32 (rcp estimate plus two error-correction
/// steps), rounds it back to f16, and finishes with v_div_fixup_f16.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(SizeInBits: 16);
  LLT S32 = LLT::scalar(SizeInBits: 32);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
  auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
  auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
  auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
                 .addUse(RegNo: RHSExt.getReg(Idx: 0))
                 .setMIFlags(Flags);
  auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
  MachineInstrBuilder Err;
  // Use fused or unfused multiply-add depending on what the subtarget has.
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
    Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
    Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
  } else {
    Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
    Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
    Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
  }
  auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
  // 0xff800000 masks to sign + exponent, quantizing the correction term.
  Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
  Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
  auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RegNo: RDst.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5547
// Encoded hwreg operand selecting the FP32 denorm-mode field of the MODE
// register (offset 4, width 2), used with S_GETREG/S_SETREG below.
static constexpr unsigned SPDenormModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
5550
5551// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5552// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5553static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5554 const GCNSubtarget &ST,
5555 SIModeRegisterDefaults Mode) {
5556 // Set SP denorm mode to this value.
5557 unsigned SPDenormMode =
5558 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5559
5560 if (ST.hasDenormModeInst()) {
5561 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5562 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5563
5564 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5565 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
5566 .addImm(Val: NewDenormModeValue);
5567
5568 } else {
5569 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
5570 .addImm(Val: SPDenormMode)
5571 .addImm(Val: SPDenormModeBitField);
5572 }
5573}
5574
// Expand f32 G_FDIV with the div_scale / div_fmas / div_fixup sequence. The
// reciprocal refinement requires FP32 denormals, so denorm mode is enabled
// around the FMA chain when the function's mode would otherwise flush, and
// restored afterwards.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Fast-math lowering takes precedence when the instruction flags allow it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg(); // numerator
  Register RHS = MI.getOperand(i: 2).getReg(); // denominator
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  auto One = B.buildFConstant(Res: S32, Val: 1.0f);

  // div_scale pre-scales the operands into a range where the refinement is
  // accurate; the last immediate selects which operand is being scaled.
  auto DenominatorScaled =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
          .addUse(RegNo: LHS)
          .addUse(RegNo: RHS)
          .addImm(Val: 0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
          .addUse(RegNo: LHS)
          .addUse(RegNo: RHS)
          .addImm(Val: 1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
                       .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);

  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    // With a dynamic denorm mode, save the current hardware value so it can
    // be restored exactly after the refinement.
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
      B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
          .addDef(RegNo: SavedSPDenormMode)
          .addImm(Val: SPDenormModeBitField);
    }
    toggleSPDenormMode(Enable: true, B, ST, Mode);
  }

  // Newton-Raphson refinement of the reciprocal and quotient.
  auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
  auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
  auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
  auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
  auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);

  // Restore the previous denorm mode before the final combine.
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
          .addReg(RegNo: SavedSPDenormMode)
          .addImm(Val: SPDenormModeBitField);
    } else
      toggleSPDenormMode(Enable: false, B, ST, Mode);
  }

  // div_fmas combines the refined terms, consuming the scale condition bit
  // produced by the numerator div_scale.
  auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
                  .addUse(RegNo: Fma4.getReg(Idx: 0))
                  .addUse(RegNo: Fma1.getReg(Idx: 0))
                  .addUse(RegNo: Fma3.getReg(Idx: 0))
                  .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
                  .setMIFlags(Flags);

  // div_fixup repairs inf/nan/zero-denominator special cases.
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RegNo: Fmas.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5661
// Expand f64 G_FDIV using div_scale / rcp refinement / div_fmas / div_fixup,
// including a workaround for subtargets where the div_scale condition output
// is unusable.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Fast-math lowering takes precedence when the instruction flags allow it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg(); // numerator
  Register RHS = MI.getOperand(i: 2).getReg(); // denominator

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(SizeInBits: 64);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  auto One = B.buildFConstant(Res: S64, Val: 1.0);

  // Scale the denominator (immediate 0 selects it) into a safe range.
  auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
                       .addUse(RegNo: LHS)
                       .addUse(RegNo: RHS)
                       .addImm(Val: 0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);

  auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
                 .addUse(RegNo: DivScale0.getReg(Idx: 0))
                 .setMIFlags(Flags);

  // Two Newton-Raphson steps refining the reciprocal.
  auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
  auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
  auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);

  // Scale the numerator (immediate 1 selects it).
  auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
                       .addUse(RegNo: LHS)
                       .addUse(RegNo: RHS)
                       .addImm(Val: 1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
  auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
  auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(SizeInBits: 32);

    // Recompute the scale condition from the high halves: scaling happened
    // iff exactly one operand's exponent word matches its scaled counterpart.
    auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
    auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
    auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);

    auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
                              Op1: Scale1Unmerge.getReg(Idx: 1));
    auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
                              Op1: Scale0Unmerge.getReg(Idx: 1));
    Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
  } else {
    Scale = DivScale1.getReg(Idx: 1);
  }

  // div_fmas combines the refined terms under the scale condition.
  auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
                  .addUse(RegNo: Fma4.getReg(Idx: 0))
                  .addUse(RegNo: Fma3.getReg(Idx: 0))
                  .addUse(RegNo: Mul.getReg(Idx: 0))
                  .addUse(RegNo: Scale)
                  .setMIFlags(Flags);

  // div_fixup repairs inf/nan/zero-denominator special cases.
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
      .addUse(RegNo: Fmas.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5742
// Lower G_FFREXP to the amdgcn_frexp_mant / amdgcn_frexp_exp intrinsics.
// The intrinsic exponent type is s16 for s16 inputs and s32 otherwise; the
// result is sign-extended or truncated to the requested exponent register.
bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res0 = MI.getOperand(i: 0).getReg(); // mantissa result
  Register Res1 = MI.getOperand(i: 1).getReg(); // exponent result
  Register Val = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();

  LLT Ty = MRI.getType(Reg: Res0);
  LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);

  auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
                  .addUse(RegNo: Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
                 .addUse(RegNo: Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    // On subtargets with the fract bug, patch non-finite inputs: |Val| < inf
    // is false for inf/nan, in which case return exponent 0 and the input
    // itself as the mantissa.
    auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
    auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
    Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
    Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
  }

  B.buildCopy(Res: Res0, Op: Mant);
  B.buildSExtOrTrunc(Res: Res1, Op: Exp);

  MI.eraseFromParent();
  return true;
}
5777
5778bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5779 MachineRegisterInfo &MRI,
5780 MachineIRBuilder &B) const {
5781 Register Res = MI.getOperand(i: 0).getReg();
5782 Register LHS = MI.getOperand(i: 2).getReg();
5783 Register RHS = MI.getOperand(i: 3).getReg();
5784 uint16_t Flags = MI.getFlags();
5785
5786 LLT S32 = LLT::scalar(SizeInBits: 32);
5787 LLT S1 = LLT::scalar(SizeInBits: 1);
5788
5789 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5790 const APFloat C0Val(1.0f);
5791
5792 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5793 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5794 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5795
5796 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5797 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5798
5799 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5800
5801 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5802 .addUse(RegNo: Mul0.getReg(Idx: 0))
5803 .setMIFlags(Flags);
5804
5805 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5806
5807 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5808
5809 MI.eraseFromParent();
5810 return true;
5811}
5812
5813bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5814 MachineRegisterInfo &MRI,
5815 MachineIRBuilder &B) const {
5816 // Bypass the correct expansion a standard promotion through G_FSQRT would
5817 // get. The f32 op is accurate enough for the f16 cas.
5818 unsigned Flags = MI.getFlags();
5819 assert(!ST.has16BitInsts());
5820 const LLT F32 = LLT::scalar(SizeInBits: 32);
5821 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5822 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5823 .addUse(RegNo: Ext.getReg(Idx: 0))
5824 .setMIFlags(Flags);
5825 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5826 MI.eraseFromParent();
5827 return true;
5828}
5829
// Expand f32 G_FSQRT. Small inputs (< 2^-96) are pre-scaled by 2^32 and the
// result scaled back by 2^-16. When denormal handling is needed, the hardware
// sqrt result is corrected by at most one ulp using neighbor candidates;
// otherwise an rsq-based Goldschmidt refinement is used.
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT I32 = LLT::scalar(SizeInBits: 32);

  // With approximation allowed, the raw hardware sqrt is sufficient.
  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
        .addUse(RegNo: X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // Scale inputs below 2^-96 up by 2^32 to avoid denormal-range inaccuracy.
  auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
  auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
  auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
  if (needsDenormHandlingF32(MF, Src: X, Flags)) {
    // Start from the hardware sqrt and correct it by +/-1 ulp: compute the
    // residual -s*s + x for the next-down and next-up float neighbors and
    // pick whichever candidate brackets the exact result.
    B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
        .addUse(RegNo: SqrtX.getReg(Idx: 0))
        .setMIFlags(Flags);

    // Next representable value down (integer decrement of the bit pattern).
    auto NegOne = B.buildConstant(Res: I32, Val: -1);
    auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);

    // Next representable value up (integer increment of the bit pattern).
    auto PosOne = B.buildConstant(Res: I32, Val: 1);
    auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);

    auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);

    SqrtS =
        B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);

    auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
    SqrtS =
        B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
  } else {
    // Goldschmidt-style refinement seeded by rsq: s = x*r, h = r/2, then one
    // correction step each for h, s, and a final residual fixup of s.
    auto SqrtR =
        B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
    B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);

    auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
    auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
    auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
    auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
    SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
    SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
    auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
    auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
    SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
  }

  // Undo the input scaling: sqrt(2^32) = 2^16, so scale down by 2^-16.
  auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);

  // sqrt(+/-0) and sqrt(+inf) must return the input unchanged.
  auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
  B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}
5910
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT F64 = LLT::scalar(SizeInBits: 64);

  Register Dst = MI.getOperand(i: 0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();

  // Inputs below 2^-767 are scaled up by 2^256 (exponent +256) to keep the
  // refinement out of the denormal range; the result is scaled back by
  // 2^-128 since sqrt halves the exponent.
  auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);

  auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
  auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
  auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
  auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));

  // h0 = 0.5 * y0, g0 = x * y0 (see the scheme above).
  auto Half = B.buildFConstant(Res: F64, Val: 0.5);
  auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
  auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);

  // r0 = 0.5 - h0 * g0
  auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
  auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);

  // g1, h1 refinement.
  auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
  auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);

  // d0 = x - g1 * g1; g2 = d0 * h1 + g1
  auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
  auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);

  auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);

  // d1 = x - g2 * g2; g3 = d1 * h1 + g2
  auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
  auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);

  auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);

  // Scale down the result.
  auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
  auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
  SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
5994
5995bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5996 MachineRegisterInfo &MRI,
5997 MachineIRBuilder &B) const {
5998 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5999 if (Ty == LLT::scalar(SizeInBits: 32))
6000 return legalizeFSQRTF32(MI, MRI, B);
6001 if (Ty == LLT::scalar(SizeInBits: 64))
6002 return legalizeFSQRTF64(MI, MRI, B);
6003 if (Ty == LLT::scalar(SizeInBits: 16))
6004 return legalizeFSQRTF16(MI, MRI, B);
6005 return false;
6006}
6007
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Pre-VI targets keep the intrinsic as-is (see the comment above: expansion
  // is only needed where the instruction was removed).
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Reg: Dst);

  // Only f32 and f64 are supported; other types are rejected.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(SizeInBits: 32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(SizeInBits: 64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
                 .addUse(RegNo: Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp rsq result into [-max_float, +max_float].
  auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
                            B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
6056
6057// TODO: Fix pointer type handling
6058bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
6059 MachineInstr &MI,
6060 Intrinsic::ID IID) const {
6061
6062 MachineIRBuilder &B = Helper.MIRBuilder;
6063 MachineRegisterInfo &MRI = *B.getMRI();
6064
6065 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6066 IID == Intrinsic::amdgcn_permlanex16;
6067 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6068 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6069
6070 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6071 Register Src2, LLT VT) -> Register {
6072 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
6073 switch (IID) {
6074 case Intrinsic::amdgcn_readfirstlane:
6075 case Intrinsic::amdgcn_permlane64:
6076 return LaneOp.getReg(Idx: 0);
6077 case Intrinsic::amdgcn_readlane:
6078 case Intrinsic::amdgcn_set_inactive:
6079 case Intrinsic::amdgcn_set_inactive_chain_arg:
6080 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
6081 case Intrinsic::amdgcn_writelane:
6082 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
6083 case Intrinsic::amdgcn_permlane16:
6084 case Intrinsic::amdgcn_permlanex16: {
6085 Register Src3 = MI.getOperand(i: 5).getReg();
6086 int64_t Src4 = MI.getOperand(i: 6).getImm();
6087 int64_t Src5 = MI.getOperand(i: 7).getImm();
6088 return LaneOp.addUse(RegNo: Src1)
6089 .addUse(RegNo: Src2)
6090 .addUse(RegNo: Src3)
6091 .addImm(Val: Src4)
6092 .addImm(Val: Src5)
6093 .getReg(Idx: 0);
6094 }
6095 case Intrinsic::amdgcn_mov_dpp8:
6096 return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
6097 case Intrinsic::amdgcn_update_dpp:
6098 return LaneOp.addUse(RegNo: Src1)
6099 .addImm(Val: MI.getOperand(i: 4).getImm())
6100 .addImm(Val: MI.getOperand(i: 5).getImm())
6101 .addImm(Val: MI.getOperand(i: 6).getImm())
6102 .addImm(Val: MI.getOperand(i: 7).getImm())
6103 .getReg(Idx: 0);
6104 default:
6105 llvm_unreachable("unhandled lane op");
6106 }
6107 };
6108
6109 Register DstReg = MI.getOperand(i: 0).getReg();
6110 Register Src0 = MI.getOperand(i: 2).getReg();
6111 Register Src1, Src2;
6112 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6113 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6114 Src1 = MI.getOperand(i: 3).getReg();
6115 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6116 Src2 = MI.getOperand(i: 4).getReg();
6117 }
6118 }
6119
6120 LLT Ty = MRI.getType(Reg: DstReg);
6121 unsigned Size = Ty.getSizeInBits();
6122
6123 unsigned SplitSize = 32;
6124 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6125 ST.hasDPALU_DPP() &&
6126 AMDGPU::isLegalDPALU_DPPControl(ST, DC: MI.getOperand(i: 4).getImm()))
6127 SplitSize = 64;
6128
6129 if (Size == SplitSize) {
6130 // Already legal
6131 return true;
6132 }
6133
6134 if (Size < 32) {
6135 Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);
6136
6137 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6138 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
6139
6140 if (IID == Intrinsic::amdgcn_writelane)
6141 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
6142
6143 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6144 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
6145 MI.eraseFromParent();
6146 return true;
6147 }
6148
6149 if (Size % SplitSize != 0)
6150 return false;
6151
6152 LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
6153 bool NeedsBitcast = false;
6154 if (Ty.isVector()) {
6155 LLT EltTy = Ty.getElementType();
6156 unsigned EltSize = EltTy.getSizeInBits();
6157 if (EltSize == SplitSize) {
6158 PartialResTy = EltTy;
6159 } else if (EltSize == 16 || EltSize == 32) {
6160 unsigned NElem = SplitSize / EltSize;
6161 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
6162 } else {
6163 // Handle all other cases via S32/S64 pieces
6164 NeedsBitcast = true;
6165 }
6166 }
6167
6168 SmallVector<Register, 4> PartialRes;
6169 unsigned NumParts = Size / SplitSize;
6170 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
6171 MachineInstrBuilder Src1Parts, Src2Parts;
6172
6173 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6174 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
6175
6176 if (IID == Intrinsic::amdgcn_writelane)
6177 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
6178
6179 for (unsigned i = 0; i < NumParts; ++i) {
6180 Src0 = Src0Parts.getReg(Idx: i);
6181
6182 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6183 Src1 = Src1Parts.getReg(Idx: i);
6184
6185 if (IID == Intrinsic::amdgcn_writelane)
6186 Src2 = Src2Parts.getReg(Idx: i);
6187
6188 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
6189 }
6190
6191 if (NeedsBitcast)
6192 B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
6193 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
6194 else
6195 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
6196
6197 MI.eraseFromParent();
6198 return true;
6199}
6200
6201bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
6202 MachineRegisterInfo &MRI,
6203 MachineIRBuilder &B) const {
6204 uint64_t Offset =
6205 ST.getTargetLowering()->getImplicitParameterOffset(
6206 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
6207 LLT DstTy = MRI.getType(Reg: DstReg);
6208 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
6209
6210 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
6211 if (!loadInputValue(DstReg: KernargPtrReg, B,
6212 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6213 return false;
6214
6215 B.buildObjectPtrOffset(Res: DstReg, Op0: KernargPtrReg,
6216 Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
6217 return true;
6218}
6219
/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(i: 0).getReg();
  Register Pointer = MI.getOperand(i: 2).getReg();
  Register Stride = MI.getOperand(i: 3).getReg();
  Register NumRecords = MI.getOperand(i: 4).getReg();
  Register Flags = MI.getOperand(i: 5).getReg();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S64 = LLT::scalar(SizeInBits: 64);

  // Insert after MI so the new instructions can use its operands' defs.
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    Register Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    LLT PtrIntTy = LLT::scalar(SizeInBits: MRI.getType(Reg: Pointer).getSizeInBits());
    auto PointerInt = B.buildPtrToInt(Dst: PtrIntTy, Src: Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(Res: S64, Op: PointerInt);
    auto NumRecordsLHS = B.buildShl(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 57));
    Register LowHalf = B.buildOr(Dst: S64, Src0: ExtPointer, Src1: NumRecordsLHS).getReg(Idx: 0);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    auto NumRecordsRHS = B.buildLShr(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 7));
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: B.buildConstant(Res: S32, Val: 12));
    auto ExtShiftedStride =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedStride.getReg(Idx: 0)});
    auto ShiftedFlags = B.buildShl(Dst: S32, Src0: Flags, Src1: B.buildConstant(Res: S32, Val: 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedFlags.getReg(Idx: 0)});
    auto CombinedFields = B.buildOr(Dst: S64, Src0: NumRecordsRHS, Src1: ExtShiftedStride);
    Register HighHalf =
        B.buildOr(Dst: S64, Src0: CombinedFields, Src1: ExtShiftedFlags).getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, HighHalf});
  } else {
    // Classic layout: replace the pointer's upper 16 bits with the stride,
    // then append num_records and flags as separate dwords.
    NumRecords = B.buildTrunc(Res: S32, Op: NumRecords).getReg(Idx: 0);
    auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
    auto LowHalf = Unmerge.getReg(Idx: 0);
    auto HighHalf = Unmerge.getReg(Idx: 1);

    auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
    auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
    auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
    auto NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  return true;
}
6280
6281bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6282 MachineRegisterInfo &MRI,
6283 MachineIRBuilder &B) const {
6284 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6285 if (!MFI->isEntryFunction()) {
6286 return legalizePreloadedArgIntrin(MI, MRI, B,
6287 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6288 }
6289
6290 Register DstReg = MI.getOperand(i: 0).getReg();
6291 if (!getImplicitArgPtr(DstReg, MRI, B))
6292 return false;
6293
6294 MI.eraseFromParent();
6295 return true;
6296}
6297
6298bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6299 MachineRegisterInfo &MRI,
6300 MachineIRBuilder &B) const {
6301 Function &F = B.getMF().getFunction();
6302 std::optional<uint32_t> KnownSize =
6303 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6304 if (KnownSize.has_value())
6305 B.buildConstant(Res: DstReg, Val: *KnownSize);
6306 return false;
6307}
6308
6309bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6310 MachineRegisterInfo &MRI,
6311 MachineIRBuilder &B) const {
6312
6313 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6314 if (!MFI->isEntryFunction()) {
6315 return legalizePreloadedArgIntrin(MI, MRI, B,
6316 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6317 }
6318
6319 Register DstReg = MI.getOperand(i: 0).getReg();
6320 if (!getLDSKernelId(DstReg, MRI, B))
6321 return false;
6322
6323 MI.eraseFromParent();
6324 return true;
6325}
6326
// Lower llvm.amdgcn.is.shared / llvm.amdgcn.is.private: decide whether a
// 64-bit flat pointer lies in \p AddrSpace by examining its high 32 bits.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  // Split the flat pointer into two s32 halves; only the high half carries
  // the aperture information.
  auto Unmerge = B.buildUnmerge(Res: S32, Op: MI.getOperand(i: 2).getReg());
  Register Hi32 = Unmerge.getReg(Idx: 1);

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
      ST.hasGloballyAddressableScratch()) {
    // With globally addressable scratch, private addresses are offsets from a
    // 64-bit flat scratch base instead of a fixed 32-bit aperture value, so
    // read the base's high half from its dedicated source register.
    Register FlatScratchBaseHi =
        B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
                     SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(Idx: 0);
    MRI.setRegClass(Reg: FlatScratchBaseHi, RC: &AMDGPU::SReg_32RegClass);
    // Test bits 63..58 against the aperture address.
    Register XOR = B.buildXor(Dst: S32, Src0: Hi32, Src1: FlatScratchBaseHi).getReg(Idx: 0);
    B.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: MI.getOperand(i: 0), Op0: XOR,
                Op1: B.buildConstant(Res: S32, Val: 1u << 26));
  } else {
    // Otherwise the pointer is in AddrSpace iff the high half equals that
    // segment's aperture value.
    Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
    B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
  }
  MI.eraseFromParent();
  return true;
}
6353
6354// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6355// offset (the offset that is included in bounds checking and swizzling, to be
6356// split between the instruction's voffset and immoffset fields) and soffset
6357// (the offset that is excluded from bounds checking and swizzling, to go in
6358// the instruction's soffset field). This function takes the first kind of
6359// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value that fits the instruction's immediate offset field.
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
  // being added, so we can only safely match a 32-bit addition with no unsigned
  // overflow.
  bool CheckNUW = ST.hasGFX1250Insts();
  // Peel a constant addend off the offset: BaseReg goes in voffset, ImmOffset
  // in the immediate field.
  std::tie(args&: BaseReg, args&: ImmOffset) = AMDGPU::getBaseWithConstantOffset(
      MRI, Reg: OrigOffset, /*KnownBits=*/ValueTracking: nullptr, CheckNUW);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(Reg: BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Negative overflow: move the whole constant into the register part.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
    } else {
      auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
      BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
    }
  }

  // The whole offset fit in the immediate, but the voffset field still needs
  // a register operand, so materialize a zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);

  return std::pair(BaseReg, ImmOffset);
}
6409
/// Handle register layout difference for f16 images for some subtargets.
/// Returns a register holding \p Reg repacked into the layout the subtarget
/// expects for 16-bit element stores (\p ImageStore selects the image-store
/// specific workaround paths).
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each s16 element lives in the low half of its own
    // 32-bit register, so any-extend every element to s32.
    auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
        .getReg(Idx: 0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Workaround: widen packed D16 image-store data to a full-dword vector.
    if (StoreVT.getNumElements() == 2) {
      // v2s16 -> <2 x s32> with the data in element 0 and undef padding.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
      PackedRegs.push_back(Elt: Reg);
      PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
      return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
          .getReg(Idx: 0);
    }

    if (StoreVT.getNumElements() == 3) {
      // v3s16 -> pad with undef s16 to v6s16, then view as <3 x s32>.
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
      PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
      Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
      return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
    }

    if (StoreVT.getNumElements() == 4) {
      // v4s16 -> <4 x s32> with the two packed dwords first, undef padding.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
      auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
      PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
      return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
          .getReg(Idx: 0);
    }

    llvm_unreachable("invalid data type");
  }

  // Packed layout: only v3s16 needs adjustment, padded to the legal v4s16.
  if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
    Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
              .getReg(Idx: 0);
  }
  return Reg;
}
6473
6474Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6475 Register VData, LLT MemTy,
6476 bool IsFormat) const {
6477 MachineRegisterInfo *MRI = B.getMRI();
6478 LLT Ty = MRI->getType(Reg: VData);
6479
6480 const LLT S16 = LLT::scalar(SizeInBits: 16);
6481
6482 // Fixup buffer resources themselves needing to be v4i128.
6483 if (hasBufferRsrcWorkaround(Ty))
6484 return castBufferRsrcToV4I32(Pointer: VData, B);
6485
6486 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6487 Ty = getBitcastRegisterType(Ty);
6488 VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
6489 }
6490 // Fixup illegal register types for i8 stores.
6491 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
6492 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
6493 return AnyExt;
6494 }
6495
6496 if (Ty.isVector()) {
6497 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6498 if (IsFormat)
6499 return handleD16VData(B, MRI&: *MRI, Reg: VData);
6500 }
6501 }
6502
6503 return VData;
6504}
6505
// Lower a raw/struct (t)buffer store intrinsic to the corresponding
// G_AMDGPU_*BUFFER_STORE* pseudo with the normalized operand list:
//   vdata, rsrc, vindex, voffset, soffset, offset(imm), [format(imm),]
//   cachepolicy(imm), idxen(imm), MMO.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              LegalizerHelper &Helper,
                                              bool IsTyped,
                                              bool IsFormat) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VData = MI.getOperand(i: 1).getReg();
  LLT Ty = MRI.getType(Reg: VData);
  LLT EltTy = Ty.getScalarType();
  // D16 variants only exist for the format/typed opcodes.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();
  LLT MemTy = MMO->getMemoryType();

  // Normalize the data register (widening, bitcasts, D16 repacking).
  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);

  castBufferRsrcArgToV4I32(MI, B, Idx: 2);
  Register RSrc = MI.getOperand(i: 2).getReg();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex operand; the pseudo still needs one.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();

  // Split the offset between the voffset register and the immediate field.
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Untyped sub-dword stores select on the memory size, not the register
    // type (which was already widened to s32).
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opcode: Opc)
    .addUse(RegNo: VData)              // vdata
    .addUse(RegNo: RSrc)               // rsrc
    .addUse(RegNo: VIndex)             // vindex
    .addUse(RegNo: VOffset)            // voffset
    .addUse(RegNo: SOffset)            // soffset
    .addImm(Val: ImmOffset);          // offset(imm)

  if (IsTyped)
    MIB.addImm(Val: Format);

  MIB.addImm(Val: AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6596
6597static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6598 Register VIndex, Register VOffset, Register SOffset,
6599 unsigned ImmOffset, unsigned Format,
6600 unsigned AuxiliaryData, MachineMemOperand *MMO,
6601 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6602 auto MIB = B.buildInstr(Opcode: Opc)
6603 .addDef(RegNo: LoadDstReg) // vdata
6604 .addUse(RegNo: RSrc) // rsrc
6605 .addUse(RegNo: VIndex) // vindex
6606 .addUse(RegNo: VOffset) // voffset
6607 .addUse(RegNo: SOffset) // soffset
6608 .addImm(Val: ImmOffset); // offset(imm)
6609
6610 if (IsTyped)
6611 MIB.addImm(Val: Format);
6612
6613 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6614 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6615 .addMemOperand(MMO);
6616}
6617
// Lower a raw/struct (t)buffer load intrinsic to the corresponding
// G_AMDGPU_*BUFFER_LOAD* pseudo, normalizing the operand list and repacking
// the result for TFE (extra status dword), sub-dword loads, and unpacked D16
// subtargets.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  Register Dst = MI.getOperand(i: 0).getReg();

  // TFE (texture-fail-enable) variants produce a second def holding the
  // status dword; it shifts every subsequent operand index by one.
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(i: 1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
  Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; the pseudo still needs one.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Reg: Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Observer.changingInstr(MI);
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Observer.changingInstr(MI);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the offset between the voffset register and the immediate field.
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    // Untyped sub-dword loads select on the memory size.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // TFE loads return the value dwords plus one trailing status dword; load
    // into a wide temporary, then split off the status and repack the value.
    unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      // Sub-dword value: one data dword to truncate, plus the status dword.
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
      B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
      B.buildTrunc(Res: Dst, Op: ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
    } else {
      // Multi-dword value: unmerge everything, peel off the status dword,
      // and re-merge the remaining dwords into the destination type.
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
      LoadElts.push_back(Elt: StatusDst);
      B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
      LoadElts.truncate(N: NumValueDWords);
      B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Sub-dword result: load into a full s32 and truncate afterwards.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: Dst, Op: LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked D16 subtarget: the hardware returns one 16-bit element per
    // dword, so load the widened vector and repack it element by element.
    LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
    B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
  } else {
    // Result type matches the load directly; no repacking needed.
    buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
6778
/// Map a buffer atomic intrinsic ID to the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode. The four addressing variants of
/// each operation (raw, raw.ptr, struct, struct.ptr) all share one pseudo;
/// their operand-layout differences are handled by the caller.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
6875
// Lower a raw/struct buffer atomic intrinsic to its G_AMDGPU_BUFFER_ATOMIC_*
// pseudo with the normalized operand order: dst, vdata, [cmp,] rsrc, vindex,
// voffset, soffset, offset(imm), cachepolicy(imm), idxen(imm), MMO.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  // cmpswap carries one extra value operand (the comparison value), which
  // shifts all subsequent operand indices by one.
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(i: 0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself.
  Register VData = MI.getOperand(i: 2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(i: 3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
  Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; the pseudo still needs one.
    VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the offset between the voffset register and the immediate field.
  unsigned ImmOffset;
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
                 .addDef(RegNo: Dst)
                 .addUse(RegNo: VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(RegNo: CmpVal);

  MIB.addUse(RegNo: RSrc)               // rsrc
     .addUse(RegNo: VIndex)             // vindex
     .addUse(RegNo: VOffset)            // voffset
     .addUse(RegNo: SOffset)            // soffset
     .addImm(Val: ImmOffset)          // offset(imm)
     .addImm(Val: AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6940
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements. Pairs of adjacent 16-bit address
/// components are packed into one v2s16 register each; components that must
/// stay 32-bit (per the A16/G16 flags) are bitcast/kept as-is. Results are
/// appended to \p PackedAddrs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // This operand stays 32-bit if it precedes the gradients, or is a
    // gradient/coordinate whose corresponding G16/A16 mode is off.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(Reg: AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
                .getReg(Idx: 0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
        PackedAddrs.push_back(Elt: AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
                .getReg(Idx: 0));
      } else {
        // Pack this component together with the next one, consuming both.
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(
                 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
                .getReg(Idx: 0));
        ++I;
      }
    }
  }
}
7001
7002/// Convert from separate vaddr components to a single vector address register,
7003/// and replace the remaining operands with $noreg.
7004static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
7005 int DimIdx, int NumVAddrs) {
7006 const LLT S32 = LLT::scalar(SizeInBits: 32);
7007 (void)S32;
7008 SmallVector<Register, 8> AddrRegs;
7009 for (int I = 0; I != NumVAddrs; ++I) {
7010 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
7011 if (SrcOp.isReg()) {
7012 AddrRegs.push_back(Elt: SrcOp.getReg());
7013 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7014 }
7015 }
7016
7017 int NumAddrRegs = AddrRegs.size();
7018 if (NumAddrRegs != 1) {
7019 auto VAddr =
7020 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
7021 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
7022 }
7023
7024 for (int I = 1; I != NumVAddrs; ++I) {
7025 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
7026 if (SrcOp.isReg())
7027 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
7028 }
7029}
7030
7031/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7032///
7033/// Depending on the subtarget, load/store with 16-bit element data need to be
7034/// rewritten to use the low half of 32-bit registers, or directly use a packed
7035/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7036/// registers.
7037///
7038/// We don't want to directly select image instructions just yet, but also want
7039/// to exposes all register repacking to the legalizer/combiners. We also don't
7040/// want a selected instruction entering RegBankSelect. In order to avoid
7041/// defining a multitude of intermediate image instructions, directly hack on
7042/// the intrinsic's arguments. In cases like a16 addresses, this requires
7043/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  // Explicit arguments start after the defs plus one more operand (presumably
  // the intrinsic ID — matches how other legalizers here strip it). Two defs
  // mean a TFE (texture fail enable) status result is also returned.
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);

  unsigned DMask = 0;
  Register VData;
  LLT Ty;

  // Grab the data register (the result def for returning operations, or the
  // stored value when there are no defs) so its type can drive the d16
  // handling below.
  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
    VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(Reg: VData);
  }

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  // Packed 16-bit atomics keep their 16-bit vector data and are deliberately
  // excluded from the d16 handling.
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(Value: DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(Res: MI.getOperand(i: 0));
      MI.eraseFromParent();
      return true;
    }
  }

  // Everything below mutates MI in place; the scope_exit guarantees the
  // observer sees changedInstr on every exit path.
  Observer.changingInstr(MI);
  scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  else if (BaseOpcode->NoReturn)
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(Opcode: NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(i: 2).getReg();
    LLT Ty = MRI->getType(Reg: VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(i: 3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
      auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
      MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
      // The second data operand is no longer needed after packing.
      MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
    // instructions expect VGPR_32
    SmallVector<Register, 4> PackedRegs;

    packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);

    // See also below in the non-a16 branch
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // Pack registers that would go over NSAMaxSize into last VAddr register
      LLT PackedAddrTy =
          LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
      auto Concat = B.buildConcatVectors(
          Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
      PackedRegs.resize(N: NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      // Non-NSA: all addresses collapse into one contiguous vector register.
      LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
      auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
      PackedRegs[0] = Concat.getReg(Idx: 0);
      PackedRegs.resize(N: 1);
    }

    // Replace the original vaddr operands with the packed registers, padding
    // any leftover slots with $noreg.
    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // Partial NSA is allowed on GFX11+ where the final register is a contiguous
    // set of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
                               NumVAddrs: Intr->NumVAddrs);
    }
  }

  // Append the a16/g16 controls as a trailing flags immediate:
  // bit 0 = a16, bit 1 = g16.
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));

  if (BaseOpcode->NoReturn) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    // Repack d16 store data into the layout the instruction expects; only
    // rewrite the operand if repacking actually produced a new register.
    Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
    if (RepackedReg != VData) {
      MI.getOperand(i: 1).setReg(RepackedReg);
    }

    return true;
  }

  // From here on we are fixing up the result of a returning operation.
  Register DstReg = MI.getOperand(i: 0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomic instructions are using DMask to specify how many bits
  // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
  // DMaskLanes for image atomic has default value '0'.
  // We must be sure that atomic variants (especially packed) will not be
  // truncated from v2s16 or v4s16 to s16 type.
  //
  // ChangeElementCount will be needed for image load where Ty is always scalar.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
    TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
    RegTy = S32;
  } else {
    // Round the data size up to a whole number of dwords.
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
    TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);

  MI.getOperand(i: 0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(i: 1).getReg();
    if (MRI->getType(Reg: Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(OpNo: 1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  // Pre-fill every slot with Dst1Reg so the trailing TFE slot (if any) already
  // names the status destination; data slots are overwritten below.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
    B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(N: NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
    }
  }

  // Append NumElts copies of an undef of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Elt: Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
        NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
      } else {
        return false;
      }
    }

    // Trim or undef-pad the wide result down/up to the v3s16 destination.
    if (MRI->getType(Reg: DstReg).getNumElements() <
        MRI->getType(Reg: NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
  return true;
}
7442
bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(i: 0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(Reg: OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  // Sub-dword loads use dedicated 8-/16-bit opcodes when available; these
  // write a full 32-bit register, truncated back to the original type below.
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
    // destination register.
    Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
    // Re-anchor the insertion point at MI after rewriting the result.
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opcode: Opc));
  MI.removeOperand(OpNo: 1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  const unsigned MemSize = (Size + 7) / 8;  // Size in bytes, rounded up.
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: MemSize, BaseAlignment: MemAlign);
  MI.addMemOperand(MF, MO: MMO);
  // Subword case: truncate the wide 32-bit load result back down to the
  // original narrow destination after the load instruction.
  if (Dst != OrigDst) {
    MI.getOperand(i: 0).setReg(Dst);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: OrigDst, Op: Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
    else
      Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
  }

  Observer.changedInstr(MI);
  return true;
}
7514
7515bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7516 MachineInstr &MI) const {
7517 MachineIRBuilder &B = Helper.MIRBuilder;
7518 GISelChangeObserver &Observer = Helper.Observer;
7519 Observer.changingInstr(MI);
7520 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7521 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
7522 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
7523 Observer.changedInstr(MI);
7524 return true;
7525}
7526
7527// TODO: Move to selection
7528bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7529 MachineRegisterInfo &MRI,
7530 MachineIRBuilder &B) const {
7531 if (!ST.hasTrapHandler() ||
7532 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7533 return legalizeTrapEndpgm(MI, MRI, B);
7534
7535 return ST.supportsGetDoorbellID() ?
7536 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7537}
7538
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  // If the trap is the last instruction of a block with no successors, we can
  // simply append s_endpgm in place.
  if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
    BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
      .addImm(Val: 0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the
  // end of the block.
  BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(MBB: TrapBB);
  // Put s_endpgm in a dedicated trap block, and branch to it from the original
  // block while EXEC is non-zero.
  BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
    .addImm(Val: 0);
  BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(MBB: TrapBB);

  BB.addSuccessor(Succ: TrapBB);
  MI.eraseFromParent();
  return true;
}
7567
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  // The trap handler consumes the queue pointer in SGPR0:SGPR1 (see the
  // trap-handler ABI reference linked below).
  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

    if (!loadInputValue(DstReg: KernargPtrReg, B,
                        ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
    // Invariant, dereferenceable s64 load of the queue pointer out of the
    // kernarg segment at the implicit-parameter offset.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo: PtrInfo.getWithOffset(O: Offset),
        f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
    B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
                           Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
    // Load address
    Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
    // Hand the loaded queue pointer to the trap handler as an implicit use.
    B.buildCopy(Res: SGPR01, Op: Temp);
    B.buildInstr(Opcode: AMDGPU::S_TRAP)
        .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(Res: SGPR01, Op: LiveIn);
  B.buildInstr(Opcode: AMDGPU::S_TRAP)
      .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(RegNo: SGPR01, Flags: RegState::Implicit);

  MI.eraseFromParent();
  return true;
}
7627
7628bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7629 MachineRegisterInfo &MRI,
7630 MachineIRBuilder &B) const {
7631 // We need to simulate the 's_trap 2' instruction on targets that run in
7632 // PRIV=1 (where it is treated as a nop).
7633 if (ST.hasPrivEnabledTrap2NopBug()) {
7634 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7635 DL: MI.getDebugLoc());
7636 MI.eraseFromParent();
7637 return true;
7638 }
7639
7640 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7641 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7642 MI.eraseFromParent();
7643 return true;
7644}
7645
7646bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7647 MachineRegisterInfo &MRI,
7648 MachineIRBuilder &B) const {
7649 // Is non-HSA path or trap-handler disabled? Then, report a warning
7650 // accordingly
7651 if (!ST.hasTrapHandler() ||
7652 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7653 Function &Fn = B.getMF().getFunction();
7654 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7655 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7656 } else {
7657 // Insert debug-trap instruction
7658 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7659 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7660 }
7661
7662 MI.eraseFromParent();
7663 return true;
7664}
7665
bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);

  // Intrinsic operand layout: dst, intrinsic ID, then node_ptr, ray_extent,
  // ray_origin, ray_dir, ray_inv_dir, and the texture descriptor.
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register NodePtr = MI.getOperand(i: 2).getReg();
  Register RayExtent = MI.getOperand(i: 3).getReg();
  Register RayOrigin = MI.getOperand(i: 4).getReg();
  Register RayDir = MI.getOperand(i: 5).getReg();
  Register RayInvDir = MI.getOperand(i: 6).getReg();
  Register TDescr = MI.getOperand(i: 7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    Function &Fn = B.getMF().getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
  // a16 variant if the ray direction uses 16-bit elements; 64-bit variant if
  // the node pointer is 64 bits wide.
  const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  // Indexed [Is64][IsA16].
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                   MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                   MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    // GFX11+ NSA form: each xyz component group becomes a single v3s32
    // operand rather than three separate dwords.
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
      auto Merged = B.buildMergeLikeInstr(
          Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
      Ops.push_back(Elt: Merged.getReg(Idx: 0));
    };

    Ops.push_back(Elt: NodePtr);
    Ops.push_back(Elt: RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      // a16: interleave inv_dir and dir halfwords so each 32-bit lane holds
      // { inv_dir[i], dir[i] }.
      auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          Res: V3S32,
          Ops: {B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
                                                 UnmergeRayDir.getReg(Idx: 0)}))
               .getReg(Idx: 0),
           B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
                                                 UnmergeRayDir.getReg(Idx: 1)}))
               .getReg(Idx: 0),
           B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
                                                 UnmergeRayDir.getReg(Idx: 2)}))
               .getReg(Idx: 0)});
      Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    // Pre-GFX11 layout: every vaddr dword is an individual operand.
    if (Is64) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
      Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
    } else {
      Ops.push_back(Elt: NodePtr);
    }
    Ops.push_back(Elt: RayExtent);

    // Push the three 32-bit lanes of a vector operand as separate dwords.
    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
      Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      // a16: pack the six 16-bit dir/inv_dir components into three dwords
      // in order dir.xy, { dir.z, inv_dir.x }, inv_dir.yz.
      auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
      Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
      Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
      B.buildMergeLikeInstr(Res: R1,
                            Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
      B.buildMergeLikeInstr(
          Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
      B.buildMergeLikeInstr(
          Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
      Ops.push_back(Elt: R1);
      Ops.push_back(Elt: R2);
      Ops.push_back(Elt: R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
    Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
    Ops.clear();
    Ops.push_back(Elt: MergedOps);
  }

  auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
    .addDef(RegNo: DstReg)
    .addImm(Val: Opcode);

  for (Register R : Ops) {
    MIB.addUse(RegNo: R);
  }

  // Trailing operands: the texture descriptor and an a16 flag immediate.
  MIB.addUse(RegNo: TDescr)
     .addImm(Val: IsA16 ? 1 : 0)
     .cloneMemRefs(OtherMI: MI);

  MI.eraseFromParent();
  return true;
}
7817
7818bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7819 MachineInstr &MI, MachineIRBuilder &B) const {
7820 const LLT S32 = LLT::scalar(SizeInBits: 32);
7821 const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
7822
7823 Register DstReg = MI.getOperand(i: 0).getReg();
7824 Register DstOrigin = MI.getOperand(i: 1).getReg();
7825 Register DstDir = MI.getOperand(i: 2).getReg();
7826 Register NodePtr = MI.getOperand(i: 4).getReg();
7827 Register RayExtent = MI.getOperand(i: 5).getReg();
7828 Register InstanceMask = MI.getOperand(i: 6).getReg();
7829 Register RayOrigin = MI.getOperand(i: 7).getReg();
7830 Register RayDir = MI.getOperand(i: 8).getReg();
7831 Register Offsets = MI.getOperand(i: 9).getReg();
7832 Register TDescr = MI.getOperand(i: 10).getReg();
7833
7834 if (!ST.hasBVHDualAndBVH8Insts()) {
7835 Function &Fn = B.getMF().getFunction();
7836 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7837 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7838 return false;
7839 }
7840
7841 bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
7842 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7843 const unsigned NumVDataDwords = 10;
7844 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7845 int Opcode = AMDGPU::getMIMGOpcode(
7846 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7847 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7848 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7849 assert(Opcode != -1);
7850
7851 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7852 Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});
7853
7854 B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7855 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7856 .addDef(RegNo: DstReg)
7857 .addDef(RegNo: DstOrigin)
7858 .addDef(RegNo: DstDir)
7859 .addImm(Val: Opcode)
7860 .addUse(RegNo: NodePtr)
7861 .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
7862 .addUse(RegNo: RayOrigin)
7863 .addUse(RegNo: RayDir)
7864 .addUse(RegNo: Offsets)
7865 .addUse(RegNo: TDescr)
7866 .cloneMemRefs(OtherMI: MI);
7867
7868 MI.eraseFromParent();
7869 return true;
7870}
7871
7872bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7873 MachineIRBuilder &B) const {
7874 const SITargetLowering *TLI = ST.getTargetLowering();
7875 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7876 Register DstReg = MI.getOperand(i: 0).getReg();
7877 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7878 MI.eraseFromParent();
7879 return true;
7880}
7881
7882bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7883 MachineIRBuilder &B) const {
7884 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7885 if (!ST.hasArchitectedSGPRs())
7886 return false;
7887 LLT S32 = LLT::scalar(SizeInBits: 32);
7888 Register DstReg = MI.getOperand(i: 0).getReg();
7889 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7890 auto LSB = B.buildConstant(Res: S32, Val: 25);
7891 auto Width = B.buildConstant(Res: S32, Val: 5);
7892 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7893 MI.eraseFromParent();
7894 return true;
7895}
7896
7897bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7898 MachineIRBuilder &B,
7899 AMDGPU::Hwreg::Id HwReg,
7900 unsigned LowBit,
7901 unsigned Width) const {
7902 MachineRegisterInfo &MRI = *B.getMRI();
7903 Register DstReg = MI.getOperand(i: 0).getReg();
7904 if (!MRI.getRegClassOrNull(Reg: DstReg))
7905 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32RegClass);
7906 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
7907 .addDef(RegNo: DstReg)
7908 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width));
7909 MI.eraseFromParent();
7910 return true;
7911}
7912
// s_getreg/s_setreg field encoding for bits [22:0] of the MODE register,
// which holds the floating-point environment mode state.
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);

// s_getreg/s_setreg field encoding for bits [4:0] of the TRAPSTS register,
// the FP exception-status portion of the environment.
static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
7918
7919bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7920 MachineRegisterInfo &MRI,
7921 MachineIRBuilder &B) const {
7922 Register Src = MI.getOperand(i: 0).getReg();
7923 if (MRI.getType(Reg: Src) != S64)
7924 return false;
7925
7926 auto ModeReg =
7927 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7928 /*HasSideEffects=*/true, /*isConvergent=*/false)
7929 .addImm(Val: FPEnvModeBitField);
7930 auto TrapReg =
7931 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7932 /*HasSideEffects=*/true, /*isConvergent=*/false)
7933 .addImm(Val: FPEnvTrapBitField);
7934 B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
7935 MI.eraseFromParent();
7936 return true;
7937}
7938
7939bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7940 MachineRegisterInfo &MRI,
7941 MachineIRBuilder &B) const {
7942 Register Src = MI.getOperand(i: 0).getReg();
7943 if (MRI.getType(Reg: Src) != S64)
7944 return false;
7945
7946 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7947 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7948 /*HasSideEffects=*/true, /*isConvergent=*/false)
7949 .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
7950 .addReg(RegNo: Unmerge.getReg(Idx: 0));
7951 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7952 /*HasSideEffects=*/true, /*isConvergent=*/false)
7953 .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
7954 .addReg(RegNo: Unmerge.getReg(Idx: 1));
7955 MI.eraseFromParent();
7956 return true;
7957}
7958
// Legalize a target intrinsic instruction. Dispatches on the intrinsic ID and
// either rewrites the instruction in place, replaces it with target pseudos /
// generic loads and stores, or defers to a dedicated legalize* helper.
// Returns true if the intrinsic was handled (or needs no work), false to
// report a legalization failure.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::sponentry:
    if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
      // FIXME: The imported pattern checks for i32 instead of p5; if we fix
      // that we can remove this cast.
      const LLT S32 = LLT::scalar(SizeInBits: 32);
      Register TmpReg = MRI.createGenericVirtualRegister(Ty: S32);
      B.buildInstr(Opcode: AMDGPU::G_AMDGPU_SPONENTRY).addDef(RegNo: TmpReg);

      Register DstReg = MI.getOperand(i: 0).getReg();
      B.buildIntToPtr(Dst: DstReg, Src: TmpReg);
      MI.eraseFromParent();
    } else {
      // Not the function entry frame: materialize a fixed stack object at
      // offset 0 instead.
      int FI = B.getMF().getFrameInfo().CreateFixedObject(
          Size: 1, SPOffset: 0, /*IsImmutable=*/false);
      B.buildFrameIndex(Res: MI.getOperand(i: 0), Idx: FI);
      MI.eraseFromParent();
    }
    return true;
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    // Structurized control flow: rewrite the paired G_BRCOND into SI_IF /
    // SI_ELSE pseudos that manipulate exec.
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(i: 1).getReg();
      Register Use = MI.getOperand(i: 3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();

      if (Negated)
        std::swap(a&: CondBrTarget, b&: UncondBrTarget);

      B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(Opcode: AMDGPU::SI_IF)
          .addDef(RegNo: Def)
          .addUse(RegNo: Use)
          .addMBB(MBB: UncondBrTarget);
      } else {
        B.buildInstr(Opcode: AMDGPU::SI_ELSE)
            .addDef(RegNo: Def)
            .addUse(RegNo: Use)
            .addMBB(MBB: UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(i: 0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(Dest&: *CondBrTarget);
      }

      MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
      MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    // Same scheme as amdgcn_if/else: rewrite the G_BRCOND into the SI_LOOP
    // exec-mask loop pseudo.
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
      Register Reg = MI.getOperand(i: 2).getReg();

      if (Negated)
        std::swap(a&: CondBrTarget, b&: UncondBrTarget);

      B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
      B.buildInstr(Opcode: AMDGPU::SI_LOOP)
        .addUse(RegNo: Reg)
        .addMBB(MBB: UncondBrTarget);

      if (Br)
        Br->getOperand(i: 0).setMBB(CondBrTarget);
      else
        B.buildBr(Dest&: *CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(F: B.getMF().getFunction())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // Work-item / work-group / cluster ID queries lower to preloaded argument
  // reads (or hardware-register reads for the flat cluster IDs below).
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
                                       ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
                                       ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
                                       ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizeWorkGroupId(
        MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
        ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
        ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizeWorkGroupId(
        MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
        ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
        ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizeWorkGroupId(
        MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
        ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
        ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
  // Cluster intrinsics are only legal on subtargets with clusters; otherwise
  // report a legalization failure (&& short-circuits to false).
  case Intrinsic::amdgcn_cluster_id_x:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_cluster_id_y:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_cluster_id_z:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    // Flat cluster workgroup ID is read from a hardware register field.
    return ST.hasClusters() &&
           legalizeConstHwRegRead(MI, B, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return ST.hasClusters() &&
           legalizePreloadedArgIntrin(
               MI, MRI, B,
               ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
  // R600 legacy queries read from the kernel argument segment at fixed
  // offsets.
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       Offset: SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       Offset: SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       Offset: SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
    // TODO: Could insert G_ASSERT_ZEXT from s16
  case Intrinsic::r600_read_local_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // Buffer load/store variants: the IsFormat/IsTyped flags select between the
  // plain, format, and typed (tbuffer) encodings.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
    return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
  // All buffer atomic variants share a single legalization path; the
  // intrinsic ID is forwarded so the helper can pick the atomic operation.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IID: IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntersectRayIntrinsic(MI, B);
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:
    return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
  // SWMMAC: the sparse-index operand (operand 5 or 7 depending on variant)
  // must have a specific scalar type; bitcast vector indices, any-extend
  // narrower scalars.
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    Register Index = MI.getOperand(i: 5).getReg();
    LLT S64 = LLT::scalar(SizeInBits: 64);
    LLT IndexArgTy = MRI.getType(Reg: Index);
    if (IndexArgTy != S64) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: S64, Src: Index)
                                            : B.buildAnyExt(Res: S64, Op: Index);
      MI.getOperand(i: 5).setReg(NewIndex.getReg(Idx: 0));
    }
    return true;
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(i: 5).getReg();
    LLT S32 = LLT::scalar(SizeInBits: 32);
    if (MRI.getType(Reg: Index) != S32)
      MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(i: 7).getReg();
    LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
                    ? LLT::scalar(SizeInBits: 64)
                    : LLT::scalar(SizeInBits: 32);
    LLT IndexArgTy = MRI.getType(Reg: Index);
    if (IndexArgTy != IdxTy) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: IdxTy, Src: Index)
                                            : B.buildAnyExt(Res: IdxTy, Op: Index);
      MI.getOperand(i: 7).setReg(NewIndex.getReg(Idx: 0));
    }
    return true;
  }

  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to workaround the inability of tablegen match combiners to
    // match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(OpNo: 1);
    Observer.changedInstr(MI);
    return true;
  }
  // Cross-lane operations share a common legalization.
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IID: IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  case Intrinsic::amdgcn_dead: {
    // TODO: Use poison instead of undef
    for (const MachineOperand &Def : MI.defs())
      B.buildUndef(Res: Def);
    MI.eraseFromParent();
    return true;
  }
  // Cooperative atomics lower to plain generic load/store carrying the
  // memory operand the IRTranslator attached.
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(Res: MI.getOperand(i: 0), Addr: MI.getOperand(i: 2), MMO&: **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(Val: MI.getOperand(i: 2), Addr: MI.getOperand(i: 1), MMO&: **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(Opcode: AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
        .add(MO: MI.getOperand(i: 0))
        .add(MO: MI.getOperand(i: 2))
        .addMemOperand(MMO: *MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(Opcode: AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
        .add(MO: MI.getOperand(i: 0))
        .add(MO: MI.getOperand(i: 2))
        .addMemOperand(MMO: *MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  default: {
    // Image dimension intrinsics get their own legalization; anything else
    // is legal as-is.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
      return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
    return true;
  }
  }

  return true;
}
8444