//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
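
// Worked examples for the two rounding helpers above: getPow2VectorType turns
// <3 x s16> into <4 x s16>, and getPow2ScalarType turns s48 into s64. Inputs
// whose count or size is already a power of two are left unchanged in size.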

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
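
// As examples of the predicate above: <3 x s16> (48 bits) and <5 x s16>
// (80 bits) qualify and are widened by one element, while <3 x s32> (a
// multiple of 32 bits) and any vector of s1 do not.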

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}
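
// For example, <4 x s32> (128 bits) gives Pieces = 2 and is narrowed to
// <2 x s32>, and <4 x s64> (256 bits) gives Pieces = 4 and collapses to the
// scalar s64 via LLT::scalarOrVector.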

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
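
// For example, <3 x s16> (48 bits) rounds up to two 32-bit words and becomes
// <4 x s16>, and <5 x s8> (40 bits) becomes <8 x s8>.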

// Retrieves the scalar type that's the same size as the mem desc.
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return std::make_pair(TypeIdx, LLT::scalar(MemSize));
  };
}

// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }
    return std::pair(TypeIdx,
                     LLT::fixed_vector(NewNumElts, Ty.getElementType()));
  };
}
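
// For example, a <7 x s64> (448-bit) query steps up one element at a time
// until getSGPRClassForBitWidth reports a register class; assuming there is
// no 448-bit SGPR class, that lands on <8 x s64> (512 bits).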

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}
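
// For a plain buffer resource pointer (p8) these map to s128 and <4 x s32>
// respectively; a <2 x p8> vector maps to <2 x s128> and <8 x s32>.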

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
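
// Above 32 bits the result is an s32 vector, e.g. <8 x s8> (64 bits) maps to
// <2 x s32> and <6 x s16> (96 bits) maps to <3 x s32>.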

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
         Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}
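
// For example, <2 x s16> and <3 x s32> are register vector types, while
// <3 x s16> (an odd number of 16-bit elements) is not; isRegisterSize
// additionally caps everything at MaxRegisterSize (1024 bits).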

// TODO: replace all uses of isRegisterType with isRegisterClassType
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
  if (!isRegisterSize(ST, Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
                                        unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    return isRegisterType(ST, Query.Types[TypeIdx]);
  };
}

// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
                                               unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(ST, Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

constexpr LLT S1 = LLT::scalar(1);
constexpr LLT S8 = LLT::scalar(8);
constexpr LLT S16 = LLT::scalar(16);
constexpr LLT S32 = LLT::scalar(32);
constexpr LLT F32 = LLT::float32();
constexpr LLT S64 = LLT::scalar(64);
constexpr LLT F64 = LLT::float64();
constexpr LLT S96 = LLT::scalar(96);
constexpr LLT S128 = LLT::scalar(128);
constexpr LLT S160 = LLT::scalar(160);
constexpr LLT S192 = LLT::scalar(192);
constexpr LLT S224 = LLT::scalar(224);
constexpr LLT S256 = LLT::scalar(256);
constexpr LLT S512 = LLT::scalar(512);
constexpr LLT S1024 = LLT::scalar(1024);
constexpr LLT MaxScalar = LLT::scalar(MaxRegisterSize);

constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
constexpr LLT V16S16 = LLT::fixed_vector(16, 16);

constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
constexpr LLT V2BF16 = V2F16; // FIXME

constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
constexpr LLT V32S32 = LLT::fixed_vector(32, 32);

constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
constexpr LLT V16S64 = LLT::fixed_vector(16, 64);

constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
constexpr LLT V4S128 = LLT::fixed_vector(4, 128);

constexpr std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};

constexpr std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

constexpr std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

constexpr std::initializer_list<LLT> AllS64Vectors = {
    V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

constexpr std::initializer_list<LLT> AllVectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
    V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
    V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
         is_contained(AllScalarTypes, Ty) ||
         (ST.useRealTrue16Insts() && Ty == S16) ||
         is_contained(AllS16Vectors, Ty);
}

static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
                                             unsigned TypeIdx) {
  return [&ST, TypeIdx](const LegalityQuery &Query) {
    return isRegisterClassType(ST, Query.Types[TypeIdx]);
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits and the memory size is a power of two.
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
           isPowerOf2_64(MemSize);
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.hasFlatScratchEnabled() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
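
// For example, a non-atomic access through the default (flat) case is capped
// at 32 bits on subtargets without multi-dword flat scratch addressing, while
// global/constant loads may be up to 512 bits and stores up to 128 bits.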

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
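
// For example, an s32 extending load from 8-bit or 16-bit memory passes the
// extload check above (subject to the alignment and address-space limits),
// an s64 load from 16-bit memory is rejected outright, and 96-bit accesses
// are only accepted when the subtarget has dwordx3 load/stores.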

// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
    return true;
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
}
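
// With the workaround active (the default), <8 x s16> (128 bits of 16-bit
// elements) and the wide scalar s96 both report true and get bitcast, while
// <4 x s32> and anything of 64 bits or less report false.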

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(ST, Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
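
// For example, a 96-bit non-atomic load on a subtarget without dwordx3
// load/stores that is known to be 128-bit aligned is a widening candidate
// (rounded up to 128 bits), provided the target reports the resulting access
// as fast.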

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
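
// Roughly, for the plain p8 case the net effect on the MIR is that
//   %rsrc:_(p8) = G_LOAD %ptr ...
// becomes
//   %vec:_(<4 x s32>) = G_LOAD %ptr ...
//   %e0:_(s32) = G_EXTRACT_VECTOR_ELT %vec, 0  (and likewise %e1..%e3)
//   %rsrc:_(p8) = G_MERGE_VALUES %e0, %e1, %e2, %e3
// so the memory operation itself only ever sees <4 x s32>.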

/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT PointerTy = MRI.getType(Pointer);
  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);

  if (!PointerTy.isVector()) {
    // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
    SmallVector<Register, 4> PointerParts;
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  }
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
}

static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
  const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
  const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
  const LLT BufferStridedPtr =
      GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
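
  // The pointer sizes above come from the target machine's data layout; as a
  // rough orientation, global, constant and flat pointers are 64-bit, local,
  // region, private and 32-bit-constant pointers are 32-bit, and the buffer
  // resource pointer is the 128-bit case discussed earlier in this file.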

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
      .legalFor(AllS32Vectors)
      .legalFor(AllS64Vectors)
      .legalFor(AddrSpaces64)
      .legalFor(AddrSpaces32)
      .legalFor(AddrSpaces128)
      .legalIf(isPointer(0))
      .clampScalar(0, S16, S256)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    if (ST.hasScalarAddSub64()) {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
          .legalFor({S64, S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .maxScalar(0, S32);
    } else {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
          .legalFor({S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .maxScalar(0, S32);
    }

    if (ST.hasScalarSMulU64()) {
      getActionDefinitionsBuilder(G_MUL)
          .legalFor({S64, S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .custom();
    } else {
      getActionDefinitionsBuilder(G_MUL)
          .legalFor({S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .custom();
    }
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
        .legalFor({S32, S16, V2S16}) // Clamp modifier
        .minScalarOrElt(0, S16)
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .widenScalarToNextPow2(0, 32)
        .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
        .legalFor({S32, S16})
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .maxScalar(0, S32)
        .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
        .legalFor({S32, S16})
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32, S16}) // Clamp modifier
        .minScalar(0, S16)
        .scalarize(0)
        .widenScalarToNextPow2(0, 16)
        .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
        .legalFor({S32})
        .widenScalarToNextMultipleOf(0, 32)
        .clampScalar(0, S32, S32)
        .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
                    .legalFor({S32})
                    .scalarize(0)
                    .minScalar(0, S32)
                    .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
          .legalFor({S32}) // Clamp modifier.
          .scalarize(0)
          .minScalarOrElt(0, S32)
          .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
          .minScalar(0, S32)
          .scalarize(0)
          .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .customFor({S32, S64})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
        .clampMaxNumElements(0, S8, 2)
        .lowerFor({V2S8});
  }

  Mulh
      .scalarize(0)
      .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
      .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
      .clampScalar(0, S32, S64)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(
          all(vectorWiderThan(0, 64), scalarOrEltNarrowerThan(0, 64)),
          fewerEltsToSize64Vector(0))
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder(
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
      .legalFor({{S32, S1}, {S32, S32}})
      .clampScalar(0, S32, S32)
      .scalarize(0);

  getActionDefinitionsBuilder(G_BITCAST)
      // Don't worry about the size constraint.
      .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
      .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({S1, S32, S64, S16, GlobalPtr,
                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalIf(isPointer(0))
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterClassType(ST, 0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .clampNumElements(0, V16S32, V32S32)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
      .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_STACKSAVE)
      .customFor({PrivatePtr});
  getActionDefinitionsBuilder(G_STACKRESTORE)
      .legalFor({PrivatePtr});

  getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
      .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
      .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
      .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
      .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  }

  auto &MinNumMaxNumIeee =
      getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
        .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
  } else {
    MinNumMaxNumIeee.legalFor(FPTypesBase)
        .clampScalar(0, S32, S64)
        .scalarize(0);
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder(
      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
        .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
        .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)
        .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
      .scalarize(0)
      .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
      .scalarize(0)
      .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
      .scalarize(0)
      .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
      .legalFor(FPTypesPK16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
        .legalFor({S16})
        .customFor({S32, S64})
        .scalarize(0)
        .unsupported();
    getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64, S16})
        .scalarize(0)
        .clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
        .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
        .scalarize(0)
        .maxScalarIf(typeIs(0, S16), 1, S16)
        .clampScalar(1, S32, S32)
        .lower();

    getActionDefinitionsBuilder(G_FFREXP)
        .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
        .scalarize(0)
        .lower();
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
        .customFor({S32, S64, S16})
        .scalarize(0)
        .unsupported();

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
          .customFor({S64})
          .legalFor({S32, S64})
          .scalarize(0)
          .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
          .legalFor({S32, S64})
          .scalarize(0)
          .clampScalar(0, S32, S64);
    }

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
        .legalFor({{S32, S32}, {S64, S32}})
        .scalarize(0)
        .clampScalar(0, S32, S64)
        .clampScalar(1, S32, S32)
        .lower();

    getActionDefinitionsBuilder(G_FFREXP)
        .customFor({{S32, S32}, {S64, S32}})
        .scalarize(0)
        .minScalar(0, S32)
        .clampScalar(1, S32, S32)
        .lower();
  }

  auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
  if (ST.hasCvtPkF16F32Inst()) {
    FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
        .clampMaxNumElements(0, S16, 2);
  } else {
    FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
  }
  FPTruncActions.scalarize(0).lower();

  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor({{S64, S32}, {S32, S16}})
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
      .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
        // Use actual fsub instruction
        .legalFor({S32, S16})
        // Must use fadd + fneg
        .lowerFor({S64, V2S16});
  } else {
    FSubActions
        // Use actual fsub instruction
        .legalFor({S32})
        // Must use fadd + fneg
        .lowerFor({S64, S16, V2S16});
  }

  FSubActions
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
      .legalIf(isScalar(0))
      .legalFor({{V2S16, V2S32}})
      .clampMaxNumElements(0, S16, 2)
      // Avoid scalarizing in cases that should be truly illegal. In
      // unresolvable situations (like an invalid implicit use), we don't want
      // to infinite loop in the legalizer.
      .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
      .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
      .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
                 {S32, S1}, {S64, S1}, {S16, S1}})
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
                    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
                    .lowerIf(typeIs(1, S1))
                    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
      .minScalar(0, S32)
      .scalarize(0)
      .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
                    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
                    .customFor({{S64, S32}, {S64, S64}})
                    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0)
      .lower();

  // clang-format off
  auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
      .legalFor({{S32, S32}, {S32, S64}})
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  FPToISat.minScalar(1, S32);
  FPToISat.minScalar(0, S32)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0)
      .lower();
  // clang-format on

  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)
      .scalarize(0)
      .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
      .legalFor({S16, S32})
      .scalarize(0)
      .lower();

  // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN.
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
      .scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)
      .scalarize(0)
      .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S16, S32, S64})
        .clampScalar(0, S16, S64)
        .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S32, S64})
        .clampScalar(0, S32, S64)
        .scalarize(0);
  } else {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S32})
        .customFor({S64})
        .clampScalar(0, S32, S64)
        .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .legalIf(all(isPointer(0), sameSize(0, 1)))
      .scalarize(0)
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
      .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
      .scalarSameSizeAs(1, 0)
      .scalarize(0);

  auto &CmpBuilder =
      getActionDefinitionsBuilder(G_ICMP)
          // The compare output type differs based on the register bank of the
          // output, so make both s1 and s32 legal.
          //
          // Scalar compares producing output in scc will be promoted to s32,
          // as that is the allocatable register type that will be needed for
          // the copy from scc. This will be promoted during RegBankSelect, and
          // we assume something before that won't try to use s32 result types.
          //
          // Vector compares producing an output in vcc/SGPR will use s1 in VCC
          // reg bank.
          .legalForCartesianProduct(
              {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr,
                     FlatPtr})
          .legalForCartesianProduct(
              {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr,
                      FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
      .scalarize(0)
      .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  auto &FCmpBuilder =
      getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
          {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

  FCmpBuilder
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
      .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
      .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)
      .lower();

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)
      .lower();

  auto &LogOps =
      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)
      .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor({{S32, S32}, {S32, S64}})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

  // If no 16-bit instruction is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
        .scalarize(0)
        .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)
        .scalarize(0)
        .lower();

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .scalarize(0)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)
      .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .legalFor({{S32, S32}, {S32, S64}})
      .customIf(scalarNarrowerThan(1, 32))
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .legalFor({{S32, S32}, {S32, S64}})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0)
      .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .legalFor({S16, S32, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        // FIXME: Fixing non-power-of-2 before clamp is workaround for
        // narrowScalar limitation.
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder(G_ABS)
          .legalFor({S32, S16, V2S16})
          .clampMaxNumElements(0, S16, 2)
          .minScalar(0, S16)
          .widenScalarToNextPow2(0)
          .scalarize(0)
          .lower();
      if (ST.hasIntMinMax64()) {
        getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
            .legalFor({S32, S16, S64, V2S16})
            .clampMaxNumElements(0, S16, 2)
            .minScalar(0, S16)
            .widenScalarToNextPow2(0)
            .scalarize(0)
            .lower();
      } else {
        getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
            .legalFor({S32, S16, V2S16})
            .clampMaxNumElements(0, S16, 2)
            .minScalar(0, S16)
            .widenScalarToNextPow2(0)
            .scalarize(0)
            .lower();
1416 }
1417 } else {
1418 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1419 .legalFor(Types: {S32, S16})
1420 .widenScalarToNextPow2(TypeIdx: 0)
1421 .minScalar(TypeIdx: 0, Ty: S16)
1422 .scalarize(TypeIdx: 0)
1423 .lower();
1424 }
1425 } else {
1426 // TODO: Should have same legality without v_perm_b32
1427 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1428 .legalFor(Types: {S32})
1429 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1430 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1431 // narrowScalar limitation.
1432 .widenScalarToNextPow2(TypeIdx: 0)
1433 .maxScalar(TypeIdx: 0, Ty: S32)
1434 .scalarize(TypeIdx: 0)
1435 .lower();
1436
1437 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1438 .legalFor(Types: {S32})
1439 .minScalar(TypeIdx: 0, Ty: S32)
1440 .widenScalarToNextPow2(TypeIdx: 0)
1441 .scalarize(TypeIdx: 0)
1442 .lower();
1443 }
1444
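  // For G_INTTOPTR and G_PTRTOINT, size mismatches are resolved by resizing the
  // integer operand to the pointer's width; the pointer type itself is never
  // changed by the mutations below.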
1445 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1446 // List the common cases
1447 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1448 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1449 .scalarize(TypeIdx: 0)
1450 // Accept any address space as long as the size matches
1451 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1452 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1453 Mutation: [](const LegalityQuery &Query) {
1454 return std::pair(
1455 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1456 })
1457 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1458 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1459 });
1460
1461 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1462 // List the common cases
1463 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1464 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1465 .scalarize(TypeIdx: 0)
1466 // Accept any address space as long as the size matches
1467 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1468 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1469 Mutation: [](const LegalityQuery &Query) {
1470 return std::pair(
1471 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1472 })
1473 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1474 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1475 });
1476
1477 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1478 .scalarize(TypeIdx: 0)
1479 .custom();
1480
1481 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1482 bool IsLoad) -> bool {
1483 const LLT DstTy = Query.Types[0];
1484
1485 // Split vector extloads.
1486 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1487
1488 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1489 return true;
1490
1491 const LLT PtrTy = Query.Types[1];
1492 unsigned AS = PtrTy.getAddressSpace();
1493 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1494 IsAtomic: Query.MMODescrs[0].Ordering !=
1495 AtomicOrdering::NotAtomic))
1496 return true;
1497
1498 // Catch weird sized loads that don't evenly divide into the access sizes
1499 // TODO: May be able to widen depending on alignment etc.
1500 unsigned NumRegs = (MemSize + 31) / 32;
1501 if (NumRegs == 3) {
1502 if (!ST.hasDwordx3LoadStores())
1503 return true;
1504 } else {
1505 // If the alignment allows, these should have been widened.
1506 if (!isPowerOf2_32(Value: NumRegs))
1507 return true;
1508 }
1509
1510 return false;
1511 };
1512
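  // Minimum alignment, in bits, required for the global/constant entries below;
  // a value of 0 drops the alignment requirement when unaligned buffer access
  // is enabled.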
1513 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1514 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1515 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1516
1517 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1518 // LDS
1519 // TODO: Unsupported flat for SI.
1520
1521 for (unsigned Op : {G_LOAD, G_STORE}) {
1522 const bool IsStore = Op == G_STORE;
1523
1524 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1525 // Explicitly list some common cases.
1526 // TODO: Does this help compile time at all?
1527 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1528 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1529 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1530 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1531 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1532 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1533 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1534 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1535
1536 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1537 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1538 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1539 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1540 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1541 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1542
1543 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1544 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1545 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1546 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1547
1548 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1549 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1550 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1551 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1552 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1553 Actions.legalIf(
1554 Predicate: [=](const LegalityQuery &Query) -> bool {
1555 return isLoadStoreLegal(ST, Query);
1556 });
1557
1558 // The custom pointers (fat pointers, buffer resources) don't work with load
1559 // and store at this level. Fat pointers should have been lowered to
1560 // intrinsics before the translation to MIR.
1561 Actions.unsupportedIf(
1562 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1563
1564 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1565 // ptrtoint. This is needed to account for the fact that we can't have i128
1566 // as a register class for SelectionDAG reasons.
1567 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1568 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1569 });
1570
1571 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1572 // 64-bits.
1573 //
1574 // TODO: Should generalize bitcast action into coerce, which will also cover
1575 // inserting addrspacecasts.
1576 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1577
1578 // Turn any illegal element vectors into something easier to deal
1579 // with. These will ultimately produce 32-bit scalar shifts to extract the
1580 // parts anyway.
1581 //
1582 // For odd 16-bit element vectors, prefer to split those into pieces with
1583 // 16-bit vector parts.
1584 Actions.bitcastIf(
1585 Predicate: [=](const LegalityQuery &Query) -> bool {
1586 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1587 MemTy: Query.MMODescrs[0].MemoryTy);
1588 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1589
1590 if (!IsStore) {
1591 // Widen suitably aligned loads by loading extra bytes. The standard
1592 // legalization actions can't properly express widening memory operands.
1593 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1594 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1595 });
1596 }
1597
1598 // FIXME: load/store narrowing should be moved to lower action
1599 Actions
1600 .narrowScalarIf(
1601 Predicate: [=](const LegalityQuery &Query) -> bool {
1602 return !Query.Types[0].isVector() &&
1603 needToSplitMemOp(Query, Op == G_LOAD);
1604 },
1605 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1606 const LLT DstTy = Query.Types[0];
1607 const LLT PtrTy = Query.Types[1];
1608
1609 const unsigned DstSize = DstTy.getSizeInBits();
1610 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1611
1612 // Split extloads.
1613 if (DstSize > MemSize)
1614 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1615
1616 unsigned MaxSize = maxSizeForAddrSpace(
1617 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1618 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1619 if (MemSize > MaxSize)
1620 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1621
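          // Otherwise fall back to splitting at the access alignment;
          // AlignInBits is used directly as the width of the narrowed scalar.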
1622 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1623 return std::pair(0, LLT::scalar(SizeInBits: Align));
1624 })
1625 .fewerElementsIf(
1626 Predicate: [=](const LegalityQuery &Query) -> bool {
1627 return Query.Types[0].isVector() &&
1628 needToSplitMemOp(Query, Op == G_LOAD);
1629 },
1630 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1631 const LLT DstTy = Query.Types[0];
1632 const LLT PtrTy = Query.Types[1];
1633
1634 LLT EltTy = DstTy.getElementType();
1635 unsigned MaxSize = maxSizeForAddrSpace(
1636 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1637 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1638
1639 // FIXME: Handle widened to power of 2 results better. This ends
1640 // up scalarizing.
1641 // FIXME: 3 element stores scalarized on SI
1642
1643 // Split if it's too large for the address space.
1644 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1645 if (MemSize > MaxSize) {
1646 unsigned NumElts = DstTy.getNumElements();
1647 unsigned EltSize = EltTy.getSizeInBits();
1648
1649 if (MaxSize % EltSize == 0) {
1650 return std::pair(
1651 0, LLT::scalarOrVector(
1652 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1653 }
1654
1655 unsigned NumPieces = MemSize / MaxSize;
1656
1657 // FIXME: Refine when odd breakdowns handled
1658 // The scalars will need to be re-legalized.
1659 if (NumPieces == 1 || NumPieces >= NumElts ||
1660 NumElts % NumPieces != 0)
1661 return std::pair(0, EltTy);
1662
1663 return std::pair(0,
1664 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1665 }
1666
1667 // FIXME: We could probably handle weird extending loads better.
1668 if (DstTy.getSizeInBits() > MemSize)
1669 return std::pair(0, EltTy);
1670
1671 unsigned EltSize = EltTy.getSizeInBits();
1672 unsigned DstSize = DstTy.getSizeInBits();
1673 if (!isPowerOf2_32(Value: DstSize)) {
1674 // We're probably decomposing an odd sized store. Try to split
1675 // to the widest type. TODO: Account for alignment. As-is it
1676 // should be OK, since the new parts will be further legalized.
1677 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1678 return std::pair(
1679 0, LLT::scalarOrVector(
1680 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1681 }
1682
1683 // May need relegalization for the scalars.
1684 return std::pair(0, EltTy);
1685 })
1686 .minScalar(TypeIdx: 0, Ty: S32)
1687 .narrowScalarIf(Predicate: isTruncStoreToSizePowerOf2(TypeIdx: 0),
1688 Mutation: getScalarTypeFromMemDesc(TypeIdx: 0))
1689 .widenScalarToNextPow2(TypeIdx: 0)
1690 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1691 .lower();
1692 }
1693
1694 // FIXME: Unaligned accesses not lowered.
1695 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1696 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1697 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1698 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1699 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1700 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1701 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1702 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1703 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1704 .legalIf(
1705 Predicate: [=](const LegalityQuery &Query) -> bool {
1706 return isLoadStoreLegal(ST, Query);
1707 });
1708
1709 if (ST.hasFlatAddressSpace()) {
1710 ExtLoads.legalForTypesWithMemDesc(
1711 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1712 }
1713
1714 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1715 // 64-bits.
1716 //
1717 // TODO: Should generalize bitcast action into coerce, which will also cover
1718 // inserting addrspacecasts.
1719 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1720
1721 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1722 .widenScalarToNextPow2(TypeIdx: 0)
1723 .lower();
1724
1725 auto &Atomics = getActionDefinitionsBuilder(
1726 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1727 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1728 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1729 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1730 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1731 {S64, GlobalPtr}, {S64, LocalPtr},
1732 {S32, RegionPtr}, {S64, RegionPtr}});
1733 if (ST.hasFlatAddressSpace()) {
1734 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1735 }
1736
1737 auto &Atomics32 =
1738 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1739 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1740 if (ST.hasFlatAddressSpace()) {
1741 Atomics32.legalFor(Types: {{S32, FlatPtr}});
1742 }
1743
1744 // TODO: v2bf16 operations, and fat buffer pointer support.
1745 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1746 if (ST.hasLDSFPAtomicAddF32()) {
1747 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1748 if (ST.hasLdsAtomicAddF64())
1749 Atomic.legalFor(Types: {{S64, LocalPtr}});
1750 if (ST.hasAtomicDsPkAdd16Insts())
1751 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1752 }
1753 if (ST.hasAtomicFaddInsts())
1754 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1755 if (ST.hasFlatAtomicFaddF32Inst())
1756 Atomic.legalFor(Types: {{S32, FlatPtr}});
1757
1758 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1759    // These are legal with some caveats, and should have undergone expansion in
1760    // the IR in most situations.
1761 // TODO: Move atomic expansion into legalizer
1762 Atomic.legalFor(Types: {
1763 {S32, GlobalPtr},
1764 {S64, GlobalPtr},
1765 {S64, FlatPtr}
1766 });
1767 }
1768
1769 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1770 ST.hasAtomicBufferGlobalPkAddF16Insts())
1771 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1772 if (ST.hasAtomicGlobalPkAddBF16Inst())
1773 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1774 if (ST.hasAtomicFlatPkAdd16Insts())
1775 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1776
1778 // Most of the legalization work here is done by AtomicExpand. We could
1779 // probably use a simpler legality rule that just assumes anything is OK.
1780 auto &AtomicFMinFMax =
1781 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1782 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1783
1784 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1785    AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr}, {F32, BufferFatPtr}});
1786 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1787 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1788 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1789 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1790 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1791 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1792
1793  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1794  // demarshalling.
1795 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1796 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1797 {S32, FlatPtr}, {S64, FlatPtr}})
1798 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1799 {S32, RegionPtr}, {S64, RegionPtr}});
1800 // TODO: Pointer types, any 32-bit or 64-bit vector
1801
1802 // Condition should be s32 for scalar, s1 for vector.
1803 getActionDefinitionsBuilder(Opcode: G_SELECT)
1804 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1805 LocalPtr, FlatPtr, PrivatePtr,
1806 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1807 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1808 Types1: {S1, S32})
1809 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1810 .scalarize(TypeIdx: 1)
1811 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1812 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1813 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1814 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1815 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1816 .scalarize(TypeIdx: 0)
1817 .widenScalarToNextPow2(TypeIdx: 0)
1818 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1819
1820 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1821 // be more flexible with the shift amount type.
1822 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1823 .legalFor(Types: {{S32, S32}, {S64, S32}});
1824 if (ST.has16BitInsts()) {
1825 if (ST.hasVOP3PInsts()) {
1826 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1827 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1828 } else
1829 Shifts.legalFor(Types: {{S16, S16}});
1830
1831 // TODO: Support 16-bit shift amounts for all types
1832 Shifts.widenScalarIf(
1833 Predicate: [=](const LegalityQuery &Query) {
1834 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1835 // 32-bit amount.
1836 const LLT ValTy = Query.Types[0];
1837 const LLT AmountTy = Query.Types[1];
1838 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1839 AmountTy.getSizeInBits() < 16;
1840 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1841 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1842 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1843 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1844 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1845
1846 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1847 .minScalar(TypeIdx: 0, Ty: S16)
1848 .scalarize(TypeIdx: 0)
1849 .lower();
1850 } else {
1851 // Make sure we legalize the shift amount type first, as the general
1852 // expansion for the shifted type will produce much worse code if it hasn't
1853 // been truncated already.
1854 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1855 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1856 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1857
1858 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1859 .minScalar(TypeIdx: 0, Ty: S32)
1860 .scalarize(TypeIdx: 0)
1861 .lower();
1862 }
1863 Shifts.scalarize(TypeIdx: 0);
1864
1865 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1866 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1867 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1868 unsigned IdxTypeIdx = 2;
1869
1870 getActionDefinitionsBuilder(Opcode: Op)
1871 .customIf(Predicate: [=](const LegalityQuery &Query) {
1872 const LLT EltTy = Query.Types[EltTypeIdx];
1873 const LLT VecTy = Query.Types[VecTypeIdx];
1874 const LLT IdxTy = Query.Types[IdxTypeIdx];
1875 const unsigned EltSize = EltTy.getSizeInBits();
1876 const bool isLegalVecType =
1877 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1878 // Address space 8 pointers are 128-bit wide values, but the logic
1879 // below will try to bitcast them to 2N x s64, which will fail.
1880        // Therefore, as an intermediate step, handle extracts/inserts by
1881        // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1882        // extraction result) so that the operation becomes one the logic below
1883        // can handle.
1884 if (EltTy.isPointer() && EltSize > 64)
1885 return true;
1886 return (EltSize == 32 || EltSize == 64) &&
1887 VecTy.getSizeInBits() % 32 == 0 &&
1888 VecTy.getSizeInBits() <= MaxRegisterSize &&
1889 IdxTy.getSizeInBits() == 32 &&
1890 isLegalVecType;
1891 })
1892 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1893 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1894 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1895 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1896 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1897 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1898 Mutation: [=](const LegalityQuery &Query) {
1899 // For > 64-bit element types, try to turn this into a
1900 // 64-bit element vector since we may be able to do better
1901 // indexing if this is scalar. If not, fall back to 32.
1902 const LLT EltTy = Query.Types[EltTypeIdx];
1903 const LLT VecTy = Query.Types[VecTypeIdx];
1904 const unsigned DstEltSize = EltTy.getSizeInBits();
1905 const unsigned VecSize = VecTy.getSizeInBits();
1906
1907 const unsigned TargetEltSize =
1908 DstEltSize % 64 == 0 ? 64 : 32;
1909 return std::pair(VecTypeIdx,
1910 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
1911 ScalarSizeInBits: TargetEltSize));
1912 })
1913 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1914 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1915 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1916 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1917 // TODO: Clamp elements for 64-bit vectors?
1918 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
1919 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1920        // Lowering should only be necessary with variable indexes.
1921        // As a last resort, lower to the stack.
1922 .lower();
1923 }
1924
1925 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1926 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1927 const LLT &EltTy = Query.Types[1].getElementType();
1928 return Query.Types[0] != EltTy;
1929 });
1930
1931 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1932 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1933 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1934
1935 // FIXME: Doesn't handle extract of illegal sizes.
1936 getActionDefinitionsBuilder(Opcode: Op)
1937 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1938 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1939          // Sub-vector (or single-element) insert and extract.
1940 // TODO: verify immediate offset here since lower only works with
1941 // whole elements.
1942 const LLT BigTy = Query.Types[BigTyIdx];
1943 return BigTy.isVector();
1944 })
1945 // FIXME: Multiples of 16 should not be legal.
1946 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1947 const LLT BigTy = Query.Types[BigTyIdx];
1948 const LLT LitTy = Query.Types[LitTyIdx];
1949 return (BigTy.getSizeInBits() % 32 == 0) &&
1950 (LitTy.getSizeInBits() % 16 == 0);
1951 })
1952 .widenScalarIf(
1953 Predicate: [=](const LegalityQuery &Query) {
1954 const LLT BigTy = Query.Types[BigTyIdx];
1955 return (BigTy.getScalarSizeInBits() < 16);
1956 },
1957 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1958 .widenScalarIf(
1959 Predicate: [=](const LegalityQuery &Query) {
1960 const LLT LitTy = Query.Types[LitTyIdx];
1961 return (LitTy.getScalarSizeInBits() < 16);
1962 },
1963 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1964 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1965 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1966
1967 }
1968
1969 auto &BuildVector =
1970 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1971 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1972 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1973 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1974 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1975 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1976 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
1977 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1978
1979 if (ST.hasScalarPackInsts()) {
1980 BuildVector
1981 // FIXME: Should probably widen s1 vectors straight to s32
1982 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1983 .minScalar(TypeIdx: 1, Ty: S16);
1984
1985 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1986 .legalFor(Types: {V2S16, S32})
1987 .lower();
1988 } else {
1989 BuildVector.customFor(Types: {V2S16, S16});
1990 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1991
1992 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1993 .customFor(Types: {V2S16, S32})
1994 .lower();
1995 }
1996
1997 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
1998
1999 // FIXME: Clamp maximum size
2000 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
2001 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2002 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
2003 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
2004 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
2005
2006 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
2007
2008 // Merge/Unmerge
2009 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2010 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2011 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2012
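    // An element type is rejected here if it is narrower than 8 bits, wider
    // than 512 bits, or not a power-of-2 in size.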
2013 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2014 const LLT Ty = Query.Types[TypeIdx];
2015 if (Ty.isVector()) {
2016 const LLT &EltTy = Ty.getElementType();
2017 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2018 return true;
2019 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
2020 return true;
2021 }
2022 return false;
2023 };
2024
2025 auto &Builder =
2026 getActionDefinitionsBuilder(Opcode: Op)
2027 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2028 .lowerFor(Types: {{S16, V2S16}})
2029 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
2030 const LLT BigTy = Query.Types[BigTyIdx];
2031 return BigTy.getSizeInBits() == 32;
2032 })
2033 // Try to widen to s16 first for small types.
2034 // TODO: Only do this on targets with legal s16 shifts
2035 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
2036 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
2037 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
2038 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
2039 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
2040 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
2041 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
2042            // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2043 // not worth considering the multiples of 64 since 2*192 and 2*384
2044 // are not valid.
2045 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
2046 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
2047 // Break up vectors with weird elements into scalars
2048 .fewerElementsIf(
2049 Predicate: [=](const LegalityQuery &Query) {
2050 return notValidElt(Query, LitTyIdx);
2051 },
2052 Mutation: scalarize(TypeIdx: 0))
2053 .fewerElementsIf(
2054 Predicate: [=](const LegalityQuery &Query) {
2055 return notValidElt(Query, BigTyIdx);
2056 },
2057 Mutation: scalarize(TypeIdx: 1))
2058 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
2059
2060 if (Op == G_MERGE_VALUES) {
2061 Builder.widenScalarIf(
2062 // TODO: Use 16-bit shifts if legal for 8-bit values?
2063 Predicate: [=](const LegalityQuery &Query) {
2064 const LLT Ty = Query.Types[LitTyIdx];
2065 return Ty.getSizeInBits() < 32;
2066 },
2067 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
2068 }
2069
2070 Builder.widenScalarIf(
2071 Predicate: [=](const LegalityQuery &Query) {
2072 const LLT Ty = Query.Types[BigTyIdx];
2073 return Ty.getSizeInBits() % 16 != 0;
2074 },
2075 Mutation: [=](const LegalityQuery &Query) {
2076          // Pick the next power of 2, or a multiple of 64 over 128,
2077          // whichever is smaller.
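          // e.g. a 300-bit type is widened to s320 (the next multiple of 64)
          // rather than to s512 (the next power of 2).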
2078 const LLT &Ty = Query.Types[BigTyIdx];
2079 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2080 if (NewSizeInBits >= 256) {
2081 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2082 if (RoundedTo < NewSizeInBits)
2083 NewSizeInBits = RoundedTo;
2084 }
2085 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2086 })
2087 // Any vectors left are the wrong size. Scalarize them.
2088 .scalarize(TypeIdx: 0)
2089 .scalarize(TypeIdx: 1);
2090 }
2091
2092 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2093 // RegBankSelect.
2094 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2095 .legalFor(Types: {{S32}, {S64}})
2096 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2097
2098 if (ST.hasVOP3PInsts()) {
2099 SextInReg.lowerFor(Types: {{V2S16}})
2100 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2101 // get more vector shift opportunities, since we'll get those when
2102 // expanded.
2103 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2104 } else if (ST.has16BitInsts()) {
2105 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2106 } else {
2107 // Prefer to promote to s32 before lowering if we don't have 16-bit
2108    // shifts. This avoids a lot of intermediate truncate and extend operations.
2109 SextInReg.lowerFor(Types: {{S32}, {S64}});
2110 }
2111
2112 SextInReg
2113 .scalarize(TypeIdx: 0)
2114 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2115 .lower();
2116
2117 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2118 .scalarize(TypeIdx: 0)
2119 .lower();
2120
2121 auto &FSHRActionDefs = getActionDefinitionsBuilder(Opcode: G_FSHR);
2122 FSHRActionDefs.legalFor(Types: {{S32, S32}})
2123 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2124 if (ST.hasVOP3PInsts())
2125 FSHRActionDefs.lowerFor(Types: {{V2S16, V2S16}});
2126 FSHRActionDefs.scalarize(TypeIdx: 0).lower();
2127
2128 if (ST.hasVOP3PInsts()) {
2129 getActionDefinitionsBuilder(Opcode: G_FSHL)
2130 .lowerFor(Types: {{V2S16, V2S16}})
2131 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2132 .scalarize(TypeIdx: 0)
2133 .lower();
2134 } else {
2135 getActionDefinitionsBuilder(Opcode: G_FSHL)
2136 .scalarize(TypeIdx: 0)
2137 .lower();
2138 }
2139
2140 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2141 .legalFor(Types: {S64});
2142
2143 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2144
2145 getActionDefinitionsBuilder(Opcode: G_FENCE)
2146 .alwaysLegal();
2147
2148 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2149 .scalarize(TypeIdx: 0)
2150 .minScalar(TypeIdx: 0, Ty: S32)
2151 .lower();
2152
2153 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2154 .legalFor(Types: {{S32, S32}, {S64, S32}})
2155 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2156 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2157 .widenScalarToNextPow2(TypeIdx: 0)
2158 .scalarize(TypeIdx: 0);
2159
2160 getActionDefinitionsBuilder(
2161 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2162 G_FCOPYSIGN,
2163
2164 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2165 G_READ_REGISTER, G_WRITE_REGISTER,
2166
2167 G_SADDO, G_SSUBO})
2168 .lower();
2169
2170 if (ST.hasIEEEMinimumMaximumInsts()) {
2171 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2172 .legalFor(Types: FPTypesPK16)
2173 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2174 .scalarize(TypeIdx: 0);
2175 } else if (ST.hasVOP3PInsts()) {
2176 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2177 .lowerFor(Types: {V2S16})
2178 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2179 .scalarize(TypeIdx: 0)
2180 .lower();
2181 } else {
2182 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2183 .scalarize(TypeIdx: 0)
2184 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2185 .lower();
2186 }
2187
2188 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2189 .lower();
2190
2191 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2192
2193 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2194 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2195 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2196 .unsupported();
2197
2198 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2199
2200 getActionDefinitionsBuilder(
2201 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2202 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2203 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2204 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2205 .legalFor(Types: AllVectors)
2206 .scalarize(TypeIdx: 1)
2207 .lower();
2208
2209 getLegacyLegalizerInfo().computeTables();
2210 verify(MII: *ST.getInstrInfo());
2211}
2212
2213bool AMDGPULegalizerInfo::legalizeCustom(
2214 LegalizerHelper &Helper, MachineInstr &MI,
2215 LostDebugLocObserver &LocObserver) const {
2216 MachineIRBuilder &B = Helper.MIRBuilder;
2217 MachineRegisterInfo &MRI = *B.getMRI();
2218
2219 switch (MI.getOpcode()) {
2220 case TargetOpcode::G_ADDRSPACE_CAST:
2221 return legalizeAddrSpaceCast(MI, MRI, B);
2222 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2223 return legalizeFroundeven(MI, MRI, B);
2224 case TargetOpcode::G_FCEIL:
2225 return legalizeFceil(MI, MRI, B);
2226 case TargetOpcode::G_FREM:
2227 return legalizeFrem(MI, MRI, B);
2228 case TargetOpcode::G_INTRINSIC_TRUNC:
2229 return legalizeIntrinsicTrunc(MI, MRI, B);
2230 case TargetOpcode::G_SITOFP:
2231 return legalizeITOFP(MI, MRI, B, Signed: true);
2232 case TargetOpcode::G_UITOFP:
2233 return legalizeITOFP(MI, MRI, B, Signed: false);
2234 case TargetOpcode::G_FPTOSI:
2235 return legalizeFPTOI(MI, MRI, B, Signed: true);
2236 case TargetOpcode::G_FPTOUI:
2237 return legalizeFPTOI(MI, MRI, B, Signed: false);
2238 case TargetOpcode::G_FMINNUM:
2239 case TargetOpcode::G_FMAXNUM:
2240 case TargetOpcode::G_FMINIMUMNUM:
2241 case TargetOpcode::G_FMAXIMUMNUM:
2242 return legalizeMinNumMaxNum(Helper, MI);
2243 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2244 return legalizeExtractVectorElt(MI, MRI, B);
2245 case TargetOpcode::G_INSERT_VECTOR_ELT:
2246 return legalizeInsertVectorElt(MI, MRI, B);
2247 case TargetOpcode::G_FSIN:
2248 case TargetOpcode::G_FCOS:
2249 return legalizeSinCos(MI, MRI, B);
2250 case TargetOpcode::G_GLOBAL_VALUE:
2251 return legalizeGlobalValue(MI, MRI, B);
2252 case TargetOpcode::G_LOAD:
2253 case TargetOpcode::G_SEXTLOAD:
2254 case TargetOpcode::G_ZEXTLOAD:
2255 return legalizeLoad(Helper, MI);
2256 case TargetOpcode::G_STORE:
2257 return legalizeStore(Helper, MI);
2258 case TargetOpcode::G_FMAD:
2259 return legalizeFMad(MI, MRI, B);
2260 case TargetOpcode::G_FDIV:
2261 return legalizeFDIV(MI, MRI, B);
2262 case TargetOpcode::G_FFREXP:
2263 return legalizeFFREXP(MI, MRI, B);
2264 case TargetOpcode::G_FSQRT:
2265 return legalizeFSQRT(MI, MRI, B);
2266 case TargetOpcode::G_UDIV:
2267 case TargetOpcode::G_UREM:
2268 case TargetOpcode::G_UDIVREM:
2269 return legalizeUnsignedDIV_REM(MI, MRI, B);
2270 case TargetOpcode::G_SDIV:
2271 case TargetOpcode::G_SREM:
2272 case TargetOpcode::G_SDIVREM:
2273 return legalizeSignedDIV_REM(MI, MRI, B);
2274 case TargetOpcode::G_ATOMIC_CMPXCHG:
2275 return legalizeAtomicCmpXChg(MI, MRI, B);
2276 case TargetOpcode::G_FLOG2:
2277 return legalizeFlog2(MI, B);
2278 case TargetOpcode::G_FLOG:
2279 case TargetOpcode::G_FLOG10:
2280 return legalizeFlogCommon(MI, B);
2281 case TargetOpcode::G_FEXP2:
2282 return legalizeFExp2(MI, B);
2283 case TargetOpcode::G_FEXP:
2284 case TargetOpcode::G_FEXP10:
2285 return legalizeFExp(MI, B);
2286 case TargetOpcode::G_FPOW:
2287 return legalizeFPow(MI, B);
2288 case TargetOpcode::G_FFLOOR:
2289 return legalizeFFloor(MI, MRI, B);
2290 case TargetOpcode::G_BUILD_VECTOR:
2291 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2292 return legalizeBuildVector(MI, MRI, B);
2293 case TargetOpcode::G_MUL:
2294 return legalizeMul(Helper, MI);
2295 case TargetOpcode::G_CTLZ:
2296 case TargetOpcode::G_CTTZ:
2297 return legalizeCTLZ_CTTZ(MI, MRI, B);
2298 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2299 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2300 case TargetOpcode::G_STACKSAVE:
2301 return legalizeStackSave(MI, B);
2302 case TargetOpcode::G_GET_FPENV:
2303 return legalizeGetFPEnv(MI, MRI, B);
2304 case TargetOpcode::G_SET_FPENV:
2305 return legalizeSetFPEnv(MI, MRI, B);
2306 case TargetOpcode::G_TRAP:
2307 return legalizeTrap(MI, MRI, B);
2308 case TargetOpcode::G_DEBUGTRAP:
2309 return legalizeDebugTrap(MI, MRI, B);
2310 default:
2311 return false;
2312 }
2313
2314 llvm_unreachable("expected switch to return");
2315}
2316
2317Register AMDGPULegalizerInfo::getSegmentAperture(
2318 unsigned AS,
2319 MachineRegisterInfo &MRI,
2320 MachineIRBuilder &B) const {
2321 MachineFunction &MF = B.getMF();
2322 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2323 const LLT S32 = LLT::scalar(SizeInBits: 32);
2324 const LLT S64 = LLT::scalar(SizeInBits: 64);
2325
2326 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2327
2328 if (ST.hasApertureRegs()) {
2329 // Note: this register is somewhat broken. When used as a 32-bit operand,
2330 // it only returns zeroes. The real value is in the upper 32 bits.
2331    // Thus, we must copy the full 64-bit register and extract the high 32 bits.
2332 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2333 ? AMDGPU::SRC_SHARED_BASE
2334 : AMDGPU::SRC_PRIVATE_BASE;
2335 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2336 !ST.hasGloballyAddressableScratch()) &&
2337 "Cannot use src_private_base with globally addressable scratch!");
2338 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2339 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2340 B.buildCopy(Res: {Dst}, Op: {Register(ApertureRegNo)});
2341 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2342 }
2343
2344 Register LoadAddr = MRI.createGenericVirtualRegister(
2345 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2346 // For code object version 5, private_base and shared_base are passed through
2347 // implicit kernargs.
2348 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2349 AMDGPU::AMDHSA_COV5) {
2350 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
2351
2352 AMDGPUTargetLowering::ImplicitParameter Param =
2353 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2354 : AMDGPUTargetLowering::PRIVATE_BASE;
2355 uint64_t Offset =
2356 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2357
2358 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2359 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2360
2361 if (!loadInputValue(DstReg: KernargPtrReg, B,
2362 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2363 return Register();
2364
2365 MachineMemOperand *MMO = MF.getMachineMemOperand(
2366 PtrInfo: PtrInfo.getWithOffset(O: Offset),
2367 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2368 MachineMemOperand::MOInvariant,
2369 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2370
2371 // Pointer address
2372 B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
2373 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2374 // Load address
2375 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2376 }
2377
2378 Register QueuePtr = MRI.createGenericVirtualRegister(
2379 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2380
2381 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2382 return Register();
2383
2384 // TODO: Use custom PseudoSourceValue
2385 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2386
2387 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2388 // private_segment_aperture_base_hi.
2389 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2390
2391 MachineMemOperand *MMO = MF.getMachineMemOperand(
2392 PtrInfo,
2393 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2394 MachineMemOperand::MOInvariant,
2395 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2396
2397 B.buildObjectPtrOffset(
2398 Res: LoadAddr, Op0: QueuePtr,
2399 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2400 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2401}
2402
2403/// Return true if the value is a known valid address, such that a null check is
2404/// not necessary.
2405static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2406 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2407 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2408 switch (Def->getOpcode()) {
2409 case AMDGPU::G_FRAME_INDEX:
2410 case AMDGPU::G_GLOBAL_VALUE:
2411 case AMDGPU::G_BLOCK_ADDR:
2412 return true;
2413 case AMDGPU::G_CONSTANT: {
2414 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2415 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2416 }
2417 default:
2418 return false;
2419 }
2420
2421 return false;
2422}
2423
2424bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2425 MachineInstr &MI, MachineRegisterInfo &MRI,
2426 MachineIRBuilder &B) const {
2427 MachineFunction &MF = B.getMF();
2428
2429 // MI can either be a G_ADDRSPACE_CAST or a
2430 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2431 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2432 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2433 Intrinsic::amdgcn_addrspacecast_nonnull));
2434
2435 const LLT S32 = LLT::scalar(SizeInBits: 32);
2436 Register Dst = MI.getOperand(i: 0).getReg();
2437 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2438 : MI.getOperand(i: 1).getReg();
2439 LLT DstTy = MRI.getType(Reg: Dst);
2440 LLT SrcTy = MRI.getType(Reg: Src);
2441 unsigned DestAS = DstTy.getAddressSpace();
2442 unsigned SrcAS = SrcTy.getAddressSpace();
2443
2444 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2445 // vector element.
2446 assert(!DstTy.isVector());
2447
2448 const AMDGPUTargetMachine &TM
2449 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2450
2451 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2452 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2453 return true;
2454 }
2455
2456 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2457 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2458 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2459 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2460 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2461 ST.hasGloballyAddressableScratch()) {
2462 // flat -> private with globally addressable scratch: subtract
2463 // src_flat_scratch_base_lo.
2464 const LLT S32 = LLT::scalar(SizeInBits: 32);
2465 Register SrcLo = B.buildExtract(Res: S32, Src, Index: 0).getReg(Idx: 0);
2466 Register FlatScratchBaseLo =
2467 B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
2468 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2469 .getReg(Idx: 0);
2470 MRI.setRegClass(Reg: FlatScratchBaseLo, RC: &AMDGPU::SReg_32RegClass);
2471 Register Sub = B.buildSub(Dst: S32, Src0: SrcLo, Src1: FlatScratchBaseLo).getReg(Idx: 0);
2472 return B.buildIntToPtr(Dst, Src: Sub).getReg(Idx: 0);
2473 }
2474
2475 // Extract low 32-bits of the pointer.
2476 return B.buildExtract(Res: Dst, Src, Index: 0).getReg(Idx: 0);
2477 };
2478
2479    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2480    // G_ADDRSPACE_CAST we need to guess.
2481 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2482 castFlatToLocalOrPrivate(Dst);
2483 MI.eraseFromParent();
2484 return true;
2485 }
2486
2487 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2488
2489 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2490 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2491
2492 // Extract low 32-bits of the pointer.
2493 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2494
2495 auto CmpRes =
2496 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2497 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2498
2499 MI.eraseFromParent();
2500 return true;
2501 }
2502
2503 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2504 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2505 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2506 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2507 // Coerce the type of the low half of the result so we can use
2508 // merge_values.
2509 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2510
2511 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2512 ST.hasGloballyAddressableScratch()) {
2513 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2514 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2515 Register AllOnes = B.buildConstant(Res: S32, Val: -1).getReg(Idx: 0);
2516 Register ThreadID = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
2517 ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Res: {S32})
2518 .addUse(RegNo: AllOnes)
2519 .addUse(RegNo: ThreadID)
2520 .getReg(Idx: 0);
2521 if (ST.isWave64()) {
2522 ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_hi, Res: {S32})
2523 .addUse(RegNo: AllOnes)
2524 .addUse(RegNo: ThreadID)
2525 .getReg(Idx: 0);
2526 }
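      // The shift is applied to the high 32-bit word, so the amount is
      // 52 - 32 for wave32 and 51 - 32 for wave64, i.e. 57 - 32 - log2(wavesize).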
2527 Register ShAmt =
2528 B.buildConstant(Res: S32, Val: 57 - 32 - ST.getWavefrontSizeLog2()).getReg(Idx: 0);
2529 Register SrcHi = B.buildShl(Dst: S32, Src0: ThreadID, Src1: ShAmt).getReg(Idx: 0);
2530 Register CvtPtr =
2531 B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, SrcHi}).getReg(Idx: 0);
2532 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2533 // 64-bit hi:lo value.
2534 Register FlatScratchBase =
2535 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {S64},
2536 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2537 .getReg(Idx: 0);
2538 MRI.setRegClass(Reg: FlatScratchBase, RC: &AMDGPU::SReg_64RegClass);
2539 return B.buildPtrAdd(Res: Dst, Op0: CvtPtr, Op1: FlatScratchBase).getReg(Idx: 0);
2540 }
2541
2542 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2543 if (!ApertureReg.isValid())
2544 return false;
2545
2546 // TODO: Should we allow mismatched types but matching sizes in merges to
2547 // avoid the ptrtoint?
2548 return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
2549 };
2550
2551    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2552    // G_ADDRSPACE_CAST we need to guess.
2553 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2554 castLocalOrPrivateToFlat(Dst);
2555 MI.eraseFromParent();
2556 return true;
2557 }
2558
2559 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2560
2561 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2562 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2563
2564 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2565 Op1: SegmentNull.getReg(Idx: 0));
2566
2567 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2568
2569 MI.eraseFromParent();
2570 return true;
2571 }
2572
2573 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2574 SrcTy.getSizeInBits() == 64) {
2575 // Truncate.
2576 B.buildExtract(Res: Dst, Src, Index: 0);
2577 MI.eraseFromParent();
2578 return true;
2579 }
2580
2581 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2582 DstTy.getSizeInBits() == 64) {
2583 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2584 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2585 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2586 if (AddrHiVal == 0) {
2587 auto Zext = B.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: PtrLo);
2588 B.buildIntToPtr(Dst, Src: Zext);
2589 } else {
2590 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2591 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2592 }
2593
2594 MI.eraseFromParent();
2595 return true;
2596 }
2597
2598 // Invalid casts are poison.
2599 // TODO: Should return poison
2600 B.buildUndef(Res: Dst);
2601 MI.eraseFromParent();
2602 return true;
2603}
2604
2605bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2606 MachineRegisterInfo &MRI,
2607 MachineIRBuilder &B) const {
2608 Register Src = MI.getOperand(i: 1).getReg();
2609 LLT Ty = MRI.getType(Reg: Src);
2610 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2611
2612 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2613 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2614
2615 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2616 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2617
2618 // TODO: Should this propagate fast-math-flags?
2619 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2620 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2621
2622 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2623 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2624
2625 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2626 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2627 MI.eraseFromParent();
2628 return true;
2629}
2630
2631bool AMDGPULegalizerInfo::legalizeFceil(
2632 MachineInstr &MI, MachineRegisterInfo &MRI,
2633 MachineIRBuilder &B) const {
2634
2635 const LLT S1 = LLT::scalar(SizeInBits: 1);
2636 const LLT S64 = LLT::scalar(SizeInBits: 64);
2637
2638 Register Src = MI.getOperand(i: 1).getReg();
2639 assert(MRI.getType(Src) == S64);
2640
2641 // result = trunc(src)
2642 // if (src > 0.0 && src != result)
2643 // result += 1.0
2644
2645 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2646
2647 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2648 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2649 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2650 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2651 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2652 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2653
2654 // TODO: Should this propagate fast-math-flags?
2655 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2656 MI.eraseFromParent();
2657 return true;
2658}
2659
2660bool AMDGPULegalizerInfo::legalizeFrem(
2661 MachineInstr &MI, MachineRegisterInfo &MRI,
2662 MachineIRBuilder &B) const {
2663 Register DstReg = MI.getOperand(i: 0).getReg();
2664 Register Src0Reg = MI.getOperand(i: 1).getReg();
2665 Register Src1Reg = MI.getOperand(i: 2).getReg();
2666 auto Flags = MI.getFlags();
2667 LLT Ty = MRI.getType(Reg: DstReg);
2668
2669 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2670 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2671 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2672 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2673 MI.eraseFromParent();
2674 return true;
2675}
2676
2677static MachineInstrBuilder extractF64Exponent(Register Hi,
2678 MachineIRBuilder &B) {
2679 const unsigned FractBits = 52;
2680 const unsigned ExpBits = 11;
2681 LLT S32 = LLT::scalar(SizeInBits: 32);
2682
2683 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2684 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2685
2686 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2687 .addUse(RegNo: Hi)
2688 .addUse(RegNo: Const0.getReg(Idx: 0))
2689 .addUse(RegNo: Const1.getReg(Idx: 0));
2690
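  // The extracted field is the biased exponent (bits [62:52] of the double), so
  // subtract the IEEE-754 double bias of 1023 to get the unbiased exponent.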
2691 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2692}
2693
2694bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2695 MachineInstr &MI, MachineRegisterInfo &MRI,
2696 MachineIRBuilder &B) const {
2697 const LLT S1 = LLT::scalar(SizeInBits: 1);
2698 const LLT S32 = LLT::scalar(SizeInBits: 32);
2699 const LLT S64 = LLT::scalar(SizeInBits: 64);
2700
2701 Register Src = MI.getOperand(i: 1).getReg();
2702 assert(MRI.getType(Src) == S64);
2703
2704 // TODO: Should this use extract since the low half is unused?
2705 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2706 Register Hi = Unmerge.getReg(Idx: 1);
2707
2708 // Extract the upper half, since this is where we will find the sign and
2709 // exponent.
2710 auto Exp = extractF64Exponent(Hi, B);
2711
2712 const unsigned FractBits = 52;
2713
2714 // Extract the sign bit.
2715 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2716 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2717
2718 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2719
2720 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2721
2722 // Extend back to 64-bits.
2723 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2724
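  // FractMask >> Exp has set bits exactly in the fractional positions below the
  // integer part of Src, so ANDing Src with the complement truncates toward
  // zero. The Exp < 0 and Exp > 51 cases are handled by the selects below.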
2725 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2726 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2727 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2728 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2729
2730 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2731 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2732
2733 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2734 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2735 MI.eraseFromParent();
2736 return true;
2737}
2738
2739bool AMDGPULegalizerInfo::legalizeITOFP(
2740 MachineInstr &MI, MachineRegisterInfo &MRI,
2741 MachineIRBuilder &B, bool Signed) const {
2742
2743 Register Dst = MI.getOperand(i: 0).getReg();
2744 Register Src = MI.getOperand(i: 1).getReg();
2745
2746 const LLT S64 = LLT::scalar(SizeInBits: 64);
2747 const LLT S32 = LLT::scalar(SizeInBits: 32);
2748
2749 assert(MRI.getType(Src) == S64);
2750
2751 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2752 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2753
2754 if (MRI.getType(Reg: Dst) == S64) {
2755 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2756 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2757
2758 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2759 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2760
2761 // TODO: Should this propagate fast-math-flags?
2762 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2763 MI.eraseFromParent();
2764 return true;
2765 }
2766
2767 assert(MRI.getType(Dst) == S32);
2768
2769 auto One = B.buildConstant(Res: S32, Val: 1);
2770
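  // For the s32 result, normalize the 64-bit input by shifting out the leading
  // zero (or redundant sign) bits, fold any nonzero shifted-out low bits into a
  // sticky bit, convert the high 32 bits, and rescale with ldexp by 32 - ShAmt.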
2771 MachineInstrBuilder ShAmt;
2772 if (Signed) {
2773 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2774 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2775 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2776 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2777 auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
2778 .addUse(RegNo: Unmerge.getReg(Idx: 1));
2779 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2780 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2781 } else
2782 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2783 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2784 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2785 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2786 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2787 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2788 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2789 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2790 MI.eraseFromParent();
2791 return true;
2792}
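// For illustration only: a scalar sketch (hypothetical helper) of the unsigned
// s64 -> f32 path built above, with __builtin_clz and ldexpf standing in for
// G_CTLZ and G_FLDEXP. The signed path instead derives the shift from
// amdgcn.sffbh and clamps it when the two halves have differing sign bits.
//
//   float u64_to_f32_sketch(uint64_t X) {
//     uint32_t Hi = (uint32_t)(X >> 32);
//     unsigned Sh = Hi ? __builtin_clz(Hi) : 32;     // CTLZ of the high half
//     uint64_t Norm = X << Sh;                       // normalize toward bit 63
//     uint32_t Sticky = (uint32_t)Norm ? 1u : 0u;    // umin(1, lo(Norm)) keeps a sticky bit
//     uint32_t Norm2 = (uint32_t)(Norm >> 32) | Sticky;
//     return ldexpf((float)Norm2, 32 - (int)Sh);     // rescale by the dropped bit count
//   }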
2793
2794// TODO: Copied from DAG implementation. Verify logic and document how this
2795// actually works.
2796bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2797 MachineRegisterInfo &MRI,
2798 MachineIRBuilder &B,
2799 bool Signed) const {
2800
2801 Register Dst = MI.getOperand(i: 0).getReg();
2802 Register Src = MI.getOperand(i: 1).getReg();
2803
2804 const LLT S64 = LLT::scalar(SizeInBits: 64);
2805 const LLT S32 = LLT::scalar(SizeInBits: 32);
2806
2807 const LLT SrcLT = MRI.getType(Reg: Src);
2808 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2809
2810 unsigned Flags = MI.getFlags();
2811
2812 // The basic idea of converting a floating point number into a pair of 32-bit
2813 // integers is illustrated as follows:
2814 //
2815 // tf := trunc(val);
2816 // hif := floor(tf * 2^-32);
2817 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2818 // hi := fptoi(hif);
2819 // lo := fptoi(lof);
2820 //
2821 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2822 MachineInstrBuilder Sign;
2823 if (Signed && SrcLT == S32) {
2824 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2825 // is not enough to hold all the significant bits of `lof` if val is
2826 // negative. To avoid the loss of precision, we need to take the absolute
2827 // value after truncating and flip the result back based on the original
2828 // signedness.
2829 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2830 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2831 }
2832 MachineInstrBuilder K0, K1;
2833 if (SrcLT == S64) {
2834 K0 = B.buildFConstant(
2835 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2836 K1 = B.buildFConstant(
2837 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2838 } else {
2839 K0 = B.buildFConstant(
2840 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2841 K1 = B.buildFConstant(
2842 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2843 }
2844
2845 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2846 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2847 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2848
2849 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2850 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2851 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2852
2853 if (Signed && SrcLT == S32) {
2854 // Flip the result based on the signedness, which is either all 0s or 1s.
2855 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2856 // r := xor({lo, hi}, sign) - sign;
2857 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2858 Src1: Sign);
2859 } else
2860 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2861 MI.eraseFromParent();
2862
2863 return true;
2864}
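// For illustration only: a scalar sketch (hypothetical helper) of the unsigned
// f64 -> s64 split above, assuming the usual <math.h> trunc/floor/fma. The
// signed f32 case additionally works on |trunc(val)| and flips the packed
// result with the broadcast sign bit, as done in the code above.
//
//   uint64_t f64_to_u64_sketch(double Val) {
//     double TF  = trunc(Val);
//     double HiF = floor(TF * 0x1.0p-32);        // upper 32 bits, as an FP value
//     double LoF = fma(HiF, -0x1.0p+32, TF);     // TF - HiF * 2^32, never negative
//     return ((uint64_t)(uint32_t)HiF << 32) | (uint32_t)LoF;
//   }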
2865
2866bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2867 MachineInstr &MI) const {
2868 MachineFunction &MF = Helper.MIRBuilder.getMF();
2869 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2870
2871 // With ieee_mode disabled, the instructions have the correct behavior.
2872 if (!MFI->getMode().IEEE)
2873 return true;
2874
2875 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2876}
2877
2878bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2879 MachineInstr &MI, MachineRegisterInfo &MRI,
2880 MachineIRBuilder &B) const {
2881 // TODO: Should move some of this into LegalizerHelper.
2882
2883 // TODO: Promote dynamic indexing of s16 to s32
2884
2885 Register Dst = MI.getOperand(i: 0).getReg();
2886 Register Vec = MI.getOperand(i: 1).getReg();
2887
2888 LLT VecTy = MRI.getType(Reg: Vec);
2889 LLT EltTy = VecTy.getElementType();
2890 assert(EltTy == MRI.getType(Dst));
2891
2892 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2893 // but we can't go directly to that logic because you can't bitcast a vector
2894 // of pointers to a vector of integers. Therefore, introduce an intermediate
2895 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2896 // drive the legalization forward.
2897 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2898 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2899 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2900
2901 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2902 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2903 B.buildIntToPtr(Dst, Src: IntElt);
2904
2905 MI.eraseFromParent();
2906 return true;
2907 }
2908
2909 // FIXME: Artifact combiner probably should have replaced the truncated
2910 // constant before this, so we shouldn't need
2911 // getIConstantVRegValWithLookThrough.
2912 std::optional<ValueAndVReg> MaybeIdxVal =
2913 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2914 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2915 return true;
2916 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2917
2918 if (IdxVal < VecTy.getNumElements()) {
2919 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2920 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2921 } else {
2922 B.buildUndef(Res: Dst);
2923 }
2924
2925 MI.eraseFromParent();
2926 return true;
2927}
2928
2929bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2930 MachineInstr &MI, MachineRegisterInfo &MRI,
2931 MachineIRBuilder &B) const {
2932 // TODO: Should move some of this into LegalizerHelper.
2933
2934 // TODO: Promote dynamic indexing of s16 to s32
2935
2936 Register Dst = MI.getOperand(i: 0).getReg();
2937 Register Vec = MI.getOperand(i: 1).getReg();
2938 Register Ins = MI.getOperand(i: 2).getReg();
2939
2940 LLT VecTy = MRI.getType(Reg: Vec);
2941 LLT EltTy = VecTy.getElementType();
2942 assert(EltTy == MRI.getType(Ins));
2943
2944 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2945 // but we can't go directly to that logic because you can't bitcast a vector
2946 // of pointers to a vector of integers. Therefore, make the pointer vector
2947 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2948 // new value, and then inttoptr the result vector back. This will then allow
2949 // the rest of legalization to take over.
2950 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2951 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2952 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2953
2954 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2955 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2956 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2957 Idx: MI.getOperand(i: 3));
2958 B.buildIntToPtr(Dst, Src: IntVecDest);
2959 MI.eraseFromParent();
2960 return true;
2961 }
2962
2963 // FIXME: Artifact combiner probably should have replaced the truncated
2964 // constant before this, so we shouldn't need
2965 // getIConstantVRegValWithLookThrough.
2966 std::optional<ValueAndVReg> MaybeIdxVal =
2967 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2968 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2969 return true;
2970
2971 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2972
2973 unsigned NumElts = VecTy.getNumElements();
2974 if (IdxVal < NumElts) {
2975 SmallVector<Register, 8> SrcRegs;
2976 for (unsigned i = 0; i < NumElts; ++i)
2977 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2978 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2979
2980 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2981 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2982 } else {
2983 B.buildUndef(Res: Dst);
2984 }
2985
2986 MI.eraseFromParent();
2987 return true;
2988}
2989
2990bool AMDGPULegalizerInfo::legalizeSinCos(
2991 MachineInstr &MI, MachineRegisterInfo &MRI,
2992 MachineIRBuilder &B) const {
2993
2994 Register DstReg = MI.getOperand(i: 0).getReg();
2995 Register SrcReg = MI.getOperand(i: 1).getReg();
2996 LLT Ty = MRI.getType(Reg: DstReg);
2997 unsigned Flags = MI.getFlags();
2998
2999 Register TrigVal;
3000 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
3001 if (ST.hasTrigReducedRange()) {
3002 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
3003 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
3004 .addUse(RegNo: MulVal.getReg(Idx: 0))
3005 .setMIFlags(Flags)
3006 .getReg(Idx: 0);
3007 } else
3008 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
3009
3010 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3011 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3012 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
3013 .addUse(RegNo: TrigVal)
3014 .setMIFlags(Flags);
3015 MI.eraseFromParent();
3016 return true;
3017}
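// For illustration only: a scalar sketch of the lowering above, assuming the
// hardware sin/cos consume their operand in revolutions (units of 2*pi) and
// that amdgcn.fract(x) == x - floor(x); sinf is used here only to emulate
// v_sin_f32.
//
//   float sin_lowering_sketch(float X) {
//     float R = X * 0.15915494f;       // X * (1 / (2*pi))
//     R = R - floorf(R);               // fract, on subtargets with reduced trig range
//     return sinf(R * 6.2831853f);     // emulate v_sin_f32(R) == sin(2*pi*R)
//   }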
3018
3019bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
3020 MachineIRBuilder &B,
3021 const GlobalValue *GV,
3022 int64_t Offset,
3023 unsigned GAFlags) const {
3024 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3025 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3026 // to the following code sequence:
3027 //
3028 // For constant address space:
3029 // s_getpc_b64 s[0:1]
3030 // s_add_u32 s0, s0, $symbol
3031 // s_addc_u32 s1, s1, 0
3032 //
3033 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3034 // a fixup or relocation is emitted to replace $symbol with a literal
3035 // constant, which is a pc-relative offset from the encoding of the $symbol
3036 // operand to the global variable.
3037 //
3038 // For global address space:
3039 // s_getpc_b64 s[0:1]
3040 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3041 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3042 //
3043 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3044 // fixups or relocations are emitted to replace $symbol@*@lo and
3045 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3046 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3047 // operand to the global variable.
3048
3049 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3050
3051 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3052 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
3053
3054 if (ST.has64BitLiterals()) {
3055 assert(GAFlags != SIInstrInfo::MO_NONE);
3056
3057 MachineInstrBuilder MIB =
3058 B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(RegNo: PCReg);
3059 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 2);
3060 } else {
3061 MachineInstrBuilder MIB =
3062 B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(RegNo: PCReg);
3063
3064 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
3065 if (GAFlags == SIInstrInfo::MO_NONE)
3066 MIB.addImm(Val: 0);
3067 else
3068 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
3069 }
3070
3071 if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
3072 B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);
3073
3074 if (PtrTy.getSizeInBits() == 32)
3075 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
3076 return true;
3077}
3078
3079// Emit an ABS32_LO / ABS32_HI relocation stub.
3080void AMDGPULegalizerInfo::buildAbsGlobalAddress(
3081 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3082 MachineRegisterInfo &MRI) const {
3083 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3084
3085 if (RequiresHighHalf && ST.has64BitLiterals()) {
3086 if (!MRI.getRegClassOrNull(Reg: DstReg))
3087 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_64RegClass);
3088 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
3089 .addDef(RegNo: DstReg)
3090 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS64);
3091 return;
3092 }
3093
3094 LLT S32 = LLT::scalar(SizeInBits: 32);
3095
3096 // Use the destination directly if and only if we only store the lower half
3097 // of the address and no register class has been set on it.
3098 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
3099 ? DstReg
3100 : MRI.createGenericVirtualRegister(Ty: S32);
3101
3102 if (!MRI.getRegClassOrNull(Reg: AddrLo))
3103 MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);
3104
3105 // Write the lower half.
3106 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
3107 .addDef(RegNo: AddrLo)
3108 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
3109
3110 // If required, write the upper half as well.
3111 if (RequiresHighHalf) {
3112 assert(PtrTy.getSizeInBits() == 64 &&
3113 "Must provide a 64-bit pointer type!");
3114
3115 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
3116 MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);
3117
3118 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
3119 .addDef(RegNo: AddrHi)
3120 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);
3121
3122 // Use the destination directly if and only if no register class has been
3123 // set on it.
3124 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
3125 ? DstReg
3126 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
3127
3128 if (!MRI.getRegClassOrNull(Reg: AddrDst))
3129 MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);
3130
3131 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
3132
3133 // If we created a new register for the destination, cast the result into
3134 // the final output.
3135 if (AddrDst != DstReg)
3136 B.buildCast(Dst: DstReg, Src: AddrDst);
3137 } else if (AddrLo != DstReg) {
3138 // If we created a new register for the destination, cast the result into
3139 // the final output.
3140 B.buildCast(Dst: DstReg, Src: AddrLo);
3141 }
3142}
3143
3144bool AMDGPULegalizerInfo::legalizeGlobalValue(
3145 MachineInstr &MI, MachineRegisterInfo &MRI,
3146 MachineIRBuilder &B) const {
3147 Register DstReg = MI.getOperand(i: 0).getReg();
3148 LLT Ty = MRI.getType(Reg: DstReg);
3149 unsigned AS = Ty.getAddressSpace();
3150
3151 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
3152 MachineFunction &MF = B.getMF();
3153 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3154
3155 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
3156 if (!MFI->isModuleEntryFunction() &&
3157 GV->getName() != "llvm.amdgcn.module.lds" &&
3158 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
3159 const Function &Fn = MF.getFunction();
3160 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
3161 Fn, "local memory global used by non-kernel function",
3162 MI.getDebugLoc(), DS_Warning));
3163
3164 // We currently don't have a way to correctly allocate LDS objects that
3165 // aren't directly associated with a kernel. We do force inlining of
3166 // functions that use local objects. However, if these dead functions are
3167 // not eliminated, we don't want a compile time error. Just emit a warning
3168 // and a trap, since there should be no callable path here.
3169 B.buildTrap();
3170 B.buildUndef(Res: DstReg);
3171 MI.eraseFromParent();
3172 return true;
3173 }
3174
3175 // TODO: We could emit code to handle the initialization somewhere.
3176 // We ignore the initializer for now and legalize it to allow selection.
3177 // The initializer will be diagnosed during assembly emission anyway.
3178 const SITargetLowering *TLI = ST.getTargetLowering();
3179 if (!TLI->shouldUseLDSConstAddress(GV)) {
3180 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3181 return true; // Leave in place;
3182 }
3183
3184 const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
3185 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3186 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
3187 // zero-sized type in other languages) to declare dynamic shared memory
3188 // whose size is not known at compile time. It is allocated by the runtime
3189 // and placed directly after the statically allocated objects, and all such
3190 // declarations share the same offset.
3191 if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
3192 // Adjust alignment for that dynamic shared memory array.
3193 MFI->setDynLDSAlign(F: MF.getFunction(), GV: GVar);
3194 LLT S32 = LLT::scalar(SizeInBits: 32);
3195 auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
3196 B.buildIntToPtr(Dst: DstReg, Src: Sz);
3197 MI.eraseFromParent();
3198 return true;
3199 }
3200 }
3201
3202 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(), GV: GVar));
3203 MI.eraseFromParent();
3204 return true;
3205 }
3206
3207 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3208 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
3209 MI.eraseFromParent();
3210 return true;
3211 }
3212
3213 const SITargetLowering *TLI = ST.getTargetLowering();
3214
3215 if (TLI->shouldEmitFixup(GV)) {
3216 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
3217 MI.eraseFromParent();
3218 return true;
3219 }
3220
3221 if (TLI->shouldEmitPCReloc(GV)) {
3222 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
3223 MI.eraseFromParent();
3224 return true;
3225 }
3226
3227 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3228 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
3229
3230 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3231 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3232 PtrInfo: MachinePointerInfo::getGOT(MF),
3233 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3234 MachineMemOperand::MOInvariant,
3235 MemTy: LoadTy, base_alignment: Align(8));
3236
3237 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3238
3239 if (Ty.getSizeInBits() == 32) {
3240 // Truncate if this is a 32-bit constant address.
3241 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3242 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3243 } else
3244 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3245
3246 MI.eraseFromParent();
3247 return true;
3248}
3249
3250static LLT widenToNextPowerOf2(LLT Ty) {
3251 if (Ty.isVector())
3252 return Ty.changeElementCount(
3253 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3254 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3255}
3256
3257bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3258 MachineInstr &MI) const {
3259 MachineIRBuilder &B = Helper.MIRBuilder;
3260 MachineRegisterInfo &MRI = *B.getMRI();
3261 GISelChangeObserver &Observer = Helper.Observer;
3262
3263 Register PtrReg = MI.getOperand(i: 1).getReg();
3264 LLT PtrTy = MRI.getType(Reg: PtrReg);
3265 unsigned AddrSpace = PtrTy.getAddressSpace();
3266
3267 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3268 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3269 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3270 Observer.changingInstr(MI);
3271 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3272 Observer.changedInstr(MI);
3273 return true;
3274 }
3275
3276 if (MI.getOpcode() != AMDGPU::G_LOAD)
3277 return false;
3278
3279 Register ValReg = MI.getOperand(i: 0).getReg();
3280 LLT ValTy = MRI.getType(Reg: ValReg);
3281
3282 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3283 Observer.changingInstr(MI);
3284 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3285 Observer.changedInstr(MI);
3286 return true;
3287 }
3288
3289 MachineMemOperand *MMO = *MI.memoperands_begin();
3290 const unsigned ValSize = ValTy.getSizeInBits();
3291 const LLT MemTy = MMO->getMemoryType();
3292 const Align MemAlign = MMO->getAlign();
3293 const unsigned MemSize = MemTy.getSizeInBits();
3294 const uint64_t AlignInBits = 8 * MemAlign.value();
3295
3296 // Widen non-power-of-2 loads to the alignment if needed
3297 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3298 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3299
3300 // This was already the correct extending load result type, so just adjust
3301 // the memory type.
3302 if (WideMemSize == ValSize) {
3303 MachineFunction &MF = B.getMF();
3304
3305 MachineMemOperand *WideMMO =
3306 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3307 Observer.changingInstr(MI);
3308 MI.setMemRefs(MF, MemRefs: {WideMMO});
3309 Observer.changedInstr(MI);
3310 return true;
3311 }
3312
3313 // Don't bother handling an edge case that should probably never be produced.
3314 if (ValSize > WideMemSize)
3315 return false;
3316
3317 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3318
3319 Register WideLoad;
3320 if (!WideTy.isVector()) {
3321 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3322 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3323 } else {
3324 // Extract the subvector.
3325
3326 if (isRegisterType(ST, Ty: ValTy)) {
3327 // If this is a case where G_EXTRACT is legal, use it.
3328 // (e.g. <3 x s32> -> <4 x s32>)
3329 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3330 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3331 } else {
3332 // For cases where the widened type isn't a nice register value, unmerge
3333 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3334 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3335 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3336 }
3337 }
3338
3339 MI.eraseFromParent();
3340 return true;
3341 }
3342
3343 return false;
3344}
3345
3346bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3347 MachineInstr &MI) const {
3348 MachineIRBuilder &B = Helper.MIRBuilder;
3349 MachineRegisterInfo &MRI = *B.getMRI();
3350 GISelChangeObserver &Observer = Helper.Observer;
3351
3352 Register DataReg = MI.getOperand(i: 0).getReg();
3353 LLT DataTy = MRI.getType(Reg: DataReg);
3354
3355 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3356 Observer.changingInstr(MI);
3357 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3358 Observer.changedInstr(MI);
3359 return true;
3360 }
3361 return false;
3362}
3363
3364bool AMDGPULegalizerInfo::legalizeFMad(
3365 MachineInstr &MI, MachineRegisterInfo &MRI,
3366 MachineIRBuilder &B) const {
3367 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3368 assert(Ty.isScalar());
3369
3370 MachineFunction &MF = B.getMF();
3371 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3372
3373 // TODO: Always legal with future ftz flag.
3374 // FIXME: Is checking just the output denormal mode sufficient?
3375 if (Ty == LLT::float32() &&
3376 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3377 return true;
3378 if (Ty == LLT::float16() &&
3379 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3380 return true;
3381
3382 MachineIRBuilder HelperBuilder(MI);
3383 GISelObserverWrapper DummyObserver;
3384 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3385 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3386}
3387
3388bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3389 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3390 Register DstReg = MI.getOperand(i: 0).getReg();
3391 Register PtrReg = MI.getOperand(i: 1).getReg();
3392 Register CmpVal = MI.getOperand(i: 2).getReg();
3393 Register NewVal = MI.getOperand(i: 3).getReg();
3394
3395 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3396 "this should not have been custom lowered");
3397
3398 LLT ValTy = MRI.getType(Reg: CmpVal);
3399 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3400
3401 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3402
3403 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3404 .addDef(RegNo: DstReg)
3405 .addUse(RegNo: PtrReg)
3406 .addUse(RegNo: PackedVal)
3407 .setMemRefs(MI.memoperands());
3408
3409 MI.eraseFromParent();
3410 return true;
3411}
3412
3413/// Return true if it's known that \p Src can never be an f32 denormal value.
3414static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3415 Register Src) {
3416 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3417 switch (DefMI->getOpcode()) {
3418 case TargetOpcode::G_INTRINSIC: {
3419 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3420 case Intrinsic::amdgcn_frexp_mant:
3421 case Intrinsic::amdgcn_log:
3422 case Intrinsic::amdgcn_log_clamp:
3423 case Intrinsic::amdgcn_exp2:
3424 case Intrinsic::amdgcn_sqrt:
3425 return true;
3426 default:
3427 break;
3428 }
3429
3430 break;
3431 }
3432 case TargetOpcode::G_FSQRT:
3433 return true;
3434 case TargetOpcode::G_FFREXP: {
3435 if (DefMI->getOperand(i: 0).getReg() == Src)
3436 return true;
3437 break;
3438 }
3439 case TargetOpcode::G_FPEXT: {
3440 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3441 }
3442 default:
3443 return false;
3444 }
3445
3446 return false;
3447}
3448
3449static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3450 return Flags & MachineInstr::FmAfn;
3451}
3452
3453static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3454 unsigned Flags) {
3455 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3456 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3457 DenormalMode::PreserveSign;
3458}
3459
3460std::pair<Register, Register>
3461AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3462 unsigned Flags) const {
3463 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3464 return {};
3465
3466 const LLT F32 = LLT::scalar(SizeInBits: 32);
3467 auto SmallestNormal = B.buildFConstant(
3468 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3469 auto IsLtSmallestNormal =
3470 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3471
3472 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3473 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3474 auto ScaleFactor =
3475 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3476 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3477
3478 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3479}
3480
3481bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3482 MachineIRBuilder &B) const {
3483 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3484 // If we have to handle denormals, scale up the input and adjust the result.
3485
3486 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3487 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3488
3489 Register Dst = MI.getOperand(i: 0).getReg();
3490 Register Src = MI.getOperand(i: 1).getReg();
3491 LLT Ty = B.getMRI()->getType(Reg: Dst);
3492 unsigned Flags = MI.getFlags();
3493
3494 if (Ty == LLT::scalar(SizeInBits: 16)) {
3495 const LLT F32 = LLT::scalar(SizeInBits: 32);
3496 // Nothing in half is a denormal when promoted to f32.
3497 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3498 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
3499 .addUse(RegNo: Ext.getReg(Idx: 0))
3500 .setMIFlags(Flags);
3501 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3502 MI.eraseFromParent();
3503 return true;
3504 }
3505
3506 assert(Ty == LLT::scalar(32));
3507
3508 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3509 if (!ScaledInput) {
3510 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
3511 .addUse(RegNo: Src)
3512 .setMIFlags(Flags);
3513 MI.eraseFromParent();
3514 return true;
3515 }
3516
3517 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3518 .addUse(RegNo: ScaledInput)
3519 .setMIFlags(Flags);
3520
3521 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3522 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3523 auto ResultOffset =
3524 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3525 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3526
3527 MI.eraseFromParent();
3528 return true;
3529}
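// For illustration only: a scalar sketch of the f32 path above when denormal
// inputs must be handled, with log2f standing in for the amdgcn.log intrinsic.
//
//   float log2_lowering_sketch(float X) {
//     bool Scaled = X < 0x1.0p-126f;             // below the smallest normal
//     float In = Scaled ? X * 0x1.0p+32f : X;    // scale denormals into normal range
//     float L = log2f(In);                       // v_log_f32
//     return Scaled ? L - 32.0f : L;             // log2(X * 2^32) - 32 == log2(X)
//   }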
3530
3531static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3532 Register Z, unsigned Flags) {
3533 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3534 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3535}
3536
3537bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3538 MachineIRBuilder &B) const {
3539 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3540 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3541
3542 MachineRegisterInfo &MRI = *B.getMRI();
3543 Register Dst = MI.getOperand(i: 0).getReg();
3544 Register X = MI.getOperand(i: 1).getReg();
3545 unsigned Flags = MI.getFlags();
3546 const LLT Ty = MRI.getType(Reg: X);
3547 MachineFunction &MF = B.getMF();
3548
3549 const LLT F32 = LLT::scalar(SizeInBits: 32);
3550 const LLT F16 = LLT::scalar(SizeInBits: 16);
3551
3552 const AMDGPUTargetMachine &TM =
3553 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3554
3555 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn)) {
3556 if (Ty == F16 && !ST.has16BitInsts()) {
3557 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3558 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3559 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3560 B.buildFPTrunc(Res: Dst, Op: LogVal);
3561 } else {
3562 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3563 }
3564
3565 MI.eraseFromParent();
3566 return true;
3567 }
3568
3569 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3570 if (ScaledInput)
3571 X = ScaledInput;
3572
3573 auto Y =
3574 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);
3575
3576 Register R;
3577 if (ST.hasFastFMAF32()) {
3578 // c + cc is ln(2)/ln(10) to more than 49 bits
3579 const float c_log10 = 0x1.344134p-2f;
3580 const float cc_log10 = 0x1.09f79ep-26f;
3581
3582 // c + cc is ln(2) to more than 49 bits
3583 const float c_log = 0x1.62e42ep-1f;
3584 const float cc_log = 0x1.efa39ep-25f;
3585
3586 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3587 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3588 // Contracting these correction terms may increase the error of the
3589 // approximation, so disable contraction here.
3590 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3591 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags: NewFlags).getReg(Idx: 0);
3592 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags: NewFlags);
3593 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags: NewFlags);
3594 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags: NewFlags);
3595 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags: NewFlags).getReg(Idx: 0);
3596 } else {
3597 // ch+ct is ln(2)/ln(10) to more than 36 bits
3598 const float ch_log10 = 0x1.344000p-2f;
3599 const float ct_log10 = 0x1.3509f6p-18f;
3600
3601 // ch + ct is ln(2) to more than 36 bits
3602 const float ch_log = 0x1.62e000p-1f;
3603 const float ct_log = 0x1.0bfbe8p-15f;
3604
3605 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3606 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3607
3608 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3609 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3610 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3611 // Contracting these correction terms may increase the error of the
3612 // approximation, so disable contraction here.
3613 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3614 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags: NewFlags);
3615
3616 Register Mad0 =
3617 getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags: NewFlags);
3618 Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags: NewFlags);
3619 R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags: NewFlags);
3620 }
3621
3622 const bool IsFiniteOnly =
3623 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3624 MI.getFlag(Flag: MachineInstr::FmNoInfs);
3625
3626 if (!IsFiniteOnly) {
3627 // Expand isfinite(x) => fabs(x) < inf
3628 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3629 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3630 auto IsFinite =
3631 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3632 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
3633 }
3634
3635 if (ScaledInput) {
3636 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3637 auto ShiftK =
3638 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3639 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3640 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3641 } else {
3642 B.buildCopy(Res: Dst, Op: R);
3643 }
3644
3645 MI.eraseFromParent();
3646 return true;
3647}
3648
3649bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3650 Register Src, bool IsLog10,
3651 unsigned Flags) const {
3652 const double Log2BaseInverted =
3653 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3654
3655 LLT Ty = B.getMRI()->getType(Reg: Dst);
3656
3657 if (Ty == LLT::scalar(SizeInBits: 32)) {
3658 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3659 if (ScaledInput) {
3660 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3661 .addUse(RegNo: Src)
3662 .setMIFlags(Flags);
3663 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3664 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3665 auto ResultOffset =
3666 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3667 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3668
3669 if (ST.hasFastFMAF32())
3670 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3671 else {
3672 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3673 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3674 }
3675
3676 return true;
3677 }
3678 }
3679
3680 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3681 ? B.buildFLog2(Dst: Ty, Src, Flags)
3682 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3683 .addUse(RegNo: Src)
3684 .setMIFlags(Flags);
3685 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3686 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3687 return true;
3688}
3689
3690bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3691 MachineIRBuilder &B) const {
3692 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3693 // If we have to handle denormals, scale up the input and adjust the result.
3694
3695 Register Dst = MI.getOperand(i: 0).getReg();
3696 Register Src = MI.getOperand(i: 1).getReg();
3697 unsigned Flags = MI.getFlags();
3698 LLT Ty = B.getMRI()->getType(Reg: Dst);
3699 const LLT F16 = LLT::scalar(SizeInBits: 16);
3700 const LLT F32 = LLT::scalar(SizeInBits: 32);
3701
3702 if (Ty == F16) {
3703 // Nothing in half is a denormal when promoted to f32.
3704 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3705 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
3706 .addUse(RegNo: Ext.getReg(Idx: 0))
3707 .setMIFlags(Flags);
3708 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3709 MI.eraseFromParent();
3710 return true;
3711 }
3712
3713 assert(Ty == F32);
3714
3715 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3716 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3717 .addUse(RegNo: Src)
3718 .setMIFlags(Flags);
3719 MI.eraseFromParent();
3720 return true;
3721 }
3722
3723 // bool needs_scaling = x < -0x1.f80000p+6f;
3724 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3725
3726 // -nextafter(128.0, -1)
3727 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3728 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3729 Op1: RangeCheckConst, Flags);
3730
3731 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3732 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3733 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3734 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3735
3736 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3737 .addUse(RegNo: AddInput.getReg(Idx: 0))
3738 .setMIFlags(Flags);
3739
3740 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3741 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3742 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3743 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3744 MI.eraseFromParent();
3745 return true;
3746}
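// For illustration only: a scalar sketch of the denormal-aware f32 path above,
// with exp2f standing in for the amdgcn.exp2 intrinsic.
//
//   float exp2_lowering_sketch(float X) {
//     bool S = X < -0x1.f80000p+6f;              // result would land in the denormal range
//     float R = exp2f(X + (S ? 0x1.0p+6f : 0.0f));
//     return S ? R * 0x1.0p-64f : R;             // undo the 2^64 pre-scale
//   }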
3747
3748static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3749 const SrcOp &Src, unsigned Flags) {
3750 LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
3751
3752 if (Ty == LLT::scalar(SizeInBits: 32)) {
3753 return B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Dst})
3754 .addUse(RegNo: Src.getReg())
3755 .setMIFlags(Flags);
3756 }
3757 return B.buildFExp2(Dst, Src, Flags);
3758}
3759
3760bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3761 Register Dst, Register X,
3762 unsigned Flags,
3763 bool IsExp10) const {
3764 LLT Ty = B.getMRI()->getType(Reg: X);
3765
3766 // exp(x) -> exp2(M_LOG2E_F * x);
3767 // exp10(x) -> exp2(log2(10) * x);
3768 auto Const = B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3769 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Const, Flags);
3770 buildExp(B, Dst, Src: Mul, Flags);
3771 return true;
3772}
3773
3774bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3775 Register X, unsigned Flags) const {
3776 LLT Ty = B.getMRI()->getType(Reg: Dst);
3777 LLT F32 = LLT::scalar(SizeInBits: 32);
3778
3779 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3780 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3781 }
3782
3783 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3784 auto NeedsScaling =
3785 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3786 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3787 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3788 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3789
3790 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3791 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3792
3793 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3794 .addUse(RegNo: ExpInput.getReg(Idx: 0))
3795 .setMIFlags(Flags);
3796
3797 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3798 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3799 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3800 return true;
3801}
3802
3803bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
3804 Register Dst, Register X,
3805 unsigned Flags) const {
3806 LLT Ty = B.getMRI()->getType(Reg: Dst);
3807 LLT F32 = LLT::scalar(SizeInBits: 32);
3808
3809 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3810 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3811 auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
3812 auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);
3813
3814 auto Mul1 = B.buildFMul(Dst: Ty, Src0: X, Src1: K1, Flags);
3815 auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
3816 auto Mul0 = B.buildFMul(Dst: Ty, Src0: X, Src1: K0, Flags);
3817 auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
3818 B.buildFMul(Dst, Src0: Exp2_0, Src1: Exp2_1, Flags);
3819 return true;
3820 }
3821
3822 // bool s = x < -0x1.2f7030p+5f;
3823 // x += s ? 0x1.0p+5f : 0.0f;
3824 // exp10 = exp2(x * 0x1.a92000p+1f) *
3825 // exp2(x * 0x1.4f0978p-11f) *
3826 // (s ? 0x1.9f623ep-107f : 1.0f);
3827
3828 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.2f7030p+5f);
3829 auto NeedsScaling =
3830 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold);
3831
3832 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+5f);
3833 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3834 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X);
3835
3836 auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
3837 auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);
3838
3839 auto Mul1 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K1, Flags);
3840 auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
3841 auto Mul0 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K0, Flags);
3842 auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
3843
3844 auto MulExps = B.buildFMul(Dst: Ty, Src0: Exp2_0, Src1: Exp2_1, Flags);
3845 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.9f623ep-107f);
3846 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: MulExps, Src1: ResultScaleFactor, Flags);
3847
3848 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: MulExps);
3849 return true;
3850}
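// For illustration only: why the split constants above work. K0 + K1 is
// log2(10) split into a high part with zeroed low mantissa bits plus a small
// correction, so exp2(x*K0) * exp2(x*K1) == exp2(x*(K0+K1)) ~= 10^x while each
// product keeps more effective precision than a single multiply by log2(10).
//
//   float exp10_unsafe_sketch(float X) {
//     const float K0 = 0x1.a92000p+1f;           // high bits of log2(10)
//     const float K1 = 0x1.4f0978p-11f;          // low-order correction
//     return exp2f(X * K0) * exp2f(X * K1);
//   }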
3851
3852bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3853 MachineIRBuilder &B) const {
3854 Register Dst = MI.getOperand(i: 0).getReg();
3855 Register X = MI.getOperand(i: 1).getReg();
3856 const unsigned Flags = MI.getFlags();
3857 MachineFunction &MF = B.getMF();
3858 MachineRegisterInfo &MRI = *B.getMRI();
3859 LLT Ty = MRI.getType(Reg: Dst);
3860 const LLT F16 = LLT::scalar(SizeInBits: 16);
3861 const LLT F32 = LLT::scalar(SizeInBits: 32);
3862 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3863
3864 if (Ty == F16) {
3865 // v_exp_f16 (fmul x, log2e)
3866 if (allowApproxFunc(MF, Flags)) {
3867 // TODO: Does this really require fast?
3868 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3869 : legalizeFExpUnsafe(B, Dst, X, Flags);
3870 MI.eraseFromParent();
3871 return true;
3872 }
3873
3874 // Nothing in half is a denormal when promoted to f32.
3875 //
3876 // exp(f16 x) ->
3877 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3878 //
3879 // exp10(f16 x) ->
3880 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3881 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3882 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3883 legalizeFExpUnsafeImpl(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags, IsExp10);
3884 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3885 MI.eraseFromParent();
3886 return true;
3887 }
3888
3889 assert(Ty == F32);
3890
3891 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3892 // library behavior. Also, is known-not-daz source sufficient?
3893 if (allowApproxFunc(MF, Flags)) {
3894 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3895 : legalizeFExpUnsafe(B, Dst, X, Flags);
3896 MI.eraseFromParent();
3897 return true;
3898 }
3899
3900 // Algorithm:
3901 //
3902 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3903 //
3904 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3905 // n = 64*m + j, 0 <= j < 64
3906 //
3907 // e^x = 2^((64*m + j + f)/64)
3908 // = (2^m) * (2^(j/64)) * 2^(f/64)
3909 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3910 //
3911 // f = x*(64/ln(2)) - n
3912 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3913 //
3914 // e^x = (2^m) * (2^(j/64)) * e^r
3915 //
3916 // (2^(j/64)) is precomputed
3917 //
3918 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3919 // e^r = 1 + q
3920 //
3921 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3922 //
3923 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3924 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3925 Register PH, PL;
3926
3927 if (ST.hasFastFMAF32()) {
3928 const float c_exp = numbers::log2ef;
3929 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3930 const float c_exp10 = 0x1.a934f0p+1f;
3931 const float cc_exp10 = 0x1.2f346ep-24f;
3932
3933 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3934 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3935 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3936 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3937
3938 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3939 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3940 } else {
3941 const float ch_exp = 0x1.714000p+0f;
3942 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3943
3944 const float ch_exp10 = 0x1.a92000p+1f;
3945 const float cl_exp10 = 0x1.4f0978p-11f;
3946
3947 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3948 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3949 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3950
3951 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3952 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3953
3954 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3955 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3956
3957 Register Mad0 =
3958 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3959 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3960 }
3961
3962 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3963
3964 // It is unsafe to contract this fsub into the PH multiply.
3965 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3966 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3967 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3968
3969 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3970 .addUse(RegNo: A.getReg(Idx: 0))
3971 .setMIFlags(Flags);
3972 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3973
3974 auto UnderflowCheckConst =
3975 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3976 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3977 auto Underflow =
3978 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3979
3980 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3981
3982 if (!(Flags & MachineInstr::FmNoInfs)) {
3983 auto OverflowCheckConst =
3984 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3985
3986 auto Overflow =
3987 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3988 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3989 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3990 }
3991
3992 B.buildCopy(Res: Dst, Op: R);
3993 MI.eraseFromParent();
3994 return true;
3995}
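// For illustration only: a simplified scalar sketch of the generic f32 path
// above, keeping only the dominant term (the real expansion also carries the
// low-order PL correction and the overflow/underflow selects).
//
//   float exp_lowering_sketch(float X) {
//     const float C = 0x1.715476p+0f;            // log2(e), i.e. numbers::log2ef
//     float PH = X * C;
//     float E = nearbyintf(PH);                  // G_INTRINSIC_ROUNDEVEN
//     float A = PH - E;                          // must not be contracted into the fmul
//     return ldexpf(exp2f(A), (int)E);           // exp2 of the fraction, scaled by 2^E
//   }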
3996
3997bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3998 MachineIRBuilder &B) const {
3999 Register Dst = MI.getOperand(i: 0).getReg();
4000 Register Src0 = MI.getOperand(i: 1).getReg();
4001 Register Src1 = MI.getOperand(i: 2).getReg();
4002 unsigned Flags = MI.getFlags();
4003 LLT Ty = B.getMRI()->getType(Reg: Dst);
4004 const LLT F16 = LLT::float16();
4005 const LLT F32 = LLT::float32();
4006
4007 if (Ty == F32) {
4008 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
4009 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4010 .addUse(RegNo: Log.getReg(Idx: 0))
4011 .addUse(RegNo: Src1)
4012 .setMIFlags(Flags);
4013 B.buildFExp2(Dst, Src: Mul, Flags);
4014 } else if (Ty == F16) {
4015 // There's no f16 fmul_legacy, so we need to convert for it.
4016 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
4017 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
4018 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
4019 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4020 .addUse(RegNo: Ext0.getReg(Idx: 0))
4021 .addUse(RegNo: Ext1.getReg(Idx: 0))
4022 .setMIFlags(Flags);
4023 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
4024 } else
4025 return false;
4026
4027 MI.eraseFromParent();
4028 return true;
4029}
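// For illustration only: a scalar sketch of the f32 expansion above. The
// legacy multiply's "0 * anything == 0" rule (modeled here with an explicit
// check) is what makes pow(1.0f, y) == 1.0f even for infinite or NaN y.
//
//   float pow_lowering_sketch(float X, float Y) {
//     float L = log2f(X);
//     float M = (L == 0.0f || Y == 0.0f) ? 0.0f : L * Y;  // v_mul_legacy_f32
//     return exp2f(M);
//   }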
4030
4031// Find a source register, ignoring any possible source modifiers.
4032static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4033 Register ModSrc = OrigSrc;
4034 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
4035 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
4036 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4037 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4038 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4039 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4040 return ModSrc;
4041}
4042
4043bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4044 MachineRegisterInfo &MRI,
4045 MachineIRBuilder &B) const {
4046
4047 const LLT S1 = LLT::scalar(SizeInBits: 1);
4048 const LLT F64 = LLT::float64();
4049 Register Dst = MI.getOperand(i: 0).getReg();
4050 Register OrigSrc = MI.getOperand(i: 1).getReg();
4051 unsigned Flags = MI.getFlags();
4052 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4053 "this should not have been custom lowered");
4054
4055 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4056 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4057 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4058 // V_FRACT bug is:
4059 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4060 //
4061 // Convert floor(x) to (x - fract(x))
4062
4063 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
4064 .addUse(RegNo: OrigSrc)
4065 .setMIFlags(Flags);
4066
4067 // Give source modifier matching some assistance before obscuring a foldable
4068 // pattern.
4069
4070 // TODO: We can avoid the neg on the fract? The input sign to fract
4071 // shouldn't matter?
4072 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4073
4074 auto Const =
4075 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
4076
4077 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
4078
4079 // We don't need to concern ourselves with the snan handling difference, so
4080 // use the one which will directly select.
4081 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4082 if (MFI->getMode().IEEE)
4083 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
4084 else
4085 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
4086
4087 Register CorrectedFract = Min;
4088 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
4089 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
4090 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
4091 }
4092
4093 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
4094 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
4095
4096 MI.eraseFromParent();
4097 return true;
4098}
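// For illustration only: a scalar sketch of the floor(f64) expansion above,
// with (x - floor(x)) standing in for v_fract_f64 and the clamp constant being
// the largest double below 1.0, as used for the V_FRACT workaround.
//
//   double floor_lowering_sketch(double X) {
//     double F = X - floor(X);                   // what v_fract_f64 computes
//     F = fmin(F, 0x1.fffffffffffffp-1);         // min(fract(X), 0.99999999999999999)
//     if (X != X)                                // NaN input: propagate X itself
//       F = X;
//     return X - F;                              // floor(X) == X - fract(X)
//   }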
4099
4100// Turn an illegal packed v2s16 build vector into bit operations.
4101// TODO: This should probably be a bitcast action in LegalizerHelper.
4102bool AMDGPULegalizerInfo::legalizeBuildVector(
4103 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4104 Register Dst = MI.getOperand(i: 0).getReg();
4105 const LLT S32 = LLT::scalar(SizeInBits: 32);
4106 const LLT S16 = LLT::scalar(SizeInBits: 16);
4107 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4108
4109 Register Src0 = MI.getOperand(i: 1).getReg();
4110 Register Src1 = MI.getOperand(i: 2).getReg();
4111
4112 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4113 assert(MRI.getType(Src0) == S32);
4114 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
4115 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
4116 }
4117
4118 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
4119 B.buildBitcast(Dst, Src: Merge);
4120
4121 MI.eraseFromParent();
4122 return true;
4123}
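// For illustration only: the packing that the merge + bitcast above amounts
// to, with element 0 in the low 16 bits of the 32-bit register.
//
//   uint32_t build_v2i16_sketch(uint16_t Elt0, uint16_t Elt1) {
//     return (uint32_t)Elt0 | ((uint32_t)Elt1 << 16);   // reinterpreted as <2 x s16>
//   }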
4124
4125// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4126//
4127// Source and accumulation registers must all be 32 bits wide.
4128//
4129// TODO: When the multiply is uniform, we should produce a code sequence
4130// that is better suited to instruction selection on the SALU. Instead of
4131// the outer loop going over parts of the result, the outer loop should go
4132// over parts of one of the factors. This should result in instruction
4133// selection that makes full use of S_ADDC_U32 instructions.
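//
// For illustration only: a 32-bit-limb schoolbook sketch (hypothetical helper,
// not part of this file) of the truncated multiply that the MAD_64_32 chains
// below compute. Accum[k] gathers every partial product Src0[i] * Src1[j] with
// i + j == k plus the carries from lower columns; for a multiply-add, Accum
// would start out holding the addend instead of zero.
//
//   void mul_sketch(uint32_t *Accum, const uint32_t *A, const uint32_t *B,
//                   unsigned N) {
//     for (unsigned K = 0; K != N; ++K)
//       Accum[K] = 0;
//     for (unsigned I = 0; I != N; ++I) {
//       uint64_t Carry = 0;
//       for (unsigned J = 0; I + J < N; ++J) {
//         uint64_t T = (uint64_t)A[I] * B[J] + Accum[I + J] + Carry; // one MAD step
//         Accum[I + J] = (uint32_t)T;
//         Carry = T >> 32;
//       }
//     }
//   }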
4134void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4135 MutableArrayRef<Register> Accum,
4136 ArrayRef<Register> Src0,
4137 ArrayRef<Register> Src1,
4138 bool UsePartialMad64_32,
4139 bool SeparateOddAlignedProducts) const {
4140 // Use (possibly empty) vectors of S1 registers to represent the set of
4141 // carries from one pair of positions to the next.
4142 using Carry = SmallVector<Register, 2>;
4143
4144 MachineIRBuilder &B = Helper.MIRBuilder;
4145 GISelValueTracking &VT = *Helper.getValueTracking();
4146
4147 const LLT S1 = LLT::scalar(SizeInBits: 1);
4148 const LLT S32 = LLT::scalar(SizeInBits: 32);
4149 const LLT S64 = LLT::scalar(SizeInBits: 64);
4150
4151 Register Zero32;
4152 Register Zero64;
4153
4154 auto getZero32 = [&]() -> Register {
4155 if (!Zero32)
4156 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
4157 return Zero32;
4158 };
4159 auto getZero64 = [&]() -> Register {
4160 if (!Zero64)
4161 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
4162 return Zero64;
4163 };
4164
4165 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4166 for (unsigned i = 0; i < Src0.size(); ++i) {
4167 Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
4168 Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
4169 }
4170
4171 // Merge the given carries into the 32-bit LocalAccum, which is modified
4172 // in-place.
4173 //
4174 // Returns the carry-out, which is a single S1 register or null.
4175 auto mergeCarry =
4176 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4177 if (CarryIn.empty())
4178 return Register();
4179
4180 bool HaveCarryOut = true;
4181 Register CarryAccum;
4182 if (CarryIn.size() == 1) {
4183 if (!LocalAccum) {
4184 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
4185 return Register();
4186 }
4187
4188 CarryAccum = getZero32();
4189 } else {
4190 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
4191 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4192 CarryAccum =
4193 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
4194 .getReg(Idx: 0);
4195 }
4196
4197 if (!LocalAccum) {
4198 LocalAccum = getZero32();
4199 HaveCarryOut = false;
4200 }
4201 }
4202
4203 auto Add =
4204 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
4205 LocalAccum = Add.getReg(Idx: 0);
4206 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
4207 };
4208
4209 // Build a multiply-add chain to compute
4210 //
4211 // LocalAccum + (partial products at DstIndex)
4212 // + (opportunistic subset of CarryIn)
4213 //
4214 // LocalAccum is an array of one or two 32-bit registers that are updated
4215 // in-place. The incoming registers may be null.
4216 //
4217 // In some edge cases, carry-ins can be consumed "for free". In that case,
4218 // the consumed carry bits are removed from CarryIn in-place.
4219 auto buildMadChain =
4220 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4221 -> Carry {
4222 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4223 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4224
4225 Carry CarryOut;
4226 unsigned j0 = 0;
4227
4228 // Use plain 32-bit multiplication for the most significant part of the
4229 // result by default.
4230 if (LocalAccum.size() == 1 &&
4231 (!UsePartialMad64_32 || !CarryIn.empty())) {
4232 do {
4233 // Skip multiplication if one of the operands is 0
4234 unsigned j1 = DstIndex - j0;
4235 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4236 ++j0;
4237 continue;
4238 }
4239 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
4240 if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
4241 LocalAccum[0] = Mul.getReg(Idx: 0);
4242 } else {
4243 if (CarryIn.empty()) {
4244 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
4245 } else {
4246 LocalAccum[0] =
4247 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
4248 .getReg(Idx: 0);
4249 CarryIn.pop_back();
4250 }
4251 }
4252 ++j0;
4253 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4254 }
4255
4256 // Build full 64-bit multiplies.
4257 if (j0 <= DstIndex) {
4258 bool HaveSmallAccum = false;
4259 Register Tmp;
4260
4261 if (LocalAccum[0]) {
4262 if (LocalAccum.size() == 1) {
4263 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4264 HaveSmallAccum = true;
4265 } else if (LocalAccum[1]) {
4266 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
4267 HaveSmallAccum = false;
4268 } else {
4269 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4270 HaveSmallAccum = true;
4271 }
4272 } else {
4273 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4274 Tmp = getZero64();
4275 HaveSmallAccum = true;
4276 }
4277
4278 do {
4279 unsigned j1 = DstIndex - j0;
4280 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4281 ++j0;
4282 continue;
4283 }
4284 auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
4285 SrcOps: {Src0[j0], Src1[j1], Tmp});
4286 Tmp = Mad.getReg(Idx: 0);
4287 if (!HaveSmallAccum)
4288 CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
4289 HaveSmallAccum = false;
4290
4291 ++j0;
4292 } while (j0 <= DstIndex);
4293
4294 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
4295 LocalAccum[0] = Unmerge.getReg(Idx: 0);
4296 if (LocalAccum.size() > 1)
4297 LocalAccum[1] = Unmerge.getReg(Idx: 1);
4298 }
4299
4300 return CarryOut;
4301 };
4302
4303 // Outer multiply loop, iterating over destination parts from least
4304 // significant to most significant parts.
4305 //
4306 // The columns of the following diagram correspond to the destination parts
4307 // affected by one iteration of the outer loop (ignoring boundary
4308 // conditions).
4309 //
4310 // Dest index relative to 2 * i: 1 0 -1
4311 // ------
4312 // Carries from previous iteration: e o
4313 // Even-aligned partial product sum: E E .
4314 // Odd-aligned partial product sum: O O
4315 //
4316 // 'o' is OddCarry, 'e' is EvenCarry.
4317 // EE and OO are computed from partial products via buildMadChain and use
4318 // accumulation where possible and appropriate.
4319 //
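// For example, with Accum.size() == 4 (a 128-bit result), iteration i == 1
// runs the even-aligned chain at destination index 2 (updating Accum[2..3])
// and, in the non-separated case, the odd-aligned chain at destination
// index 1 (updating Accum[1..2]).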
4320 Register SeparateOddCarry;
4321 Carry EvenCarry;
4322 Carry OddCarry;
4323
4324 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4325 Carry OddCarryIn = std::move(OddCarry);
4326 Carry EvenCarryIn = std::move(EvenCarry);
4327 OddCarry.clear();
4328 EvenCarry.clear();
4329
4330 // Partial products at offset 2 * i.
4331 if (2 * i < Accum.size()) {
4332 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4333 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4334 }
4335
4336 // Partial products at offset 2 * i - 1.
4337 if (i > 0) {
4338 if (!SeparateOddAlignedProducts) {
4339 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4340 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4341 } else {
4342 bool IsHighest = 2 * i >= Accum.size();
4343 Register SeparateOddOut[2];
4344 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4345 .take_front(N: IsHighest ? 1 : 2);
4346 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4347
4348 MachineInstr *Lo;
4349
4350 if (i == 1) {
4351 if (!IsHighest)
4352 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4353 else
4354 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4355 } else {
4356 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4357 CarryIn: SeparateOddCarry);
4358 }
4359 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4360
4361 if (!IsHighest) {
4362 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4363 CarryIn: Lo->getOperand(i: 1).getReg());
4364 Accum[2 * i] = Hi.getReg(Idx: 0);
4365 SeparateOddCarry = Hi.getReg(Idx: 1);
4366 }
4367 }
4368 }
4369
4370 // Add in the carries from the previous iteration
4371 if (i > 0) {
4372 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4373 EvenCarryIn.push_back(Elt: CarryOut);
4374
4375 if (2 * i < Accum.size()) {
4376 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4377 OddCarry.push_back(Elt: CarryOut);
4378 }
4379 }
4380 }
4381}
4382
4383// Custom narrowing of wide multiplies using wide multiply-add instructions.
4384//
4385// TODO: If the multiply is followed by an addition, we should attempt to
4386// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4387bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4388 MachineInstr &MI) const {
4389 assert(ST.hasMad64_32());
4390 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4391
4392 MachineIRBuilder &B = Helper.MIRBuilder;
4393 MachineRegisterInfo &MRI = *B.getMRI();
4394
4395 Register DstReg = MI.getOperand(i: 0).getReg();
4396 Register Src0 = MI.getOperand(i: 1).getReg();
4397 Register Src1 = MI.getOperand(i: 2).getReg();
4398
4399 LLT Ty = MRI.getType(Reg: DstReg);
4400 assert(Ty.isScalar());
4401
4402 unsigned Size = Ty.getSizeInBits();
4403 if (ST.hasVectorMulU64() && Size == 64)
4404 return true;
4405
4406 unsigned NumParts = Size / 32;
4407 assert((Size % 32) == 0);
4408 assert(NumParts >= 2);
4409
4410 // Whether to use MAD_64_32 for partial products whose high half is
4411 // discarded. This avoids some ADD instructions but risks false dependency
4412 // stalls on some subtargets in some cases.
4413 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4414
4415 // Whether to compute odd-aligned partial products separately. This is
4416 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4417 // in an even-aligned VGPR.
4418 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4419
4420 LLT S32 = LLT::scalar(SizeInBits: 32);
4421 SmallVector<Register, 2> Src0Parts, Src1Parts;
4422 for (unsigned i = 0; i < NumParts; ++i) {
4423 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4424 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4425 }
4426 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4427 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4428
4429 SmallVector<Register, 2> AccumRegs(NumParts);
4430 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4431 SeparateOddAlignedProducts);
4432
4433 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4434 MI.eraseFromParent();
4435 return true;
4436}
4437
4438// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4439// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4440// case with a single min instruction instead of a compare+select.
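// For example, ffbh returns -1 (0xffffffff) for a zero input, so
// umin(0xffffffff, 32) yields the expected ctlz(0) == 32.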
4441bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4442 MachineRegisterInfo &MRI,
4443 MachineIRBuilder &B) const {
4444 Register Dst = MI.getOperand(i: 0).getReg();
4445 Register Src = MI.getOperand(i: 1).getReg();
4446 LLT DstTy = MRI.getType(Reg: Dst);
4447 LLT SrcTy = MRI.getType(Reg: Src);
4448
4449 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4450 ? AMDGPU::G_AMDGPU_FFBH_U32
4451 : AMDGPU::G_AMDGPU_FFBL_B32;
4452 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4453 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4454
4455 MI.eraseFromParent();
4456 return true;
4457}
4458
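// Legalize a narrower-than-32-bit G_CTLZ_ZERO_UNDEF by any-extending the
// source to 32 bits and shifting it into the high bits, so that FFBH_U32 on
// the widened value counts the same number of leading zeros as the original.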
4459bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4460 MachineRegisterInfo &MRI,
4461 MachineIRBuilder &B) const {
4462 Register Dst = MI.getOperand(i: 0).getReg();
4463 Register Src = MI.getOperand(i: 1).getReg();
4464 LLT SrcTy = MRI.getType(Reg: Src);
4465 TypeSize NumBits = SrcTy.getSizeInBits();
4466
4467 assert(NumBits < 32u);
4468
4469 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4470 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4471 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4472 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4473 B.buildTrunc(Res: Dst, Op: Ctlz);
4474 MI.eraseFromParent();
4475 return true;
4476}
4477
4478// Check that this is a G_XOR x, -1
4479static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4480 if (MI.getOpcode() != TargetOpcode::G_XOR)
4481 return false;
4482 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4483 return ConstVal == -1;
4484}
4485
4486 // Return the branch instruction that uses the condition output of the
// intrinsic, or null if the usage is invalid.
4487static MachineInstr *
4488verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4489 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4490 Register CondDef = MI.getOperand(i: 0).getReg();
4491 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4492 return nullptr;
4493
4494 MachineBasicBlock *Parent = MI.getParent();
4495 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4496
4497 if (isNot(MRI, MI: *UseMI)) {
4498 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4499 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4500 return nullptr;
4501
4502 // We're deleting the def of this value, so we need to remove it.
4503 eraseInstr(MI&: *UseMI, MRI);
4504
4505 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4506 Negated = true;
4507 }
4508
4509 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4510 return nullptr;
4511
4512 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4513 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4514 if (Next == Parent->end()) {
4515 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4516 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4517 return nullptr;
4518 UncondBrTarget = &*NextMBB;
4519 } else {
4520 if (Next->getOpcode() != AMDGPU::G_BR)
4521 return nullptr;
4522 Br = &*Next;
4523 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4524 }
4525
4526 return UseMI;
4527}
4528
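// Copy a preloaded argument from its incoming physical register into DstReg.
// For packed arguments (e.g. packed workitem or workgroup IDs), the value is
// shifted and masked out of the live-in register first.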
4529void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4530 MachineIRBuilder &B,
4531 const ArgDescriptor *Arg,
4532 const TargetRegisterClass *ArgRC,
4533 LLT ArgTy) const {
4534 MCRegister SrcReg = Arg->getRegister();
4535 assert(SrcReg.isPhysical() && "Physical register expected");
4536 assert(DstReg.isVirtual() && "Virtual register expected");
4537
4538 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4539 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4540 if (Arg->isMasked()) {
4541 // TODO: Should we try to emit this once in the entry block?
4542 const LLT S32 = LLT::scalar(SizeInBits: 32);
4543 const unsigned Mask = Arg->getMask();
4544 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4545
4546 Register AndMaskSrc = LiveIn;
4547
4548 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4549 // 0.
4550 if (Shift != 0) {
4551 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4552 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4553 }
4554
4555 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4556 } else {
4557 B.buildCopy(Res: DstReg, Op: LiveIn);
4558 }
4559}
4560
4561bool AMDGPULegalizerInfo::legalizeWorkGroupId(
4562 MachineInstr &MI, MachineIRBuilder &B,
4563 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
4564 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
4565 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4566 Register DstReg = MI.getOperand(i: 0).getReg();
4567 if (!ST.hasClusters()) {
4568 if (!loadInputValue(DstReg, B, ArgType: WorkGroupIdPV))
4569 return false;
4570 MI.eraseFromParent();
4571 return true;
4572 }
4573
4574 // Clusters are supported. Return the global position in the grid. If clusters
4575 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
4576
4577 // WorkGroupIdXYZ = ClusterId == 0 ?
4578 // ClusterIdXYZ :
4579 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4580 MachineRegisterInfo &MRI = *B.getMRI();
4581 const LLT S32 = LLT::scalar(SizeInBits: 32);
4582 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
4583 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
4584 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
4585 if (!loadInputValue(DstReg: ClusterIdXYZ, B, ArgType: WorkGroupIdPV) ||
4586 !loadInputValue(DstReg: ClusterWorkGroupIdXYZ, B, ArgType: ClusterWorkGroupIdPV) ||
4587 !loadInputValue(DstReg: ClusterMaxIdXYZ, B, ArgType: ClusterMaxIdPV))
4588 return false;
4589
4590 auto One = B.buildConstant(Res: S32, Val: 1);
4591 auto ClusterSizeXYZ = B.buildAdd(Dst: S32, Src0: ClusterMaxIdXYZ, Src1: One);
4592 auto GlobalIdXYZ = B.buildAdd(Dst: S32, Src0: ClusterWorkGroupIdXYZ,
4593 Src1: B.buildMul(Dst: S32, Src0: ClusterIdXYZ, Src1: ClusterSizeXYZ));
4594
4595 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4596
4597 switch (MFI->getClusterDims().getKind()) {
4598 case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
4599 case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
4600 B.buildCopy(Res: DstReg, Op: GlobalIdXYZ);
4601 MI.eraseFromParent();
4602 return true;
4603 }
4604 case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
4605 B.buildCopy(Res: DstReg, Op: ClusterIdXYZ);
4606 MI.eraseFromParent();
4607 return true;
4608 }
4609 case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
4610 using namespace AMDGPU::Hwreg;
4611 unsigned ClusterIdField = HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4);
4612 Register ClusterId = MRI.createGenericVirtualRegister(Ty: S32);
4613 MRI.setRegClass(Reg: ClusterId, RC: &AMDGPU::SReg_32RegClass);
4614 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
4615 .addDef(RegNo: ClusterId)
4616 .addImm(Val: ClusterIdField);
4617 auto Zero = B.buildConstant(Res: S32, Val: 0);
4618 auto NoClusters =
4619 B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: ClusterId, Op1: Zero);
4620 B.buildSelect(Res: DstReg, Tst: NoClusters, Op0: ClusterIdXYZ, Op1: GlobalIdXYZ);
4621 MI.eraseFromParent();
4622 return true;
4623 }
4624 }
4625
4626 llvm_unreachable("nothing should reach here");
4627}
4628
4629bool AMDGPULegalizerInfo::loadInputValue(
4630 Register DstReg, MachineIRBuilder &B,
4631 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4632 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4633 const ArgDescriptor *Arg = nullptr;
4634 const TargetRegisterClass *ArgRC;
4635 LLT ArgTy;
4636
4637 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4638 const ArgDescriptor WorkGroupIDX =
4639 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
4640 // If GridZ is not programmed in an entry function then the hardware will set
4641 // it to all zeros, so there is no need to mask the GridY value in the low
4642 // order bits.
4643 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4644 Reg: AMDGPU::TTMP7,
4645 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4646 const ArgDescriptor WorkGroupIDZ =
4647 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
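// With clusters, the per-cluster workgroup IDs and maximum IDs are packed as
// 4-bit fields within TTMP6, as described by the masks below.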
4648 const ArgDescriptor ClusterWorkGroupIDX =
4649 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
4650 const ArgDescriptor ClusterWorkGroupIDY =
4651 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
4652 const ArgDescriptor ClusterWorkGroupIDZ =
4653 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
4654 const ArgDescriptor ClusterWorkGroupMaxIDX =
4655 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
4656 const ArgDescriptor ClusterWorkGroupMaxIDY =
4657 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
4658 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4659 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
4660 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4661 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);
4662
4663 auto LoadConstant = [&](unsigned N) {
4664 B.buildConstant(Res: DstReg, Val: N);
4665 return true;
4666 };
4667
4668 if (ST.hasArchitectedSGPRs() &&
4669 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4670 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4671 bool HasFixedDims = ClusterDims.isFixedDims();
4672
4673 switch (ArgType) {
4674 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4675 Arg = &WorkGroupIDX;
4676 ArgRC = &AMDGPU::SReg_32RegClass;
4677 ArgTy = LLT::scalar(SizeInBits: 32);
4678 break;
4679 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4680 Arg = &WorkGroupIDY;
4681 ArgRC = &AMDGPU::SReg_32RegClass;
4682 ArgTy = LLT::scalar(SizeInBits: 32);
4683 break;
4684 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4685 Arg = &WorkGroupIDZ;
4686 ArgRC = &AMDGPU::SReg_32RegClass;
4687 ArgTy = LLT::scalar(SizeInBits: 32);
4688 break;
4689 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
4690 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4691 return LoadConstant(0);
4692 Arg = &ClusterWorkGroupIDX;
4693 ArgRC = &AMDGPU::SReg_32RegClass;
4694 ArgTy = LLT::scalar(SizeInBits: 32);
4695 break;
4696 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
4697 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4698 return LoadConstant(0);
4699 Arg = &ClusterWorkGroupIDY;
4700 ArgRC = &AMDGPU::SReg_32RegClass;
4701 ArgTy = LLT::scalar(SizeInBits: 32);
4702 break;
4703 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
4704 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4705 return LoadConstant(0);
4706 Arg = &ClusterWorkGroupIDZ;
4707 ArgRC = &AMDGPU::SReg_32RegClass;
4708 ArgTy = LLT::scalar(SizeInBits: 32);
4709 break;
4710 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
4711 if (HasFixedDims)
4712 return LoadConstant(ClusterDims.getDims()[0] - 1);
4713 Arg = &ClusterWorkGroupMaxIDX;
4714 ArgRC = &AMDGPU::SReg_32RegClass;
4715 ArgTy = LLT::scalar(SizeInBits: 32);
4716 break;
4717 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
4718 if (HasFixedDims)
4719 return LoadConstant(ClusterDims.getDims()[1] - 1);
4720 Arg = &ClusterWorkGroupMaxIDY;
4721 ArgRC = &AMDGPU::SReg_32RegClass;
4722 ArgTy = LLT::scalar(SizeInBits: 32);
4723 break;
4724 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
4725 if (HasFixedDims)
4726 return LoadConstant(ClusterDims.getDims()[2] - 1);
4727 Arg = &ClusterWorkGroupMaxIDZ;
4728 ArgRC = &AMDGPU::SReg_32RegClass;
4729 ArgTy = LLT::scalar(SizeInBits: 32);
4730 break;
4731 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
4732 Arg = &ClusterWorkGroupMaxFlatID;
4733 ArgRC = &AMDGPU::SReg_32RegClass;
4734 ArgTy = LLT::scalar(SizeInBits: 32);
4735 break;
4736 default:
4737 break;
4738 }
4739 }
4740
4741 if (!Arg)
4742 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4743
4744 if (!Arg) {
4745 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4746 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4747 // which case the pointer argument may be missing and we use null.
4748 return LoadConstant(0);
4749 }
4750
4751 // It's undefined behavior if a function marked with the amdgpu-no-*
4752 // attributes uses the corresponding intrinsic.
4753 B.buildUndef(Res: DstReg);
4754 return true;
4755 }
4756
4757 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4758 return false; // TODO: Handle these
4759 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4760 return true;
4761}
4762
4763bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4764 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4765 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4766 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4767 return false;
4768
4769 MI.eraseFromParent();
4770 return true;
4771}
4772
4773static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4774 int64_t C) {
4775 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4776 MI.eraseFromParent();
4777 return true;
4778}
4779
4780bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4781 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4782 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4783 unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
4784 if (MaxID == 0)
4785 return replaceWithConstant(B, MI, C: 0);
4786
4787 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4788 const ArgDescriptor *Arg;
4789 const TargetRegisterClass *ArgRC;
4790 LLT ArgTy;
4791 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4792
4793 Register DstReg = MI.getOperand(i: 0).getReg();
4794 if (!Arg) {
4795 // It's undefined behavior if a function marked with the amdgpu-no-*
4796 // attributes uses the corresponding intrinsic.
4797 B.buildUndef(Res: DstReg);
4798 MI.eraseFromParent();
4799 return true;
4800 }
4801
4802 if (Arg->isMasked()) {
4803 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4804 // masking operations anyway.
4805 //
4806 // TODO: We could assert the top bit is 0 for the source copy.
4807 if (!loadInputValue(DstReg, B, ArgType))
4808 return false;
4809 } else {
4810 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4811 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4812 return false;
4813 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4814 }
4815
4816 MI.eraseFromParent();
4817 return true;
4818}
4819
4820MachinePointerInfo
4821AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
4822 // This isn't really a constant pool but close enough.
4823 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
4824 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
4825 return PtrInfo;
4826}
4827
4828Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4829 int64_t Offset) const {
4830 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4831 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4832
4833 // TODO: If we passed in the base kernel offset we could have a better
4834 // alignment than 4, but we don't really need it.
4835 if (!loadInputValue(DstReg: KernArgReg, B,
4836 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4837 llvm_unreachable("failed to find kernarg segment ptr");
4838
4839 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4840 return B.buildObjectPtrOffset(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4841}
4842
4843/// Legalize a value that's loaded from kernel arguments. This is only used by
4844/// legacy intrinsics.
4845bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4846 MachineIRBuilder &B,
4847 uint64_t Offset,
4848 Align Alignment) const {
4849 Register DstReg = MI.getOperand(i: 0).getReg();
4850
4851 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4852 "unexpected kernarg parameter type");
4853
4854 Register Ptr = getKernargParameterPtr(B, Offset);
4855 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
4856 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment: Align(4),
4857 MMOFlags: MachineMemOperand::MODereferenceable |
4858 MachineMemOperand::MOInvariant);
4859 MI.eraseFromParent();
4860 return true;
4861}
4862
4863bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4864 MachineRegisterInfo &MRI,
4865 MachineIRBuilder &B) const {
4866 Register Dst = MI.getOperand(i: 0).getReg();
4867 LLT DstTy = MRI.getType(Reg: Dst);
4868 LLT S16 = LLT::scalar(SizeInBits: 16);
4869 LLT S32 = LLT::scalar(SizeInBits: 32);
4870 LLT S64 = LLT::scalar(SizeInBits: 64);
4871
4872 if (DstTy == S16)
4873 return legalizeFDIV16(MI, MRI, B);
4874 if (DstTy == S32)
4875 return legalizeFDIV32(MI, MRI, B);
4876 if (DstTy == S64)
4877 return legalizeFDIV64(MI, MRI, B);
4878
4879 return false;
4880}
4881
4882void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4883 Register DstDivReg,
4884 Register DstRemReg,
4885 Register X,
4886 Register Y) const {
4887 const LLT S1 = LLT::scalar(SizeInBits: 1);
4888 const LLT S32 = LLT::scalar(SizeInBits: 32);
4889
4890 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4891 // algorithm used here.
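//
// In short: compute a 32-bit fixed-point reciprocal Z ~= 2^32 / Y from the
// hardware rcp (scaled by 0x4f7ffffe, a float just below 2^32), refine it
// with one Newton-Raphson style step (Z += umulh(Z, -Y * Z)), take
// Q = umulh(X, Z) as a quotient estimate, and correct the small remaining
// underestimate with the two conditional refinement steps below (which only
// ever adjust upward).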
4892
4893 // Initial estimate of inv(y).
4894 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4895 auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
4896 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4897 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4898 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4899
4900 // One round of UNR.
4901 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4902 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4903 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4904
4905 // Quotient/remainder estimate.
4906 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4907 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4908
4909 // First quotient/remainder refinement.
4910 auto One = B.buildConstant(Res: S32, Val: 1);
4911 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4912 if (DstDivReg)
4913 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4914 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4915
4916 // Second quotient/remainder refinement.
4917 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4918 if (DstDivReg)
4919 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4920
4921 if (DstRemReg)
4922 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4923}
4924
4925// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4926//
4927// Return lo, hi of result
4928//
4929// %cvt.lo = G_UITOFP Val.lo
4930// %cvt.hi = G_UITOFP Val.hi
4931// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4932// %rcp = G_AMDGPU_RCP_IFLAG %mad
4933// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4934// %mul2 = G_FMUL %mul1, 2**(-32)
4935// %trunc = G_INTRINSIC_TRUNC %mul2
4936// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4937// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
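//
// In other words, the pair approximates 2**64 / Val as a 64-bit fixed-point
// value: 0x5f7ffffc is a float just below 2**64, and the trunc/mad pair
// splits the scaled reciprocal into its high and low 32-bit halves.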
4938static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4939 Register Val) {
4940 const LLT S32 = LLT::scalar(SizeInBits: 32);
4941 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4942
4943 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4944 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4945
4946 auto Mad = B.buildFMAD(
4947 Dst: S32, Src0: CvtHi, // 2**32
4948 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4949
4950 auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
4951 auto Mul1 = B.buildFMul(
4952 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4953
4954 // 2**(-32)
4955 auto Mul2 = B.buildFMul(
4956 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4957 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4958
4959 // -(2**32)
4960 auto Mad2 = B.buildFMAD(
4961 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4962 Src2: Mul1);
4963
4964 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4965 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4966
4967 return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
4968}
4969
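// Expand 64-bit unsigned division/remainder in 32-bit pieces: start from the
// fixed-point reciprocal estimate produced by emitReciprocalU64, refine it
// with two Newton-Raphson style rounds, form a quotient estimate with a
// 64-bit umulh, and then apply up to two conditional corrections to the
// quotient and remainder.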
4970void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4971 Register DstDivReg,
4972 Register DstRemReg,
4973 Register Numer,
4974 Register Denom) const {
4975 const LLT S32 = LLT::scalar(SizeInBits: 32);
4976 const LLT S64 = LLT::scalar(SizeInBits: 64);
4977 const LLT S1 = LLT::scalar(SizeInBits: 1);
4978 Register RcpLo, RcpHi;
4979
4980 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4981
4982 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4983
4984 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4985 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4986
4987 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4988 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4989
4990 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4991 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4992 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4993
4994 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4995 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4996 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4997
4998 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4999 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
5000 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
5001 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
5002 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
5003
5004 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
5005 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
5006 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
5007 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
5008
5009 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
5010 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
5011 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
5012
5013 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
5014 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
5015 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
5016 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
5017 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
5018 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
5019 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
5020 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
5021 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
5022
5023 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
5024 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
5025 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
5026
5027 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
5028 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
5029
5030 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
5031 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
5032
5033 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
5034 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
5035
5036 // TODO: Here and below, portions of the code could be enclosed in if/endif
5037 // blocks. Currently the control flow is unconditional, and we emit 4 selects
5038 // after the potential endif to substitute for PHIs.
5039
5040 // if C3 != 0 ...
5041 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
5042 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
5043 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
5044 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
5045
5046 auto One64 = B.buildConstant(Res: S64, Val: 1);
5047 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
5048
5049 auto C4 =
5050 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
5051 auto C5 =
5052 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
5053 auto C6 = B.buildSelect(
5054 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
5055
5056 // if (C6 != 0)
5057 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
5058 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
5059
5060 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
5061 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
5062 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
5063
5064 // endif C6
5065 // endif C3
5066
5067 if (DstDivReg) {
5068 auto Sel1 = B.buildSelect(
5069 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
5070 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
5071 Op0: Sel1, Op1: MulHi3);
5072 }
5073
5074 if (DstRemReg) {
5075 auto Sel2 = B.buildSelect(
5076 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
5077 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
5078 Op0: Sel2, Op1: Sub1);
5079 }
5080}
5081
5082bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5083 MachineRegisterInfo &MRI,
5084 MachineIRBuilder &B) const {
5085 Register DstDivReg, DstRemReg;
5086 switch (MI.getOpcode()) {
5087 default:
5088 llvm_unreachable("Unexpected opcode!");
5089 case AMDGPU::G_UDIV: {
5090 DstDivReg = MI.getOperand(i: 0).getReg();
5091 break;
5092 }
5093 case AMDGPU::G_UREM: {
5094 DstRemReg = MI.getOperand(i: 0).getReg();
5095 break;
5096 }
5097 case AMDGPU::G_UDIVREM: {
5098 DstDivReg = MI.getOperand(i: 0).getReg();
5099 DstRemReg = MI.getOperand(i: 1).getReg();
5100 break;
5101 }
5102 }
5103
5104 const LLT S64 = LLT::scalar(SizeInBits: 64);
5105 const LLT S32 = LLT::scalar(SizeInBits: 32);
5106 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5107 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
5108 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5109 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5110
5111 if (Ty == S32)
5112 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
5113 else if (Ty == S64)
5114 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
5115 else
5116 return false;
5117
5118 MI.eraseFromParent();
5119 return true;
5120}
5121
5122bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5123 MachineRegisterInfo &MRI,
5124 MachineIRBuilder &B) const {
5125 const LLT S64 = LLT::scalar(SizeInBits: 64);
5126 const LLT S32 = LLT::scalar(SizeInBits: 32);
5127
5128 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5129 if (Ty != S32 && Ty != S64)
5130 return false;
5131
5132 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5133 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
5134 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5135
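// Reduce to the unsigned case: take absolute values via (x + sign) ^ sign,
// where sign = x >> (bits - 1), then fix up the result signs afterwards
// (quotient sign = sign(LHS) ^ sign(RHS), remainder sign = sign(LHS)).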
5136 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
5137 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
5138 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
5139
5140 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
5141 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
5142
5143 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
5144 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
5145
5146 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5147 switch (MI.getOpcode()) {
5148 default:
5149 llvm_unreachable("Unexpected opcode!");
5150 case AMDGPU::G_SDIV: {
5151 DstDivReg = MI.getOperand(i: 0).getReg();
5152 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5153 break;
5154 }
5155 case AMDGPU::G_SREM: {
5156 DstRemReg = MI.getOperand(i: 0).getReg();
5157 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5158 break;
5159 }
5160 case AMDGPU::G_SDIVREM: {
5161 DstDivReg = MI.getOperand(i: 0).getReg();
5162 DstRemReg = MI.getOperand(i: 1).getReg();
5163 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5164 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5165 break;
5166 }
5167 }
5168
5169 if (Ty == S32)
5170 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
5171 else
5172 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
5173
5174 if (DstDivReg) {
5175 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
5176 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
5177 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
5178 }
5179
5180 if (DstRemReg) {
5181 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
5182 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
5183 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
5184 }
5185
5186 MI.eraseFromParent();
5187 return true;
5188}
5189
5190bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5191 MachineRegisterInfo &MRI,
5192 MachineIRBuilder &B) const {
5193 Register Res = MI.getOperand(i: 0).getReg();
5194 Register LHS = MI.getOperand(i: 1).getReg();
5195 Register RHS = MI.getOperand(i: 2).getReg();
5196 uint16_t Flags = MI.getFlags();
5197 LLT ResTy = MRI.getType(Reg: Res);
5198
5199 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);
5200
5201 if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
5202 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
5203 return false;
5204
5205 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5206 // the CI documentation they have a worst case error of 1 ulp.
5207 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5208 // use them as long as we aren't trying to use denormals.
5209 //
5210 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
5211
5212 // 1 / x -> RCP(x)
5213 if (CLHS->isExactlyValue(V: 1.0)) {
5214 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
5215 .addUse(RegNo: RHS)
5216 .setMIFlags(Flags);
5217
5218 MI.eraseFromParent();
5219 return true;
5220 }
5221
5222 // -1 / x -> RCP( FNEG(x) )
5223 if (CLHS->isExactlyValue(V: -1.0)) {
5224 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
5225 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
5226 .addUse(RegNo: FNeg.getReg(Idx: 0))
5227 .setMIFlags(Flags);
5228
5229 MI.eraseFromParent();
5230 return true;
5231 }
5232 }
5233
5234 // For f16 require afn or arcp.
5235 // For f32 require afn.
5236 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
5237 !MI.getFlag(Flag: MachineInstr::FmArcp)))
5238 return false;
5239
5240 // x / y -> x * (1.0 / y)
5241 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
5242 .addUse(RegNo: RHS)
5243 .setMIFlags(Flags);
5244 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
5245
5246 MI.eraseFromParent();
5247 return true;
5248}
5249
5250bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5251 MachineRegisterInfo &MRI,
5252 MachineIRBuilder &B) const {
5253 Register Res = MI.getOperand(i: 0).getReg();
5254 Register X = MI.getOperand(i: 1).getReg();
5255 Register Y = MI.getOperand(i: 2).getReg();
5256 uint16_t Flags = MI.getFlags();
5257 LLT ResTy = MRI.getType(Reg: Res);
5258
5259 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);
5260
5261 if (!AllowInaccurateRcp)
5262 return false;
5263
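// x / y ~= x * (1 / y): compute r ~= 1/y with the rcp intrinsic, run two
// Newton-Raphson refinements (r <- r + r * (1 - y * r)), multiply by x, and
// apply one final residual correction ret + r * (x - y * ret).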
5264 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
5265 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
5266
5267 auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
5268 .addUse(RegNo: Y)
5269 .setMIFlags(Flags);
5270
5271 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
5272 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
5273
5274 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
5275 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
5276
5277 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
5278 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
5279
5280 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
5281 MI.eraseFromParent();
5282 return true;
5283}
5284
5285bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5286 MachineRegisterInfo &MRI,
5287 MachineIRBuilder &B) const {
5288 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5289 return true;
5290
5291 Register Res = MI.getOperand(i: 0).getReg();
5292 Register LHS = MI.getOperand(i: 1).getReg();
5293 Register RHS = MI.getOperand(i: 2).getReg();
5294
5295 uint16_t Flags = MI.getFlags();
5296
5297 LLT S16 = LLT::scalar(SizeInBits: 16);
5298 LLT S32 = LLT::scalar(SizeInBits: 32);
5299
5300 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5301 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5302 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5303 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5304 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5305 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = q + err * rcp
5306 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5307 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5308 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5309 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5310 // q16.u = opx(V_CVT_F16_F32, q32.u);
5311 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
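//
// The 0xff800000 mask in the V_AND_B32 step keeps only the sign and exponent
// fields of the f32 correction term (clearing its mantissa) before it is
// added back into the quotient.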
5312
5313 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
5314 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
5315 auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
5316 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5317 .addUse(RegNo: RHSExt.getReg(Idx: 0))
5318 .setMIFlags(Flags);
5319 auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
5320 MachineInstrBuilder Err;
5321 if (ST.hasMadMacF32Insts()) {
5322 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5323 Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
5324 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5325 } else {
5326 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5327 Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
5328 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5329 }
5330 auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
5331 Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
5332 Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
5333 auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
5334 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5335 .addUse(RegNo: RDst.getReg(Idx: 0))
5336 .addUse(RegNo: RHS)
5337 .addUse(RegNo: LHS)
5338 .setMIFlags(Flags);
5339
5340 MI.eraseFromParent();
5341 return true;
5342}
5343
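// The two-bit FP32 denormal-mode field of the MODE hardware register
// (offset 4, width 2, i.e. bits [5:4]).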
5344static constexpr unsigned SPDenormModeBitField =
5345 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
5346
5347 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5348 // to enable denorm mode; when 'Enable' is false, restore the function's default FP32 denorm mode.
5349static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5350 const GCNSubtarget &ST,
5351 SIModeRegisterDefaults Mode) {
5352 // Set SP denorm mode to this value.
5353 unsigned SPDenormMode =
5354 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5355
5356 if (ST.hasDenormModeInst()) {
5357 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5358 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5359
5360 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5361 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
5362 .addImm(Val: NewDenormModeValue);
5363
5364 } else {
5365 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
5366 .addImm(Val: SPDenormMode)
5367 .addImm(Val: SPDenormModeBitField);
5368 }
5369}
5370
5371bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5372 MachineRegisterInfo &MRI,
5373 MachineIRBuilder &B) const {
5374 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5375 return true;
5376
5377 Register Res = MI.getOperand(i: 0).getReg();
5378 Register LHS = MI.getOperand(i: 1).getReg();
5379 Register RHS = MI.getOperand(i: 2).getReg();
5380 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5381 SIModeRegisterDefaults Mode = MFI->getMode();
5382
5383 uint16_t Flags = MI.getFlags();
5384
5385 LLT S32 = LLT::scalar(SizeInBits: 32);
5386 LLT S1 = LLT::scalar(SizeInBits: 1);
5387
5388 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
5389
5390 auto DenominatorScaled =
5391 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5392 .addUse(RegNo: LHS)
5393 .addUse(RegNo: RHS)
5394 .addImm(Val: 0)
5395 .setMIFlags(Flags);
5396 auto NumeratorScaled =
5397 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5398 .addUse(RegNo: LHS)
5399 .addUse(RegNo: RHS)
5400 .addImm(Val: 1)
5401 .setMIFlags(Flags);
5402
5403 auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5404 .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
5405 .setMIFlags(Flags);
5406 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
5407
5408 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5409 const bool HasDynamicDenormals =
5410 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5411 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5412
5413 Register SavedSPDenormMode;
5414 if (!PreservesDenormals) {
5415 if (HasDynamicDenormals) {
5416 SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5417 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
5418 .addDef(RegNo: SavedSPDenormMode)
5419 .addImm(Val: SPDenormModeBitField);
5420 }
5421 toggleSPDenormMode(Enable: true, B, ST, Mode);
5422 }
5423
5424 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
5425 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
5426 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
5427 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
5428 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
5429 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
5430
5431 if (!PreservesDenormals) {
5432 if (HasDynamicDenormals) {
5433 assert(SavedSPDenormMode);
5434 B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
5435 .addReg(RegNo: SavedSPDenormMode)
5436 .addImm(Val: SPDenormModeBitField);
5437 } else
5438 toggleSPDenormMode(Enable: false, B, ST, Mode);
5439 }
5440
5441 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
5442 .addUse(RegNo: Fma4.getReg(Idx: 0))
5443 .addUse(RegNo: Fma1.getReg(Idx: 0))
5444 .addUse(RegNo: Fma3.getReg(Idx: 0))
5445 .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
5446 .setMIFlags(Flags);
5447
5448 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5449 .addUse(RegNo: Fmas.getReg(Idx: 0))
5450 .addUse(RegNo: RHS)
5451 .addUse(RegNo: LHS)
5452 .setMIFlags(Flags);
5453
5454 MI.eraseFromParent();
5455 return true;
5456}
5457
5458bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5459 MachineRegisterInfo &MRI,
5460 MachineIRBuilder &B) const {
5461 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5462 return true;
5463
5464 Register Res = MI.getOperand(i: 0).getReg();
5465 Register LHS = MI.getOperand(i: 1).getReg();
5466 Register RHS = MI.getOperand(i: 2).getReg();
5467
5468 uint16_t Flags = MI.getFlags();
5469
5470 LLT S64 = LLT::scalar(SizeInBits: 64);
5471 LLT S1 = LLT::scalar(SizeInBits: 1);
5472
5473 auto One = B.buildFConstant(Res: S64, Val: 1.0);
5474
5475 auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5476 .addUse(RegNo: LHS)
5477 .addUse(RegNo: RHS)
5478 .addImm(Val: 0)
5479 .setMIFlags(Flags);
5480
5481 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);
5482
5483 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
5484 .addUse(RegNo: DivScale0.getReg(Idx: 0))
5485 .setMIFlags(Flags);
5486
5487 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
5488 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
5489 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
5490
5491 auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5492 .addUse(RegNo: LHS)
5493 .addUse(RegNo: RHS)
5494 .addImm(Val: 1)
5495 .setMIFlags(Flags);
5496
5497 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5498 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
5499 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);
5500
5501 Register Scale;
5502 if (!ST.hasUsableDivScaleConditionOutput()) {
5503 // Work around a hardware bug on SI where the condition output from div_scale
5504 // is not usable.
5505
5506 LLT S32 = LLT::scalar(SizeInBits: 32);
5507
5508 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5509 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5510 auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
5511 auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);
5512
5513 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5514 Op1: Scale1Unmerge.getReg(Idx: 1));
5515 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5516 Op1: Scale0Unmerge.getReg(Idx: 1));
5517 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
5518 } else {
5519 Scale = DivScale1.getReg(Idx: 1);
5520 }
5521
5522 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
5523 .addUse(RegNo: Fma4.getReg(Idx: 0))
5524 .addUse(RegNo: Fma3.getReg(Idx: 0))
5525 .addUse(RegNo: Mul.getReg(Idx: 0))
5526 .addUse(RegNo: Scale)
5527 .setMIFlags(Flags);
5528
5529 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
5530 .addUse(RegNo: Fmas.getReg(Idx: 0))
5531 .addUse(RegNo: RHS)
5532 .addUse(RegNo: LHS)
5533 .setMIFlags(Flags);
5534
5535 MI.eraseFromParent();
5536 return true;
5537}
5538
5539bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5540 MachineRegisterInfo &MRI,
5541 MachineIRBuilder &B) const {
5542 Register Res0 = MI.getOperand(i: 0).getReg();
5543 Register Res1 = MI.getOperand(i: 1).getReg();
5544 Register Val = MI.getOperand(i: 2).getReg();
5545 uint16_t Flags = MI.getFlags();
5546
5547 LLT Ty = MRI.getType(Reg: Res0);
5548 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5549
5550 auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
5551 .addUse(RegNo: Val)
5552 .setMIFlags(Flags);
5553 auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
5554 .addUse(RegNo: Val)
5555 .setMIFlags(Flags);
5556
5557 if (ST.hasFractBug()) {
5558 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5559 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5560 auto IsFinite =
5561 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5562 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5563 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5564 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5565 }
5566
5567 B.buildCopy(Res: Res0, Op: Mant);
5568 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5569
5570 MI.eraseFromParent();
5571 return true;
5572}
5573
5574bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5575 MachineRegisterInfo &MRI,
5576 MachineIRBuilder &B) const {
5577 Register Res = MI.getOperand(i: 0).getReg();
5578 Register LHS = MI.getOperand(i: 2).getReg();
5579 Register RHS = MI.getOperand(i: 3).getReg();
5580 uint16_t Flags = MI.getFlags();
5581
5582 LLT S32 = LLT::scalar(SizeInBits: 32);
5583 LLT S1 = LLT::scalar(SizeInBits: 1);
5584
5585 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5586 const APFloat C0Val(1.0f);
5587
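// If |den| is very large (> 2^96), pre-scale the denominator by 2^-32 before
// taking the reciprocal and apply the same scale factor to the final product,
// keeping the intermediate values in a safe range.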
5588 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5589 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5590 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5591
5592 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5593 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5594
5595 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5596
5597 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5598 .addUse(RegNo: Mul0.getReg(Idx: 0))
5599 .setMIFlags(Flags);
5600
5601 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5602
5603 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5604
5605 MI.eraseFromParent();
5606 return true;
5607}
5608
5609bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5610 MachineRegisterInfo &MRI,
5611 MachineIRBuilder &B) const {
5612 // Bypass the correct expansion that a standard promotion through G_FSQRT
5613 // would produce. The f32 op is accurate enough for the f16 case.
5614 unsigned Flags = MI.getFlags();
5615 assert(!ST.has16BitInsts());
5616 const LLT F32 = LLT::scalar(SizeInBits: 32);
5617 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5618 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5619 .addUse(RegNo: Ext.getReg(Idx: 0))
5620 .setMIFlags(Flags);
5621 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5622 MI.eraseFromParent();
5623 return true;
5624}
5625
5626bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5627 MachineRegisterInfo &MRI,
5628 MachineIRBuilder &B) const {
5629 MachineFunction &MF = B.getMF();
5630 Register Dst = MI.getOperand(i: 0).getReg();
5631 Register X = MI.getOperand(i: 1).getReg();
5632 const unsigned Flags = MI.getFlags();
5633 const LLT S1 = LLT::scalar(SizeInBits: 1);
5634 const LLT F32 = LLT::scalar(SizeInBits: 32);
5635 const LLT I32 = LLT::scalar(SizeInBits: 32);
5636
5637 if (allowApproxFunc(MF, Flags)) {
5638 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
5639 .addUse(RegNo: X)
5640 .setMIFlags(Flags);
5641 MI.eraseFromParent();
5642 return true;
5643 }
5644
5645 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5646 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5647 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5648 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5649 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5650
5651 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
5652 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5653 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
5654 .addUse(RegNo: SqrtX.getReg(Idx: 0))
5655 .setMIFlags(Flags);
5656
5657 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5658 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5659
5660 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5661 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5662
5663 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5664 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5665
5666 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5667 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5668
5669 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5670 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5671
5672 SqrtS =
5673 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5674
5675 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5676 SqrtS =
5677 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5678 } else {
5679 auto SqrtR =
5680 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5681 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5682
5683 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5684 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5685 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5686 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5687 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5688 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
5689 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5690 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5691 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
5692 }
5693
5694 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5695
5696 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5697
5698 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5699
5700 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5701 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5702
5703 MI.eraseFromParent();
5704 return true;
5705}
5706
5707bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5708 MachineRegisterInfo &MRI,
5709 MachineIRBuilder &B) const {
5710  // For the f64 type, the SQRT and RSQ instructions don't have the required
5711  // precision, so we apply Goldschmidt's algorithm to improve the result:
5712 //
5713 // y0 = rsq(x)
5714 // g0 = x * y0
5715 // h0 = 0.5 * y0
5716 //
5717 // r0 = 0.5 - h0 * g0
5718 // g1 = g0 * r0 + g0
5719 // h1 = h0 * r0 + h0
5720 //
5721 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5722 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5723 // h2 = h1 * r1 + h1
5724 //
5725 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5726 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5727 //
5728 // sqrt(x) = g3
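  //
  // In addition to the iteration above (not part of Goldschmidt's scheme
  // itself), the code below scales very small inputs: if x < 0x1.0p-767 the
  // input is multiplied by 2^+256 via ldexp and the final result is scaled
  // back down by 2^-128 so intermediate values never underflow.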
5729
5730 const LLT S1 = LLT::scalar(SizeInBits: 1);
5731 const LLT S32 = LLT::scalar(SizeInBits: 32);
5732 const LLT F64 = LLT::scalar(SizeInBits: 64);
5733
5734 Register Dst = MI.getOperand(i: 0).getReg();
5735 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5736
5737 Register X = MI.getOperand(i: 1).getReg();
5738 unsigned Flags = MI.getFlags();
5739
5740 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5741
5742 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5743 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5744
5745 // Scale up input if it is too small.
5746 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5747 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5748 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
5749
5750 auto SqrtY =
5751 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5752
5753 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5754 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5755 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5756
5757 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5758 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5759
5760 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5761 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5762
5763 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5764 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5765
5766 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5767
5768 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5769 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5770
5771 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5772
5773 // Scale down the result.
5774 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5775 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5776 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5777
5778 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5779 // with finite only or nsz because rsq(+/-0) = +/-inf
5780
5781 // TODO: Check for DAZ and expand to subnormals
5782 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5783
5784 // If x is +INF, +0, or -0, use its original value
5785 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5786
5787 MI.eraseFromParent();
5788 return true;
5789}
5790
5791bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5792 MachineRegisterInfo &MRI,
5793 MachineIRBuilder &B) const {
5794 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5795 if (Ty == LLT::scalar(SizeInBits: 32))
5796 return legalizeFSQRTF32(MI, MRI, B);
5797 if (Ty == LLT::scalar(SizeInBits: 64))
5798 return legalizeFSQRTF64(MI, MRI, B);
5799 if (Ty == LLT::scalar(SizeInBits: 16))
5800 return legalizeFSQRTF16(MI, MRI, B);
5801 return false;
5802}
5803
5804// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5805// FIXME: Why do we handle this one but not other removed instructions?
5806//
5807// Reciprocal square root. The clamp prevents infinite results, clamping
5808// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5809// +-max_float.
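//
// For f32 in IEEE mode, the emitted sequence is roughly (illustrative only):
//   %r = rsq(%x)
//   %t = fminnum_ieee(%r, +max_float)
//   %d = fmaxnum_ieee(%t, -max_float)
// with the plain fminnum/fmaxnum forms used when the IEEE mode bit is off.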
5810bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5811 MachineRegisterInfo &MRI,
5812 MachineIRBuilder &B) const {
5813 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5814 return true;
5815
5816 Register Dst = MI.getOperand(i: 0).getReg();
5817 Register Src = MI.getOperand(i: 2).getReg();
5818 auto Flags = MI.getFlags();
5819
5820 LLT Ty = MRI.getType(Reg: Dst);
5821
5822 const fltSemantics *FltSemantics;
5823 if (Ty == LLT::scalar(SizeInBits: 32))
5824 FltSemantics = &APFloat::IEEEsingle();
5825 else if (Ty == LLT::scalar(SizeInBits: 64))
5826 FltSemantics = &APFloat::IEEEdouble();
5827 else
5828 return false;
5829
5830 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
5831 .addUse(RegNo: Src)
5832 .setMIFlags(Flags);
5833
5834  // We don't need to concern ourselves with the snan handling difference; the rsq
5835  // has already quieted it (or not), so use the min/max form that directly selects.
5836 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5837 const bool UseIEEE = MFI->getMode().IEEE;
5838
5839 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5840 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5841 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5842
5843 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5844
5845 if (UseIEEE)
5846 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5847 else
5848 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5849 MI.eraseFromParent();
5850 return true;
5851}
5852
5853// TODO: Fix pointer type handling
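// As an informal example of what this helper produces: a 64-bit readlane is
// split into two 32-bit pieces that are unmerged, processed per piece, and
// merged back, while a sub-32-bit source is any-extended to 32 bits and the
// result truncated:
//
//   %lo, %hi = G_UNMERGE_VALUES %src(s64)
//   %rlo     = int_amdgcn_readlane %lo, %lane
//   %rhi     = int_amdgcn_readlane %hi, %lane
//   %dst     = G_MERGE_VALUES %rlo, %rhi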
5854bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5855 MachineInstr &MI,
5856 Intrinsic::ID IID) const {
5857
5858 MachineIRBuilder &B = Helper.MIRBuilder;
5859 MachineRegisterInfo &MRI = *B.getMRI();
5860
5861 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5862 IID == Intrinsic::amdgcn_permlanex16;
5863 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5864 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5865
5866 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5867 Register Src2, LLT VT) -> Register {
5868 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
5869 switch (IID) {
5870 case Intrinsic::amdgcn_readfirstlane:
5871 case Intrinsic::amdgcn_permlane64:
5872 return LaneOp.getReg(Idx: 0);
5873 case Intrinsic::amdgcn_readlane:
5874 case Intrinsic::amdgcn_set_inactive:
5875 case Intrinsic::amdgcn_set_inactive_chain_arg:
5876 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
5877 case Intrinsic::amdgcn_writelane:
5878 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
5879 case Intrinsic::amdgcn_permlane16:
5880 case Intrinsic::amdgcn_permlanex16: {
5881 Register Src3 = MI.getOperand(i: 5).getReg();
5882 int64_t Src4 = MI.getOperand(i: 6).getImm();
5883 int64_t Src5 = MI.getOperand(i: 7).getImm();
5884 return LaneOp.addUse(RegNo: Src1)
5885 .addUse(RegNo: Src2)
5886 .addUse(RegNo: Src3)
5887 .addImm(Val: Src4)
5888 .addImm(Val: Src5)
5889 .getReg(Idx: 0);
5890 }
5891 case Intrinsic::amdgcn_mov_dpp8:
5892 return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
5893 case Intrinsic::amdgcn_update_dpp:
5894 return LaneOp.addUse(RegNo: Src1)
5895 .addImm(Val: MI.getOperand(i: 4).getImm())
5896 .addImm(Val: MI.getOperand(i: 5).getImm())
5897 .addImm(Val: MI.getOperand(i: 6).getImm())
5898 .addImm(Val: MI.getOperand(i: 7).getImm())
5899 .getReg(Idx: 0);
5900 default:
5901 llvm_unreachable("unhandled lane op");
5902 }
5903 };
5904
5905 Register DstReg = MI.getOperand(i: 0).getReg();
5906 Register Src0 = MI.getOperand(i: 2).getReg();
5907 Register Src1, Src2;
5908 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5909 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5910 Src1 = MI.getOperand(i: 3).getReg();
5911 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5912 Src2 = MI.getOperand(i: 4).getReg();
5913 }
5914 }
5915
5916 LLT Ty = MRI.getType(Reg: DstReg);
5917 unsigned Size = Ty.getSizeInBits();
5918
5919 unsigned SplitSize = 32;
5920 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5921 ST.hasDPALU_DPP() &&
5922 AMDGPU::isLegalDPALU_DPPControl(ST, DC: MI.getOperand(i: 4).getImm()))
5923 SplitSize = 64;
5924
5925 if (Size == SplitSize) {
5926 // Already legal
5927 return true;
5928 }
5929
5930 if (Size < 32) {
5931     Src0 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src0).getReg(Idx: 0);
5932
5933 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5934 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
5935
5936 if (IID == Intrinsic::amdgcn_writelane)
5937 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
5938
5939     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, LLT::scalar(SizeInBits: 32));
5940 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
5941 MI.eraseFromParent();
5942 return true;
5943 }
5944
5945 if (Size % SplitSize != 0)
5946 return false;
5947
5948 LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
5949 bool NeedsBitcast = false;
5950 if (Ty.isVector()) {
5951 LLT EltTy = Ty.getElementType();
5952 unsigned EltSize = EltTy.getSizeInBits();
5953 if (EltSize == SplitSize) {
5954 PartialResTy = EltTy;
5955 } else if (EltSize == 16 || EltSize == 32) {
5956 unsigned NElem = SplitSize / EltSize;
5957 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
5958 } else {
5959 // Handle all other cases via S32/S64 pieces
5960 NeedsBitcast = true;
5961 }
5962 }
5963
5964 SmallVector<Register, 4> PartialRes;
5965 unsigned NumParts = Size / SplitSize;
5966 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
5967 MachineInstrBuilder Src1Parts, Src2Parts;
5968
5969 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5970 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
5971
5972 if (IID == Intrinsic::amdgcn_writelane)
5973 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
5974
5975 for (unsigned i = 0; i < NumParts; ++i) {
5976 Src0 = Src0Parts.getReg(Idx: i);
5977
5978 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5979 Src1 = Src1Parts.getReg(Idx: i);
5980
5981 if (IID == Intrinsic::amdgcn_writelane)
5982 Src2 = Src2Parts.getReg(Idx: i);
5983
5984 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
5985 }
5986
5987 if (NeedsBitcast)
5988 B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
5989 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
5990 else
5991 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
5992
5993 MI.eraseFromParent();
5994 return true;
5995}
5996
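// The implicit argument pointer is just the kernarg segment pointer offset by
// a constant; a sketch of the lowering below (reader aid only):
//
//   %kernarg = <KERNARG_SEGMENT_PTR input>
//   %dst     = ptr_add %kernarg, <implicit parameter offset>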
5997bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5998 MachineRegisterInfo &MRI,
5999 MachineIRBuilder &B) const {
6000 uint64_t Offset =
6001 ST.getTargetLowering()->getImplicitParameterOffset(
6002 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
6003 LLT DstTy = MRI.getType(Reg: DstReg);
6004 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
6005
6006 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
6007 if (!loadInputValue(DstReg: KernargPtrReg, B,
6008 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6009 return false;
6010
6011 B.buildObjectPtrOffset(Res: DstReg, Op0: KernargPtrReg,
6012 Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
6013 return true;
6014}
6015
6016/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6017/// bits of the pointer and replace them with the stride argument, then
6018/// merge_values everything together. In the common case of a raw buffer (the
6019/// stride component is 0), we can just AND off the upper half.
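///
/// For example, with the legacy (non-45-bit num_records) layout the descriptor
/// built below is, dword by dword (illustrative; the builder calls are
/// authoritative):
///   word0 = pointer[31:0]
///   word1 = (pointer[63:32] & 0xffff) | (stride << 16)
///   word2 = num_records (truncated to 32 bits)
///   word3 = flags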
6020bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
6021 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6022 Register Result = MI.getOperand(i: 0).getReg();
6023 Register Pointer = MI.getOperand(i: 2).getReg();
6024 Register Stride = MI.getOperand(i: 3).getReg();
6025 Register NumRecords = MI.getOperand(i: 4).getReg();
6026 Register Flags = MI.getOperand(i: 5).getReg();
6027
6028 LLT S32 = LLT::scalar(SizeInBits: 32);
6029 LLT S64 = LLT::scalar(SizeInBits: 64);
6030
6031 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6032
6033 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
6034
6035 if (ST.has45BitNumRecordsBufferResource()) {
6036 Register Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6037     // Build the lower 64-bit value, which holds the 57-bit base address and the
6038     // low 7 bits of num_records.
6039 LLT PtrIntTy = LLT::scalar(SizeInBits: MRI.getType(Reg: Pointer).getSizeInBits());
6040 auto PointerInt = B.buildPtrToInt(Dst: PtrIntTy, Src: Pointer);
6041 auto ExtPointer = B.buildAnyExtOrTrunc(Res: S64, Op: PointerInt);
6042 auto NumRecordsLHS = B.buildShl(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 57));
6043 Register LowHalf = B.buildOr(Dst: S64, Src0: ExtPointer, Src1: NumRecordsLHS).getReg(Idx: 0);
6044
6045     // Build the upper 64-bit value, which holds the upper 38 bits of num_records,
6046     // 6 zero bits (omitted), the 16-bit stride and scale fields, and the 4-bit flags.
6047 auto NumRecordsRHS = B.buildLShr(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 7));
6048 auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: B.buildConstant(Res: S32, Val: 12));
6049 auto ExtShiftedStride =
6050 B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedStride.getReg(Idx: 0)});
6051 auto ShiftedFlags = B.buildShl(Dst: S32, Src0: Flags, Src1: B.buildConstant(Res: S32, Val: 28));
6052 auto ExtShiftedFlags =
6053 B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedFlags.getReg(Idx: 0)});
6054 auto CombinedFields = B.buildOr(Dst: S64, Src0: NumRecordsRHS, Src1: ExtShiftedStride);
6055 Register HighHalf =
6056 B.buildOr(Dst: S64, Src0: CombinedFields, Src1: ExtShiftedFlags).getReg(Idx: 0);
6057 B.buildMergeValues(Res: Result, Ops: {LowHalf, HighHalf});
6058 } else {
6059 NumRecords = B.buildTrunc(Res: S32, Op: NumRecords).getReg(Idx: 0);
6060 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
6061 auto LowHalf = Unmerge.getReg(Idx: 0);
6062 auto HighHalf = Unmerge.getReg(Idx: 1);
6063
6064 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
6065 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
6066 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
6067 auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
6068 auto NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
6069 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
6070 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
6071 }
6072
6073 MI.eraseFromParent();
6074 return true;
6075}
6076
6077bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6078 MachineRegisterInfo &MRI,
6079 MachineIRBuilder &B) const {
6080 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6081 if (!MFI->isEntryFunction()) {
6082 return legalizePreloadedArgIntrin(MI, MRI, B,
6083 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6084 }
6085
6086 Register DstReg = MI.getOperand(i: 0).getReg();
6087 if (!getImplicitArgPtr(DstReg, MRI, B))
6088 return false;
6089
6090 MI.eraseFromParent();
6091 return true;
6092}
6093
6094bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6095 MachineRegisterInfo &MRI,
6096 MachineIRBuilder &B) const {
6097 Function &F = B.getMF().getFunction();
6098 std::optional<uint32_t> KnownSize =
6099 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6100 if (KnownSize.has_value())
6101 B.buildConstant(Res: DstReg, Val: *KnownSize);
6102 return false;
6103}
6104
6105bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6106 MachineRegisterInfo &MRI,
6107 MachineIRBuilder &B) const {
6108
6109 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6110 if (!MFI->isEntryFunction()) {
6111 return legalizePreloadedArgIntrin(MI, MRI, B,
6112 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6113 }
6114
6115 Register DstReg = MI.getOperand(i: 0).getReg();
6116 if (!getLDSKernelId(DstReg, MRI, B))
6117 return false;
6118
6119 MI.eraseFromParent();
6120 return true;
6121}
6122
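// The address-space checks below only inspect the high 32 bits of the flat
// pointer. A sketch of the common case (reader aid only):
//
//   %hi  = <high dword of the flat pointer>
//   %dst = icmp eq %hi, <segment aperture base>
//
// For the private address space on subtargets with globally addressable
// scratch, bits 63..58 are instead compared against the flat scratch base via
// an xor followed by icmp ult 2^26.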
6123bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6124 MachineRegisterInfo &MRI,
6125 MachineIRBuilder &B,
6126 unsigned AddrSpace) const {
6127 const LLT S32 = LLT::scalar(SizeInBits: 32);
6128 auto Unmerge = B.buildUnmerge(Res: S32, Op: MI.getOperand(i: 2).getReg());
6129 Register Hi32 = Unmerge.getReg(Idx: 1);
6130
6131 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6132 ST.hasGloballyAddressableScratch()) {
6133 Register FlatScratchBaseHi =
6134 B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
6135 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6136 .getReg(Idx: 0);
6137 MRI.setRegClass(Reg: FlatScratchBaseHi, RC: &AMDGPU::SReg_32RegClass);
6138 // Test bits 63..58 against the aperture address.
6139 Register XOR = B.buildXor(Dst: S32, Src0: Hi32, Src1: FlatScratchBaseHi).getReg(Idx: 0);
6140 B.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: MI.getOperand(i: 0), Op0: XOR,
6141 Op1: B.buildConstant(Res: S32, Val: 1u << 26));
6142 } else {
6143 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
6144 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
6145 }
6146 MI.eraseFromParent();
6147 return true;
6148}
6149
6150// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6151// offset (the offset that is included in bounds checking and swizzling, to be
6152// split between the instruction's voffset and immoffset fields) and soffset
6153// (the offset that is excluded from bounds checking and swizzling, to go in
6154// the instruction's soffset field). This function takes the first kind of
6155// offset and figures out how to split it between voffset and immoffset.
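//
// For example, assuming a 4095-byte maximum immediate (the exact limit is
// subtarget dependent), an incoming offset of 4100 is split into an immoffset
// of 4 plus a voffset add of 4096, while an offset of 4095 fits entirely in
// the immediate and needs no voffset add at all.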
6156std::pair<Register, unsigned>
6157AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6158 Register OrigOffset) const {
6159 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6160 Register BaseReg;
6161 unsigned ImmOffset;
6162 const LLT S32 = LLT::scalar(SizeInBits: 32);
6163 MachineRegisterInfo &MRI = *B.getMRI();
6164
6165 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6166 // being added, so we can only safely match a 32-bit addition with no unsigned
6167 // overflow.
6168 bool CheckNUW = ST.hasGFX1250Insts();
6169 std::tie(args&: BaseReg, args&: ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6170 MRI, Reg: OrigOffset, /*KnownBits=*/ValueTracking: nullptr, CheckNUW);
6171
6172 // If BaseReg is a pointer, convert it to int.
6173 if (MRI.getType(Reg: BaseReg).isPointer())
6174 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
6175
6176 // If the immediate value is too big for the immoffset field, put only bits
6177 // that would normally fit in the immoffset field. The remaining value that
6178 // is copied/added for the voffset field is a large power of 2, and it
6179 // stands more chance of being CSEd with the copy/add for another similar
6180 // load/store.
6181 // However, do not do that rounding down if that is a negative
6182 // number, as it appears to be illegal to have a negative offset in the
6183 // vgpr, even if adding the immediate offset makes it positive.
6184 unsigned Overflow = ImmOffset & ~MaxImm;
6185 ImmOffset -= Overflow;
6186 if ((int32_t)Overflow < 0) {
6187 Overflow += ImmOffset;
6188 ImmOffset = 0;
6189 }
6190
6191 if (Overflow != 0) {
6192 if (!BaseReg) {
6193 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
6194 } else {
6195 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
6196 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
6197 }
6198 }
6199
6200 if (!BaseReg)
6201 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6202
6203 return std::pair(BaseReg, ImmOffset);
6204}
6205
6206/// Handle register layout difference for f16 images for some subtargets.
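/// For example, on subtargets with unpacked d16 memory instructions a <4 x s16>
/// value becomes a <4 x s32> with each element any-extended into its own dword,
/// while on subtargets with the image-store d16 bug a <2 x s16> store is
/// bitcast to s32 and padded out with an undef dword (an illustrative summary
/// of the cases handled below).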
6207Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6208 MachineRegisterInfo &MRI,
6209 Register Reg,
6210 bool ImageStore) const {
6211 const LLT S16 = LLT::scalar(SizeInBits: 16);
6212 const LLT S32 = LLT::scalar(SizeInBits: 32);
6213 LLT StoreVT = MRI.getType(Reg);
6214 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6215
6216 if (ST.hasUnpackedD16VMem()) {
6217 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
6218
6219 SmallVector<Register, 4> WideRegs;
6220 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6221 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6222
6223 int NumElts = StoreVT.getNumElements();
6224
6225 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
6226 .getReg(Idx: 0);
6227 }
6228
6229 if (ImageStore && ST.hasImageStoreD16Bug()) {
6230 if (StoreVT.getNumElements() == 2) {
6231 SmallVector<Register, 4> PackedRegs;
6232 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
6233 PackedRegs.push_back(Elt: Reg);
6234 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
6235 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
6236 .getReg(Idx: 0);
6237 }
6238
6239 if (StoreVT.getNumElements() == 3) {
6240 SmallVector<Register, 4> PackedRegs;
6241 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
6242 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6243 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
6244 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
6245 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
6246 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
6247 }
6248
6249 if (StoreVT.getNumElements() == 4) {
6250 SmallVector<Register, 4> PackedRegs;
6251 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
6252 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
6253 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6254 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
6255 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
6256 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
6257 .getReg(Idx: 0);
6258 }
6259
6260 llvm_unreachable("invalid data type");
6261 }
6262
6263 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
6264 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
6265 .getReg(Idx: 0);
6266 }
6267 return Reg;
6268}
6269
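// A short summary of the store-source fixups performed below (the code is
// authoritative): p8 buffer-resource values are cast to v4i32, types covered
// by the load/store bitcast workaround are bitcast, s8/s16 scalars are
// any-extended to s32, and small s16 vectors of format stores are repacked
// through handleD16VData. For instance, on a packed-d16 subtarget:
//
//   store s16 %v            ->  store (s32 any-extend of %v)
//   format store <3 x s16>  ->  store <4 x s16> (undef-padded)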
6270Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6271 Register VData, LLT MemTy,
6272 bool IsFormat) const {
6273 MachineRegisterInfo *MRI = B.getMRI();
6274 LLT Ty = MRI->getType(Reg: VData);
6275
6276 const LLT S16 = LLT::scalar(SizeInBits: 16);
6277
6278  // Fixup stores of buffer resources themselves, which need to be cast to v4i32.
6279 if (hasBufferRsrcWorkaround(Ty))
6280 return castBufferRsrcToV4I32(Pointer: VData, B);
6281
6282 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6283 Ty = getBitcastRegisterType(Ty);
6284 VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
6285 }
6286 // Fixup illegal register types for i8 stores.
6287 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
6288 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
6289 return AnyExt;
6290 }
6291
6292 if (Ty.isVector()) {
6293 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6294 if (IsFormat)
6295 return handleD16VData(B, MRI&: *MRI, Reg: VData);
6296 }
6297 }
6298
6299 return VData;
6300}
6301
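// The generic buffer-store pseudo built below takes its operands in the order
//   vdata, rsrc, vindex, voffset, soffset, imm offset[, format], aux, idxen
// and the opcode is chosen from the memory size for plain stores, e.g. a
// 1-byte raw store maps to G_AMDGPU_BUFFER_STORE_BYTE and a 2-byte store to
// G_AMDGPU_BUFFER_STORE_SHORT (see the switch below).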
6302bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6303 LegalizerHelper &Helper,
6304 bool IsTyped,
6305 bool IsFormat) const {
6306 MachineIRBuilder &B = Helper.MIRBuilder;
6307 MachineRegisterInfo &MRI = *B.getMRI();
6308
6309 Register VData = MI.getOperand(i: 1).getReg();
6310 LLT Ty = MRI.getType(Reg: VData);
6311 LLT EltTy = Ty.getScalarType();
6312 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6313 const LLT S32 = LLT::scalar(SizeInBits: 32);
6314
6315 MachineMemOperand *MMO = *MI.memoperands_begin();
6316 const int MemSize = MMO->getSize().getValue();
6317 LLT MemTy = MMO->getMemoryType();
6318
6319 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6320
6321 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
6322 Register RSrc = MI.getOperand(i: 2).getReg();
6323
6324 unsigned ImmOffset;
6325
6326 // The typed intrinsics add an immediate after the registers.
6327 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6328
6329 // The struct intrinsic variants add one additional operand over raw.
6330 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6331 Register VIndex;
6332 int OpOffset = 0;
6333 if (HasVIndex) {
6334 VIndex = MI.getOperand(i: 3).getReg();
6335 OpOffset = 1;
6336 } else {
6337 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6338 }
6339
6340 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
6341 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6342
6343 unsigned Format = 0;
6344 if (IsTyped) {
6345 Format = MI.getOperand(i: 5 + OpOffset).getImm();
6346 ++OpOffset;
6347 }
6348
6349 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
6350
6351 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6352
6353 unsigned Opc;
6354 if (IsTyped) {
6355 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6356 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6357 } else if (IsFormat) {
6358 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6359 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6360 } else {
6361 switch (MemSize) {
6362 case 1:
6363 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6364 break;
6365 case 2:
6366 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6367 break;
6368 default:
6369 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6370 break;
6371 }
6372 }
6373
6374 auto MIB = B.buildInstr(Opcode: Opc)
6375 .addUse(RegNo: VData) // vdata
6376 .addUse(RegNo: RSrc) // rsrc
6377 .addUse(RegNo: VIndex) // vindex
6378 .addUse(RegNo: VOffset) // voffset
6379 .addUse(RegNo: SOffset) // soffset
6380 .addImm(Val: ImmOffset); // offset(imm)
6381
6382 if (IsTyped)
6383 MIB.addImm(Val: Format);
6384
6385 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6386 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6387 .addMemOperand(MMO);
6388
6389 MI.eraseFromParent();
6390 return true;
6391}
6392
6393static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6394 Register VIndex, Register VOffset, Register SOffset,
6395 unsigned ImmOffset, unsigned Format,
6396 unsigned AuxiliaryData, MachineMemOperand *MMO,
6397 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6398 auto MIB = B.buildInstr(Opcode: Opc)
6399 .addDef(RegNo: LoadDstReg) // vdata
6400 .addUse(RegNo: RSrc) // rsrc
6401 .addUse(RegNo: VIndex) // vindex
6402 .addUse(RegNo: VOffset) // voffset
6403 .addUse(RegNo: SOffset) // soffset
6404 .addImm(Val: ImmOffset); // offset(imm)
6405
6406 if (IsTyped)
6407 MIB.addImm(Val: Format);
6408
6409 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6410 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6411 .addMemOperand(MMO);
6412}
6413
6414bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6415 LegalizerHelper &Helper,
6416 bool IsFormat,
6417 bool IsTyped) const {
6418 MachineIRBuilder &B = Helper.MIRBuilder;
6419 MachineRegisterInfo &MRI = *B.getMRI();
6420 GISelChangeObserver &Observer = Helper.Observer;
6421
6422 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6423 MachineMemOperand *MMO = *MI.memoperands_begin();
6424 const LLT MemTy = MMO->getMemoryType();
6425 const LLT S32 = LLT::scalar(SizeInBits: 32);
6426
6427 Register Dst = MI.getOperand(i: 0).getReg();
6428
6429 Register StatusDst;
6430 int OpOffset = 0;
6431 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6432 bool IsTFE = MI.getNumExplicitDefs() == 2;
6433 if (IsTFE) {
6434 StatusDst = MI.getOperand(i: 1).getReg();
6435 ++OpOffset;
6436 }
6437
6438 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
6439 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
6440
6441 // The typed intrinsics add an immediate after the registers.
6442 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6443
6444 // The struct intrinsic variants add one additional operand over raw.
6445 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6446 Register VIndex;
6447 if (HasVIndex) {
6448 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
6449 ++OpOffset;
6450 } else {
6451 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6452 }
6453
6454 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
6455 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6456
6457 unsigned Format = 0;
6458 if (IsTyped) {
6459 Format = MI.getOperand(i: 5 + OpOffset).getImm();
6460 ++OpOffset;
6461 }
6462
6463 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
6464 unsigned ImmOffset;
6465
6466 LLT Ty = MRI.getType(Reg: Dst);
6467 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6468 // logic doesn't have to handle that case.
6469 if (hasBufferRsrcWorkaround(Ty)) {
6470 Observer.changingInstr(MI);
6471 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
6472 Observer.changedInstr(MI);
6473 Dst = MI.getOperand(i: 0).getReg();
6474 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6475 }
6476 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6477 Ty = getBitcastRegisterType(Ty);
6478 Observer.changingInstr(MI);
6479 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6480 Observer.changedInstr(MI);
6481 Dst = MI.getOperand(i: 0).getReg();
6482 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6483 }
6484
6485 LLT EltTy = Ty.getScalarType();
6486 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6487 const bool Unpacked = ST.hasUnpackedD16VMem();
6488
6489 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6490
6491 unsigned Opc;
6492
6493 // TODO: Support TFE for typed and narrow loads.
6494 if (IsTyped) {
6495 if (IsTFE)
6496 return false;
6497 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6498 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6499 } else if (IsFormat) {
6500 if (IsD16) {
6501 if (IsTFE)
6502 return false;
6503 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6504 } else {
6505 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6506 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6507 }
6508 } else {
6509 switch (MemTy.getSizeInBits()) {
6510 case 8:
6511 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6512 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6513 break;
6514 case 16:
6515 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6516 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6517 break;
6518 default:
6519 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6520 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6521 break;
6522 }
6523 }
6524
6525 if (IsTFE) {
6526 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
6527 unsigned NumLoadDWords = NumValueDWords + 1;
6528 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
6529 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
6530 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6531 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6532 if (MemTy.getSizeInBits() < 32) {
6533 Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6534 B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
6535 B.buildTrunc(Res: Dst, Op: ExtDst);
6536 } else if (NumValueDWords == 1) {
6537 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
6538 } else {
6539 SmallVector<Register, 5> LoadElts;
6540 for (unsigned I = 0; I != NumValueDWords; ++I)
6541 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
6542 LoadElts.push_back(Elt: StatusDst);
6543 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
6544 LoadElts.truncate(N: NumValueDWords);
6545 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
6546 }
6547 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6548 (IsD16 && !Ty.isVector())) {
6549 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6550 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6551 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6552 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6553 B.buildTrunc(Res: Dst, Op: LoadDstReg);
6554 } else if (Unpacked && IsD16 && Ty.isVector()) {
6555 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
6556 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
6557 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6558 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6559 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6560 // FIXME: G_TRUNC should work, but legalization currently fails
6561 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
6562 SmallVector<Register, 4> Repack;
6563 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6564 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6565 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
6566 } else {
6567 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6568 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6569 }
6570
6571 MI.eraseFromParent();
6572 return true;
6573}
6574
6575static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6576 switch (IntrID) {
6577 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6579 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6580 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6581 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6582 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6583 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6584 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6585 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6586 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6587 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6588 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6589 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6590 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6591 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6592 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6593 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6594 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6595 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6596 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6597 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6598 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6599 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6600 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6601 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6602 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6603 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6604 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6605 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6606 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6607 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6608 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6609 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6610 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6611 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6612 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6613 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6614 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6615 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6616 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6617 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6618 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6619 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6620 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6621 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6622 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6623 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6624 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6625 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6626 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6627 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6629 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6630 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6631 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6632 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6633 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6634 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6635 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6636 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6637 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6638 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6639 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6640 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6641 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6642 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6644 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6645 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6646 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6647 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6648 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6649 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6650 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6651 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6652 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6653 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6654 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6655 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6656 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6657 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6658 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6659 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6660 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6661 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6662 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6663 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6664 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6665 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6666 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6667 default:
6668 llvm_unreachable("unhandled atomic opcode");
6669 }
6670}
6671
6672bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6673 MachineIRBuilder &B,
6674 Intrinsic::ID IID) const {
6675 const bool IsCmpSwap =
6676 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6677 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6678 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6679 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6680
6681 Register Dst = MI.getOperand(i: 0).getReg();
6682 // Since we don't have 128-bit atomics, we don't need to handle the case of
6683   // p8 arguments to the atomic itself.
6684 Register VData = MI.getOperand(i: 2).getReg();
6685
6686 Register CmpVal;
6687 int OpOffset = 0;
6688
6689 if (IsCmpSwap) {
6690 CmpVal = MI.getOperand(i: 3).getReg();
6691 ++OpOffset;
6692 }
6693
6694 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6695 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6696 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6697
6698 // The struct intrinsic variants add one additional operand over raw.
6699 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6700 Register VIndex;
6701 if (HasVIndex) {
6702 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6703 ++OpOffset;
6704 } else {
6705 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6706 }
6707
6708 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6709 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6710 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6711
6712 MachineMemOperand *MMO = *MI.memoperands_begin();
6713
6714 unsigned ImmOffset;
6715 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6716
6717 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6718 .addDef(RegNo: Dst)
6719 .addUse(RegNo: VData); // vdata
6720
6721 if (IsCmpSwap)
6722 MIB.addReg(RegNo: CmpVal);
6723
6724 MIB.addUse(RegNo: RSrc) // rsrc
6725 .addUse(RegNo: VIndex) // vindex
6726 .addUse(RegNo: VOffset) // voffset
6727 .addUse(RegNo: SOffset) // soffset
6728 .addImm(Val: ImmOffset) // offset(imm)
6729 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6730 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6731 .addMemOperand(MMO);
6732
6733 MI.eraseFromParent();
6734 return true;
6735}
6736
6737/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6738/// vector with s16 typed elements.
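///
/// For example, with A16 enabled a 2D coordinate pair (s, t), both s16, is
/// packed into a single <2 x s16> dword, while a trailing odd coordinate (or
/// the last odd gradient component) is paired with an undef half instead (a
/// sketch of the packing performed below).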
6739static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6740 SmallVectorImpl<Register> &PackedAddrs,
6741 unsigned ArgOffset,
6742 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6743 bool IsA16, bool IsG16) {
6744 const LLT S16 = LLT::scalar(SizeInBits: 16);
6745 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6746 auto EndIdx = Intr->VAddrEnd;
6747
6748 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6749 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6750 if (!SrcOp.isReg())
6751 continue; // _L to _LZ may have eliminated this.
6752
6753 Register AddrReg = SrcOp.getReg();
6754
6755 if ((I < Intr->GradientStart) ||
6756 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6757 (I >= Intr->CoordStart && !IsA16)) {
6758 if ((I < Intr->GradientStart) && IsA16 &&
6759 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6760 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6761 // Special handling of bias when A16 is on. Bias is of type half but
6762 // occupies full 32-bit.
6763 PackedAddrs.push_back(
6764 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6765 .getReg(Idx: 0));
6766 } else {
6767 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6768 "Bias needs to be converted to 16 bit in A16 mode");
6769 // Handle any gradient or coordinate operands that should not be packed
6770 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6771 PackedAddrs.push_back(Elt: AddrReg);
6772 }
6773 } else {
6774 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6775 // derivatives dx/dh and dx/dv are packed with undef.
6776 if (((I + 1) >= EndIdx) ||
6777 ((Intr->NumGradients / 2) % 2 == 1 &&
6778 (I == static_cast<unsigned>(Intr->GradientStart +
6779 (Intr->NumGradients / 2) - 1) ||
6780 I == static_cast<unsigned>(Intr->GradientStart +
6781 Intr->NumGradients - 1))) ||
6782 // Check for _L to _LZ optimization
6783 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6784 PackedAddrs.push_back(
6785 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6786 .getReg(Idx: 0));
6787 } else {
6788 PackedAddrs.push_back(
6789 Elt: B.buildBuildVector(
6790 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6791 .getReg(Idx: 0));
6792 ++I;
6793 }
6794 }
6795 }
6796}
6797
6798/// Convert from separate vaddr components to a single vector address register,
6799/// and replace the remaining operands with $noreg.
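///
/// For example, three scalar s32 address operands become one <3 x s32>
/// G_BUILD_VECTOR placed in the first vaddr slot, and the remaining two
/// operand slots are reset to $noreg (illustrative of the rewrite below).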
6800static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6801 int DimIdx, int NumVAddrs) {
6802 const LLT S32 = LLT::scalar(SizeInBits: 32);
6803 (void)S32;
6804 SmallVector<Register, 8> AddrRegs;
6805 for (int I = 0; I != NumVAddrs; ++I) {
6806 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6807 if (SrcOp.isReg()) {
6808 AddrRegs.push_back(Elt: SrcOp.getReg());
6809 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6810 }
6811 }
6812
6813 int NumAddrRegs = AddrRegs.size();
6814 if (NumAddrRegs != 1) {
6815 auto VAddr =
6816 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6817 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6818 }
6819
6820 for (int I = 1; I != NumVAddrs; ++I) {
6821 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6822 if (SrcOp.isReg())
6823 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
6824 }
6825}
6826
6827/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6828///
6829/// Depending on the subtarget, load/store with 16-bit element data need to be
6830/// rewritten to use the low half of 32-bit registers, or directly use a packed
6831/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6832/// registers.
6833///
6834/// We don't want to directly select image instructions just yet, but also want
6835 /// to expose all register repacking to the legalizer/combiners. We also don't
6836/// want a selected instruction entering RegBankSelect. In order to avoid
6837/// defining a multitude of intermediate image instructions, directly hack on
6838/// the intrinsic's arguments. In cases like a16 addresses, this requires
6839/// padding now unnecessary arguments with $noreg.
6840bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6841 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6842 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6843
6844 const MachineFunction &MF = *MI.getMF();
6845 const unsigned NumDefs = MI.getNumExplicitDefs();
6846 const unsigned ArgOffset = NumDefs + 1;
6847 bool IsTFE = NumDefs == 2;
6848 // We are only processing the operands of d16 image operations on subtargets
6849 // that use the unpacked register layout, or need to repack the TFE result.
6850
6851 // TODO: Do we need to guard against already legalized intrinsics?
6852 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6853 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6854
6855 MachineRegisterInfo *MRI = B.getMRI();
6856 const LLT S32 = LLT::scalar(SizeInBits: 32);
6857 const LLT S16 = LLT::scalar(SizeInBits: 16);
6858 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6859
6860 unsigned DMask = 0;
6861 Register VData;
6862 LLT Ty;
6863
6864 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6865 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6866 Ty = MRI->getType(Reg: VData);
6867 }
6868
6869 const bool IsAtomicPacked16Bit =
6870 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6871 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6872
6873 // Check for 16 bit addresses and pack if true.
6874 LLT GradTy =
6875 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6876 LLT AddrTy =
6877 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6878 const bool IsG16 =
6879 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6880 const bool IsA16 = AddrTy == S16;
6881 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6882
6883 int DMaskLanes = 0;
6884 if (!BaseOpcode->Atomic) {
6885 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6886 if (BaseOpcode->Gather4) {
6887 DMaskLanes = 4;
6888 } else if (DMask != 0) {
6889 DMaskLanes = llvm::popcount(Value: DMask);
6890 } else if (!IsTFE && !BaseOpcode->Store) {
6891 // If dmask is 0, this is a no-op load. This can be eliminated.
6892 B.buildUndef(Res: MI.getOperand(i: 0));
6893 MI.eraseFromParent();
6894 return true;
6895 }
6896 }
6897
6898 Observer.changingInstr(MI);
6899 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
6900
6901 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6902 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6903 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6904 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6905 unsigned NewOpcode = LoadOpcode;
6906 if (BaseOpcode->Store)
6907 NewOpcode = StoreOpcode;
6908 else if (BaseOpcode->NoReturn)
6909 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6910
6911 // Track that we legalized this
6912 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6913
6914   // We expect to get an error flag since TFC is on and dmask is 0. Force dmask
6915   // to be at least 1, otherwise the instruction will fail.
6916 if (IsTFE && DMask == 0) {
6917 DMask = 0x1;
6918 DMaskLanes = 1;
6919 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6920 }
6921
6922 if (BaseOpcode->Atomic) {
6923 Register VData0 = MI.getOperand(i: 2).getReg();
6924 LLT Ty = MRI->getType(Reg: VData0);
6925
6926 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6927 if (Ty.isVector() && !IsAtomicPacked16Bit)
6928 return false;
6929
6930 if (BaseOpcode->AtomicX2) {
6931 Register VData1 = MI.getOperand(i: 3).getReg();
6932 // The two values are packed in one register.
6933 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6934 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6935 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6936 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
6937 }
6938 }
6939
6940 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6941
6942 // Rewrite the addressing register layout before doing anything else.
6943 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6944 // 16 bit gradients are supported, but are tied to the A16 control
6945 // so both gradients and addresses must be 16 bit
6946 return false;
6947 }
6948
6949 if (IsA16 && !ST.hasA16()) {
6950 // A16 not supported
6951 return false;
6952 }
6953
6954 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6955 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6956
6957 if (IsA16 || IsG16) {
6958 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6959 // instructions expect VGPR_32
6960 SmallVector<Register, 4> PackedRegs;
6961
6962 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6963
6964 // See also below in the non-a16 branch
6965 const bool UseNSA = ST.hasNSAEncoding() &&
6966 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6967 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6968 const bool UsePartialNSA =
6969 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6970
6971 if (UsePartialNSA) {
6972 // Pack registers that would go over NSAMaxSize into last VAddr register
6973 LLT PackedAddrTy =
6974 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6975 auto Concat = B.buildConcatVectors(
6976 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6977 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6978 PackedRegs.resize(N: NSAMaxSize);
6979 } else if (!UseNSA && PackedRegs.size() > 1) {
6980 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6981 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6982 PackedRegs[0] = Concat.getReg(Idx: 0);
6983 PackedRegs.resize(N: 1);
6984 }
6985
6986 const unsigned NumPacked = PackedRegs.size();
6987 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6988 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6989 if (!SrcOp.isReg()) {
6990 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6991 continue;
6992 }
6993
6994 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6995
6996 if (I - Intr->VAddrStart < NumPacked)
6997 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6998 else
6999 SrcOp.setReg(AMDGPU::NoRegister);
7000 }
7001 } else {
7002 // If the register allocator cannot place the address registers contiguously
7003 // without introducing moves, then using the non-sequential address encoding
7004 // is always preferable, since it saves VALU instructions and is usually a
7005 // wash in terms of code size or even better.
7006 //
7007 // However, we currently have no way of hinting to the register allocator
7008 // that MIMG addresses should be placed contiguously when it is possible to
7009 // do so, so force non-NSA for the common 2-address case as a heuristic.
7010 //
7011 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7012 // allocation when possible.
7013 //
7014 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7015 // set of the remaining addresses.
7016 const bool UseNSA = ST.hasNSAEncoding() &&
7017 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7018 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7019 const bool UsePartialNSA =
7020 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7021
7022 if (UsePartialNSA) {
7023 convertImageAddrToPacked(B, MI,
7024 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7025 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
7026 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7027 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
7028 NumVAddrs: Intr->NumVAddrs);
7029 }
7030 }
7031
7032 int Flags = 0;
7033 if (IsA16)
7034 Flags |= 1;
7035 if (IsG16)
7036 Flags |= 2;
7037 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
7038
7039 if (BaseOpcode->NoReturn) { // No TFE for stores?
7040 // TODO: Handle dmask trim
7041 if (!Ty.isVector() || !IsD16)
7042 return true;
7043
7044 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
7045 if (RepackedReg != VData) {
7046 MI.getOperand(i: 1).setReg(RepackedReg);
7047 }
7048
7049 return true;
7050 }
7051
7052 Register DstReg = MI.getOperand(i: 0).getReg();
7053 const LLT EltTy = Ty.getScalarType();
7054 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7055
7056 // Confirm that the return type is large enough for the dmask specified
7057 if (NumElts < DMaskLanes)
7058 return false;
7059
7060 if (NumElts > 4 || DMaskLanes > 4)
7061 return false;
7062
7063 // Image atomic instructions use DMask to specify how many bits the
7064 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
7065 // DMaskLanes defaults to 0 for image atomics.
7066 // We must make sure that atomic variants (especially packed ones) are not
7067 // truncated from v2s16 or v4s16 to s16.
7068 //
7069 // changeElementCount will be needed for image loads, where Ty is always scalar.
7070 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7071 const LLT AdjustedTy =
7072 DMaskLanes == 0
7073 ? Ty
7074 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
7075
7076 // The raw dword-aligned data component of the load. The only legal cases
7077 // where this matters should be when using the packed D16 format, for
7078 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
7079 LLT RoundedTy;
7080
7081 // S32 vector to cover all data, plus TFE result element.
7082 LLT TFETy;
7083
7084 // Register type to use for each loaded component. Will be S32 or V2S16.
7085 LLT RegTy;
7086
7087 if (IsD16 && ST.hasUnpackedD16VMem()) {
7088 RoundedTy =
7089 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
7090 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
7091 RegTy = S32;
7092 } else {
7093 unsigned EltSize = EltTy.getSizeInBits();
7094 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7095 unsigned RoundedSize = 32 * RoundedElts;
7096 RoundedTy = LLT::scalarOrVector(
7097 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
7098 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
7099 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7100 }
7101
7102 // The return type does not need adjustment.
7103 // TODO: Should we change s16 case to s32 or <2 x s16>?
7104 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7105 return true;
7106
7107 Register Dst1Reg;
7108
7109 // Insert after the instruction.
7110 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
7111
7112 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7113 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7114 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7115 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7116
7117 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
7118
7119 MI.getOperand(i: 0).setReg(NewResultReg);
7120
7121 // In the IR, TFE is supposed to be used with a 2-element struct return
7122 // type. The instruction really returns these two values in one contiguous
7123 // register, with one additional dword beyond the loaded data. Rewrite the
7124 // return type to use a single register result.
7125
7126 if (IsTFE) {
7127 Dst1Reg = MI.getOperand(i: 1).getReg();
7128 if (MRI->getType(Reg: Dst1Reg) != S32)
7129 return false;
7130
7131 // TODO: Make sure the TFE operand bit is set.
7132 MI.removeOperand(OpNo: 1);
7133
7134 // Handle the easy case that requires no repack instructions.
7135 if (Ty == S32) {
7136 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
7137 return true;
7138 }
7139 }
7140
7141 // Now figure out how to copy the new result register back into the old
7142 // result.
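  // Initialize every element to Dst1Reg so that, in the TFE case, the final
  // (status) element of the unmerge below is written directly into the
  // original TFE destination register.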
7143 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7144
7145 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7146
7147 if (ResultNumRegs == 1) {
7148 assert(!IsTFE);
7149 ResultRegs[0] = NewResultReg;
7150 } else {
7151 // We have to repack into a new vector of some kind.
7152 for (int I = 0; I != NumDataRegs; ++I)
7153 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
7154 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
7155
7156 // Drop the final TFE element to get the data part. The TFE result is
7157 // directly written to the right place already.
7158 if (IsTFE)
7159 ResultRegs.resize(N: NumDataRegs);
7160 }
7161
7162 // For an s16 scalar result, we form an s32 result with a truncate regardless
7163 // of packed vs. unpacked.
7164 if (IsD16 && !Ty.isVector()) {
7165 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
7166 return true;
7167 }
7168
7169 // Avoid a build/concat_vector of 1 entry.
7170 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7171 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
7172 return true;
7173 }
7174
7175 assert(Ty.isVector());
7176
7177 if (IsD16) {
7178 // For packed D16 results with TFE enabled, all the data components are
7179 // S32. Cast back to the expected type.
7180 //
7181 // TODO: We don't really need to load s32 elements. We would only need one
7182 // cast for the TFE result if a multiple of v2s16 was used.
7183 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7184 for (Register &Reg : ResultRegs)
7185 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
7186 } else if (ST.hasUnpackedD16VMem()) {
7187 for (Register &Reg : ResultRegs)
7188 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
7189 }
7190 }
7191
7192 auto padWithUndef = [&](LLT Ty, int NumElts) {
7193 if (NumElts == 0)
7194 return;
7195 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
7196 for (int I = 0; I != NumElts; ++I)
7197 ResultRegs.push_back(Elt: Undef);
7198 };
7199
7200 // Pad out any elements eliminated due to the dmask.
7201 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
7202 if (!ResTy.isVector()) {
7203 padWithUndef(ResTy, NumElts - ResultRegs.size());
7204 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
7205 return true;
7206 }
7207
7208 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7209 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7210
7211 // Deal with the one annoying legal case.
7212 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
7213 if (Ty == V3S16) {
7214 if (IsTFE) {
7215 if (ResultRegs.size() == 1) {
7216 NewResultReg = ResultRegs[0];
7217 } else if (ResultRegs.size() == 2) {
7218 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
7219 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
7220 } else {
7221 return false;
7222 }
7223 }
7224
7225 if (MRI->getType(Reg: DstReg).getNumElements() <
7226 MRI->getType(Reg: NewResultReg).getNumElements()) {
7227 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
7228 } else {
7229 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
7230 }
7231 return true;
7232 }
7233
7234 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7235 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
7236 return true;
7237}
7238
7239bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7240 MachineInstr &MI) const {
7241 MachineIRBuilder &B = Helper.MIRBuilder;
7242 GISelChangeObserver &Observer = Helper.Observer;
7243
7244 Register OrigDst = MI.getOperand(i: 0).getReg();
7245 Register Dst;
7246 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
7247 unsigned Size = Ty.getSizeInBits();
7248 MachineFunction &MF = B.getMF();
7249 unsigned Opc = 0;
7250 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7251 assert(Size == 8 || Size == 16);
7252 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7253 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7254 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7255 // destination register.
7256 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
7257 } else {
7258 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7259 Dst = OrigDst;
7260 }
7261
7262 Observer.changingInstr(MI);
7263
7264 // Handle needing to s.buffer.load() a p8 value.
7265 if (hasBufferRsrcWorkaround(Ty)) {
7266 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
7267 B.setInsertPt(MBB&: B.getMBB(), II: MI);
7268 }
7269 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
7270 Ty = getBitcastRegisterType(Ty);
7271 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
7272 B.setInsertPt(MBB&: B.getMBB(), II: MI);
7273 }
7274
7275 // FIXME: We don't really need this intermediate instruction. The intrinsic
7276 // should be fixed to have a memory operand. Since it's readnone, we're not
7277 // allowed to add one.
7278 MI.setDesc(B.getTII().get(Opcode: Opc));
7279 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
7280
7281 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7282 const unsigned MemSize = (Size + 7) / 8;
7283 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7284 Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
7285 MachineMemOperand *MMO = MF.getMachineMemOperand(
7286 PtrInfo: MachinePointerInfo(),
7287 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7288 MachineMemOperand::MOInvariant,
7289 Size: MemSize, BaseAlignment: MemAlign);
7290 MI.addMemOperand(MF, MO: MMO);
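  // For the sub-dword case the new opcode produced a 32-bit result; truncate
  // it back into the original narrow destination.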
7291 if (Dst != OrigDst) {
7292 MI.getOperand(i: 0).setReg(Dst);
7293 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
7294 B.buildTrunc(Res: OrigDst, Op: Dst);
7295 }
7296
7297 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7298 // always be legal. We may need to restore this to a 96-bit result if it turns
7299 // out this needs to be converted to a vector load during RegBankSelect.
7300 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7301 if (Ty.isVector())
7302 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
7303 else
7304 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
7305 }
7306
7307 Observer.changedInstr(MI);
7308 return true;
7309}
7310
7311bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7312 MachineInstr &MI) const {
7313 MachineIRBuilder &B = Helper.MIRBuilder;
7314 GISelChangeObserver &Observer = Helper.Observer;
7315 Observer.changingInstr(MI);
7316 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7317 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
7318 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
7319 Observer.changedInstr(MI);
7320 return true;
7321}
7322
7323// TODO: Move to selection
7324bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7325 MachineRegisterInfo &MRI,
7326 MachineIRBuilder &B) const {
7327 if (!ST.hasTrapHandler() ||
7328 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7329 return legalizeTrapEndpgm(MI, MRI, B);
7330
7331 return ST.supportsGetDoorbellID() ?
7332 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7333}
7334
7335bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7336 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7337 const DebugLoc &DL = MI.getDebugLoc();
7338 MachineBasicBlock &BB = B.getMBB();
7339 MachineFunction *MF = BB.getParent();
7340
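  // If the trap is already at the end of a block with no successors, we can
  // simply replace it with S_ENDPGM in place.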
7341 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
7342 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
7343 .addImm(Val: 0);
7344 MI.eraseFromParent();
7345 return true;
7346 }
7347
7348 // We need a block split to make the real endpgm a terminator. We also don't
7349 // want to break phis in successor blocks, so we can't just delete to the
7350 // end of the block.
7351 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
7352 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7353 MF->push_back(MBB: TrapBB);
7354 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
7355 .addImm(Val: 0);
7356 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
7357 .addMBB(MBB: TrapBB);
7358
7359 BB.addSuccessor(Succ: TrapBB);
7360 MI.eraseFromParent();
7361 return true;
7362}
7363
7364bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7365 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7366 MachineFunction &MF = B.getMF();
7367 const LLT S64 = LLT::scalar(SizeInBits: 64);
7368
7369 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7370 // For code object version 5, queue_ptr is passed through implicit kernarg.
7371 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
7372 AMDGPU::AMDHSA_COV5) {
7373 AMDGPUTargetLowering::ImplicitParameter Param =
7374 AMDGPUTargetLowering::QUEUE_PTR;
7375 uint64_t Offset =
7376 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
7377
7378 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7379 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7380
7381 if (!loadInputValue(DstReg: KernargPtrReg, B,
7382 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
7383 return false;
7384
7385 // TODO: can we be smarter about machine pointer info?
7386 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
7387 MachineMemOperand *MMO = MF.getMachineMemOperand(
7388 PtrInfo: PtrInfo.getWithOffset(O: Offset),
7389 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7390 MachineMemOperand::MOInvariant,
7391 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
7392
7393 // Compute the address of the queue_ptr within the implicit kernargs.
7394 Register LoadAddr = MRI.createGenericVirtualRegister(
7395 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7396 B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
7397 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
7398 // Load the queue pointer value to pass to the trap handler.
7399 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
7400 B.buildCopy(Res: SGPR01, Op: Temp);
7401 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7402 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7403 .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
7404 MI.eraseFromParent();
7405 return true;
7406 }
7407
7408 // Pass queue pointer to trap handler as input, and insert trap instruction
7409 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7410 Register LiveIn =
7411 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7412 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
7413 return false;
7414
7415 B.buildCopy(Res: SGPR01, Op: LiveIn);
7416 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7417 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7418 .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
7419
7420 MI.eraseFromParent();
7421 return true;
7422}
7423
7424bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7425 MachineRegisterInfo &MRI,
7426 MachineIRBuilder &B) const {
7427 // We need to simulate the 's_trap 2' instruction on targets that run in
7428 // PRIV=1 (where it is treated as a nop).
7429 if (ST.hasPrivEnabledTrap2NopBug()) {
7430 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7431 DL: MI.getDebugLoc());
7432 MI.eraseFromParent();
7433 return true;
7434 }
7435
7436 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7437 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7438 MI.eraseFromParent();
7439 return true;
7440}
7441
7442bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7443 MachineRegisterInfo &MRI,
7444 MachineIRBuilder &B) const {
7445 // If this is a non-HSA path or the trap handler is disabled, report a
7446 // warning accordingly.
7447 if (!ST.hasTrapHandler() ||
7448 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7449 Function &Fn = B.getMF().getFunction();
7450 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7451 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7452 } else {
7453 // Insert debug-trap instruction
7454 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7455 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7456 }
7457
7458 MI.eraseFromParent();
7459 return true;
7460}
7461
7462bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7463 MachineInstr &MI, MachineIRBuilder &B) const {
7464 MachineRegisterInfo &MRI = *B.getMRI();
7465 const LLT S16 = LLT::scalar(SizeInBits: 16);
7466 const LLT S32 = LLT::scalar(SizeInBits: 32);
7467 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
7468 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
7469
7470 Register DstReg = MI.getOperand(i: 0).getReg();
7471 Register NodePtr = MI.getOperand(i: 2).getReg();
7472 Register RayExtent = MI.getOperand(i: 3).getReg();
7473 Register RayOrigin = MI.getOperand(i: 4).getReg();
7474 Register RayDir = MI.getOperand(i: 5).getReg();
7475 Register RayInvDir = MI.getOperand(i: 6).getReg();
7476 Register TDescr = MI.getOperand(i: 7).getReg();
7477
7478 if (!ST.hasGFX10_AEncoding()) {
7479 Function &Fn = B.getMF().getFunction();
7480 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7481 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7482 return false;
7483 }
7484
7485 const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
7486 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
7487 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
7488 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
7489 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
7490 const unsigned NumVDataDwords = 4;
7491 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7492 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7493 const bool UseNSA =
7494 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7495
7496 const unsigned BaseOpcodes[2][2] = {
7497 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7498 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7499 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7500 int Opcode;
7501 if (UseNSA) {
7502 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7503 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7504 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7505 : AMDGPU::MIMGEncGfx10NSA,
7506 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7507 } else {
7508 assert(!IsGFX12Plus);
7509 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7510 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7511 : AMDGPU::MIMGEncGfx10Default,
7512 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7513 }
7514 assert(Opcode != -1);
7515
7516 SmallVector<Register, 12> Ops;
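  // Build the VAddr operand list: node pointer, ray extent, then the ray
  // origin, direction, and inverse direction, packed according to the
  // encoding and address size in use.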
7517 if (UseNSA && IsGFX11Plus) {
7518 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7519 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7520 auto Merged = B.buildMergeLikeInstr(
7521 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
7522 Ops.push_back(Elt: Merged.getReg(Idx: 0));
7523 };
7524
7525 Ops.push_back(Elt: NodePtr);
7526 Ops.push_back(Elt: RayExtent);
7527 packLanes(RayOrigin);
7528
7529 if (IsA16) {
7530 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7531 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7532 auto MergedDir = B.buildMergeLikeInstr(
7533 Res: V3S32,
7534 Ops: {B.buildBitcast(
7535 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
7536 UnmergeRayDir.getReg(Idx: 0)}))
7537 .getReg(Idx: 0),
7538 B.buildBitcast(
7539 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
7540 UnmergeRayDir.getReg(Idx: 1)}))
7541 .getReg(Idx: 0),
7542 B.buildBitcast(
7543 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
7544 UnmergeRayDir.getReg(Idx: 2)}))
7545 .getReg(Idx: 0)});
7546 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
7547 } else {
7548 packLanes(RayDir);
7549 packLanes(RayInvDir);
7550 }
7551 } else {
7552 if (Is64) {
7553 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
7554 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7555 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7556 } else {
7557 Ops.push_back(Elt: NodePtr);
7558 }
7559 Ops.push_back(Elt: RayExtent);
7560
7561 auto packLanes = [&Ops, &S32, &B](Register Src) {
7562 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7563 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7564 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7565 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
7566 };
7567
7568 packLanes(RayOrigin);
7569 if (IsA16) {
7570 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7571 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7572 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
7573 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
7574 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
7575 B.buildMergeLikeInstr(Res: R1,
7576 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
7577 B.buildMergeLikeInstr(
7578 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
7579 B.buildMergeLikeInstr(
7580 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
7581 Ops.push_back(Elt: R1);
7582 Ops.push_back(Elt: R2);
7583 Ops.push_back(Elt: R3);
7584 } else {
7585 packLanes(RayDir);
7586 packLanes(RayInvDir);
7587 }
7588 }
7589
7590 if (!UseNSA) {
7591 // Build a single vector containing all the operands prepared so far.
7592 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
7593 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
7594 Ops.clear();
7595 Ops.push_back(Elt: MergedOps);
7596 }
7597
7598 auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7599 .addDef(RegNo: DstReg)
7600 .addImm(Val: Opcode);
7601
7602 for (Register R : Ops) {
7603 MIB.addUse(RegNo: R);
7604 }
7605
7606 MIB.addUse(RegNo: TDescr)
7607 .addImm(Val: IsA16 ? 1 : 0)
7608 .cloneMemRefs(OtherMI: MI);
7609
7610 MI.eraseFromParent();
7611 return true;
7612}
7613
7614bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7615 MachineInstr &MI, MachineIRBuilder &B) const {
7616 const LLT S32 = LLT::scalar(SizeInBits: 32);
7617 const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
7618
7619 Register DstReg = MI.getOperand(i: 0).getReg();
7620 Register DstOrigin = MI.getOperand(i: 1).getReg();
7621 Register DstDir = MI.getOperand(i: 2).getReg();
7622 Register NodePtr = MI.getOperand(i: 4).getReg();
7623 Register RayExtent = MI.getOperand(i: 5).getReg();
7624 Register InstanceMask = MI.getOperand(i: 6).getReg();
7625 Register RayOrigin = MI.getOperand(i: 7).getReg();
7626 Register RayDir = MI.getOperand(i: 8).getReg();
7627 Register Offsets = MI.getOperand(i: 9).getReg();
7628 Register TDescr = MI.getOperand(i: 10).getReg();
7629
7630 if (!ST.hasBVHDualAndBVH8Insts()) {
7631 Function &Fn = B.getMF().getFunction();
7632 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7633 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7634 return false;
7635 }
7636
7637 bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
7638 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7639 const unsigned NumVDataDwords = 10;
7640 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7641 int Opcode = AMDGPU::getMIMGOpcode(
7642 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7643 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7644 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7645 assert(Opcode != -1);
7646
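  // Pack the ray extent and the (any-extended) instance mask into a single
  // <2 x s32> operand.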
7647 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7648 Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});
7649
7650 B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7651 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7652 .addDef(RegNo: DstReg)
7653 .addDef(RegNo: DstOrigin)
7654 .addDef(RegNo: DstDir)
7655 .addImm(Val: Opcode)
7656 .addUse(RegNo: NodePtr)
7657 .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
7658 .addUse(RegNo: RayOrigin)
7659 .addUse(RegNo: RayDir)
7660 .addUse(RegNo: Offsets)
7661 .addUse(RegNo: TDescr)
7662 .cloneMemRefs(OtherMI: MI);
7663
7664 MI.eraseFromParent();
7665 return true;
7666}
7667
7668bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7669 MachineIRBuilder &B) const {
7670 const SITargetLowering *TLI = ST.getTargetLowering();
7671 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7672 Register DstReg = MI.getOperand(i: 0).getReg();
7673 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7674 MI.eraseFromParent();
7675 return true;
7676}
7677
7678bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7679 MachineIRBuilder &B) const {
7680 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7681 if (!ST.hasArchitectedSGPRs())
7682 return false;
7683 LLT S32 = LLT::scalar(SizeInBits: 32);
7684 Register DstReg = MI.getOperand(i: 0).getReg();
7685 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7686 auto LSB = B.buildConstant(Res: S32, Val: 25);
7687 auto Width = B.buildConstant(Res: S32, Val: 5);
7688 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7689 MI.eraseFromParent();
7690 return true;
7691}
7692
7693bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7694 MachineIRBuilder &B,
7695 AMDGPU::Hwreg::Id HwReg,
7696 unsigned LowBit,
7697 unsigned Width) const {
7698 MachineRegisterInfo &MRI = *B.getMRI();
7699 Register DstReg = MI.getOperand(i: 0).getReg();
7700 if (!MRI.getRegClassOrNull(Reg: DstReg))
7701 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32RegClass);
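  // Read Width bits starting at LowBit of the requested hardware register.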
7702 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
7703 .addDef(RegNo: DstReg)
7704 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width));
7705 MI.eraseFromParent();
7706 return true;
7707}
7708
7709static constexpr unsigned FPEnvModeBitField =
7710 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
7711
7712static constexpr unsigned FPEnvTrapBitField =
7713 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
7714
7715bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7716 MachineRegisterInfo &MRI,
7717 MachineIRBuilder &B) const {
7718 Register Src = MI.getOperand(i: 0).getReg();
7719 if (MRI.getType(Reg: Src) != S64)
7720 return false;
7721
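  // Read the MODE and TRAPSTS hardware registers and pack them into the
  // 64-bit result: mode bits in the low dword, trap-status bits in the high
  // dword.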
7722 auto ModeReg =
7723 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7724 /*HasSideEffects=*/true, /*isConvergent=*/false)
7725 .addImm(Val: FPEnvModeBitField);
7726 auto TrapReg =
7727 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7728 /*HasSideEffects=*/true, /*isConvergent=*/false)
7729 .addImm(Val: FPEnvTrapBitField);
7730 B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
7731 MI.eraseFromParent();
7732 return true;
7733}
7734
7735bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7736 MachineRegisterInfo &MRI,
7737 MachineIRBuilder &B) const {
7738 Register Src = MI.getOperand(i: 0).getReg();
7739 if (MRI.getType(Reg: Src) != S64)
7740 return false;
7741
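  // Split the 64-bit FP environment value and write the low dword to MODE and
  // the high dword to TRAPSTS.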
7742 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7743 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7744 /*HasSideEffects=*/true, /*isConvergent=*/false)
7745 .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
7746 .addReg(RegNo: Unmerge.getReg(Idx: 0));
7747 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7748 /*HasSideEffects=*/true, /*isConvergent=*/false)
7749 .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
7750 .addReg(RegNo: Unmerge.getReg(Idx: 1));
7751 MI.eraseFromParent();
7752 return true;
7753}
7754
7755bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7756 MachineInstr &MI) const {
7757 MachineIRBuilder &B = Helper.MIRBuilder;
7758 MachineRegisterInfo &MRI = *B.getMRI();
7759
7760 // Replace the consuming G_BRCOND with the exec-manipulating branch pseudos.
7761 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
7762 switch (IntrID) {
7763 case Intrinsic::sponentry:
7764 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
7765 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
7766 // that we can remove this cast.
7767 const LLT S32 = LLT::scalar(SizeInBits: 32);
7768 Register TmpReg = MRI.createGenericVirtualRegister(Ty: S32);
7769 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_SPONENTRY).addDef(RegNo: TmpReg);
7770
7771 Register DstReg = MI.getOperand(i: 0).getReg();
7772 B.buildIntToPtr(Dst: DstReg, Src: TmpReg);
7773 MI.eraseFromParent();
7774 } else {
7775 int FI = B.getMF().getFrameInfo().CreateFixedObject(
7776 Size: 1, SPOffset: 0, /*IsImmutable=*/false);
7777 B.buildFrameIndex(Res: MI.getOperand(i: 0), Idx: FI);
7778 MI.eraseFromParent();
7779 }
7780 return true;
7781 case Intrinsic::amdgcn_if:
7782 case Intrinsic::amdgcn_else: {
7783 MachineInstr *Br = nullptr;
7784 MachineBasicBlock *UncondBrTarget = nullptr;
7785 bool Negated = false;
7786 if (MachineInstr *BrCond =
7787 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7788 const SIRegisterInfo *TRI
7789 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7790
7791 Register Def = MI.getOperand(i: 1).getReg();
7792 Register Use = MI.getOperand(i: 3).getReg();
7793
7794 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7795
7796 if (Negated)
7797 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7798
7799 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7800 if (IntrID == Intrinsic::amdgcn_if) {
7801 B.buildInstr(Opcode: AMDGPU::SI_IF)
7802 .addDef(RegNo: Def)
7803 .addUse(RegNo: Use)
7804 .addMBB(MBB: UncondBrTarget);
7805 } else {
7806 B.buildInstr(Opcode: AMDGPU::SI_ELSE)
7807 .addDef(RegNo: Def)
7808 .addUse(RegNo: Use)
7809 .addMBB(MBB: UncondBrTarget);
7810 }
7811
7812 if (Br) {
7813 Br->getOperand(i: 0).setMBB(CondBrTarget);
7814 } else {
7815 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7816 // since we're swapping branch targets it needs to be reinserted.
7817 // FIXME: IRTranslator should probably not do this
7818 B.buildBr(Dest&: *CondBrTarget);
7819 }
7820
7821 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
7822 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
7823 MI.eraseFromParent();
7824 BrCond->eraseFromParent();
7825 return true;
7826 }
7827
7828 return false;
7829 }
7830 case Intrinsic::amdgcn_loop: {
7831 MachineInstr *Br = nullptr;
7832 MachineBasicBlock *UncondBrTarget = nullptr;
7833 bool Negated = false;
7834 if (MachineInstr *BrCond =
7835 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7836 const SIRegisterInfo *TRI
7837 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7838
7839 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7840 Register Reg = MI.getOperand(i: 2).getReg();
7841
7842 if (Negated)
7843 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7844
7845 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7846 B.buildInstr(Opcode: AMDGPU::SI_LOOP)
7847 .addUse(RegNo: Reg)
7848 .addMBB(MBB: UncondBrTarget);
7849
7850 if (Br)
7851 Br->getOperand(i: 0).setMBB(CondBrTarget);
7852 else
7853 B.buildBr(Dest&: *CondBrTarget);
7854
7855 MI.eraseFromParent();
7856 BrCond->eraseFromParent();
7857 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
7858 return true;
7859 }
7860
7861 return false;
7862 }
7863 case Intrinsic::amdgcn_addrspacecast_nonnull:
7864 return legalizeAddrSpaceCast(MI, MRI, B);
7865 case Intrinsic::amdgcn_make_buffer_rsrc:
7866 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7867 case Intrinsic::amdgcn_kernarg_segment_ptr:
7868 if (!AMDGPU::isKernel(F: B.getMF().getFunction())) {
7869 // This only makes sense to call in a kernel, so just lower to null.
7870 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
7871 MI.eraseFromParent();
7872 return true;
7873 }
7874
7875 return legalizePreloadedArgIntrin(
7876 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7877 case Intrinsic::amdgcn_implicitarg_ptr:
7878 return legalizeImplicitArgPtr(MI, MRI, B);
7879 case Intrinsic::amdgcn_workitem_id_x:
7880 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
7881 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7882 case Intrinsic::amdgcn_workitem_id_y:
7883 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
7884 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7885 case Intrinsic::amdgcn_workitem_id_z:
7886 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
7887 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7888 case Intrinsic::amdgcn_workgroup_id_x:
7889 return legalizeWorkGroupId(
7890 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
7891 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
7892 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
7893 case Intrinsic::amdgcn_workgroup_id_y:
7894 return legalizeWorkGroupId(
7895 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
7896 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
7897 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
7898 case Intrinsic::amdgcn_workgroup_id_z:
7899 return legalizeWorkGroupId(
7900 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
7901 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
7902 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
7903 case Intrinsic::amdgcn_cluster_id_x:
7904 return ST.hasClusters() &&
7905 legalizePreloadedArgIntrin(MI, MRI, B,
7906 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7907 case Intrinsic::amdgcn_cluster_id_y:
7908 return ST.hasClusters() &&
7909 legalizePreloadedArgIntrin(MI, MRI, B,
7910 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7911 case Intrinsic::amdgcn_cluster_id_z:
7912 return ST.hasClusters() &&
7913 legalizePreloadedArgIntrin(MI, MRI, B,
7914 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7915 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7916 return ST.hasClusters() &&
7917 legalizePreloadedArgIntrin(
7918 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
7919 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7920 return ST.hasClusters() &&
7921 legalizePreloadedArgIntrin(
7922 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
7923 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7924 return ST.hasClusters() &&
7925 legalizePreloadedArgIntrin(
7926 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
7927 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7928 return ST.hasClusters() &&
7929 legalizeConstHwRegRead(MI, B, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4);
7930 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7931 return ST.hasClusters() &&
7932 legalizePreloadedArgIntrin(
7933 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
7934 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7935 return ST.hasClusters() &&
7936 legalizePreloadedArgIntrin(
7937 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
7938 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7939 return ST.hasClusters() &&
7940 legalizePreloadedArgIntrin(
7941 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
7942 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7943 return ST.hasClusters() &&
7944 legalizePreloadedArgIntrin(
7945 MI, MRI, B,
7946 ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
7947 case Intrinsic::amdgcn_wave_id:
7948 return legalizeWaveID(MI, B);
7949 case Intrinsic::amdgcn_lds_kernel_id:
7950 return legalizePreloadedArgIntrin(MI, MRI, B,
7951 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7952 case Intrinsic::amdgcn_dispatch_ptr:
7953 return legalizePreloadedArgIntrin(MI, MRI, B,
7954 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
7955 case Intrinsic::amdgcn_queue_ptr:
7956 return legalizePreloadedArgIntrin(MI, MRI, B,
7957 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
7958 case Intrinsic::amdgcn_implicit_buffer_ptr:
7959 return legalizePreloadedArgIntrin(
7960 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7961 case Intrinsic::amdgcn_dispatch_id:
7962 return legalizePreloadedArgIntrin(MI, MRI, B,
7963 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
7964 case Intrinsic::r600_read_ngroups_x:
7965 // TODO: Emit error for hsa
7966 return legalizeKernargMemParameter(MI, B,
7967 Offset: SI::KernelInputOffsets::NGROUPS_X);
7968 case Intrinsic::r600_read_ngroups_y:
7969 return legalizeKernargMemParameter(MI, B,
7970 Offset: SI::KernelInputOffsets::NGROUPS_Y);
7971 case Intrinsic::r600_read_ngroups_z:
7972 return legalizeKernargMemParameter(MI, B,
7973 Offset: SI::KernelInputOffsets::NGROUPS_Z);
7974 case Intrinsic::r600_read_local_size_x:
7975 // TODO: Could insert G_ASSERT_ZEXT from s16
7976 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
7977 case Intrinsic::r600_read_local_size_y:
7978 // TODO: Could insert G_ASSERT_ZEXT from s16
7979 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
7980 case Intrinsic::r600_read_local_size_z:
7981 // TODO: Could insert G_ASSERT_ZEXT from s16
7982 return legalizeKernargMemParameter(MI, B,
7983 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
7984 case Intrinsic::amdgcn_fdiv_fast:
7985 return legalizeFDIVFastIntrin(MI, MRI, B);
7986 case Intrinsic::amdgcn_is_shared:
7987 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
7988 case Intrinsic::amdgcn_is_private:
7989 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
7990 case Intrinsic::amdgcn_wavefrontsize: {
7991 B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
7992 MI.eraseFromParent();
7993 return true;
7994 }
7995 case Intrinsic::amdgcn_s_buffer_load:
7996 return legalizeSBufferLoad(Helper, MI);
7997 case Intrinsic::amdgcn_raw_buffer_store:
7998 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7999 case Intrinsic::amdgcn_struct_buffer_store:
8000 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8001 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
8002 case Intrinsic::amdgcn_raw_buffer_store_format:
8003 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8004 case Intrinsic::amdgcn_struct_buffer_store_format:
8005 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8006 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
8007 case Intrinsic::amdgcn_raw_tbuffer_store:
8008 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8009 case Intrinsic::amdgcn_struct_tbuffer_store:
8010 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8011 return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
8012 case Intrinsic::amdgcn_raw_buffer_load:
8013 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8014 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8015 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8016 case Intrinsic::amdgcn_struct_buffer_load:
8017 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8018 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8019 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8020 return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
8021 case Intrinsic::amdgcn_raw_buffer_load_format:
8022 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8023 case Intrinsic::amdgcn_struct_buffer_load_format:
8024 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8025 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
8026 case Intrinsic::amdgcn_raw_tbuffer_load:
8027 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8028 case Intrinsic::amdgcn_struct_tbuffer_load:
8029 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8030 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
8031 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8032 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8033 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8034 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8035 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8036 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8037 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8038 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8039 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8040 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8041 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8042 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8043 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8044 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8045 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8046 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8047 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8048 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8049 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8050 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8051 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8052 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8053 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8054 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8055 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8056 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8057 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8058 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8059 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8060 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8061 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8062 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8063 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8064 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8065 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8066 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8067 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8068 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8069 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8070 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8071 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8072 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8073 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8074 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8075 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8077 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8079 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8080 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8081 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8082 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8083 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8084 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8085 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8086 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8087 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8088 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8089 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8090 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8091 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8092 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8093 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8094 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8095 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8097 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8099 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8100 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8101 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8102 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8103 return legalizeBufferAtomic(MI, B, IID: IntrID);
8104 case Intrinsic::amdgcn_rsq_clamp:
8105 return legalizeRsqClampIntrinsic(MI, MRI, B);
8106 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8107 return legalizeBVHIntersectRayIntrinsic(MI, B);
8108 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8109 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8110 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
8111 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8112 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8113 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8114 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8115 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8116 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8117 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8118 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
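    // Operand 5 is the index operand; these variants expect it as s64, so
    // any-extend narrower values.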
8119 Register Index = MI.getOperand(i: 5).getReg();
8120 LLT S64 = LLT::scalar(SizeInBits: 64);
8121 if (MRI.getType(Reg: Index) != S64)
8122 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S64, Op: Index).getReg(Idx: 0));
8123 return true;
8124 }
8125 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8126 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8127 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8128 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8129 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8130 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8131 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8132 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8133 Register Index = MI.getOperand(i: 5).getReg();
8134 LLT S32 = LLT::scalar(SizeInBits: 32);
8135 if (MRI.getType(Reg: Index) != S32)
8136 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
8137 return true;
8138 }
8139 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8140 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8141 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8142 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8143 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8144 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8145 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8146 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8147 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
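    // Operand 7 is the index operand; the 16x16x128_iu8 variant expects s64
    // while the others expect s32, so any-extend to the required width.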
8148 Register Index = MI.getOperand(i: 7).getReg();
8149 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8150 ? LLT::scalar(SizeInBits: 64)
8151 : LLT::scalar(SizeInBits: 32);
8152 if (MRI.getType(Reg: Index) != IdxTy)
8153 MI.getOperand(i: 7).setReg(B.buildAnyExt(Res: IdxTy, Op: Index).getReg(Idx: 0));
8154 return true;
8155 }
8156
8157 case Intrinsic::amdgcn_fmed3: {
8158 GISelChangeObserver &Observer = Helper.Observer;
8159
8160 // FIXME: This is to work around the inability of tablegen match combiners
8161 // to match intrinsics in patterns.
8162 Observer.changingInstr(MI);
8163 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
8164 MI.removeOperand(OpNo: 1);
8165 Observer.changedInstr(MI);
8166 return true;
8167 }
8168 case Intrinsic::amdgcn_readlane:
8169 case Intrinsic::amdgcn_writelane:
8170 case Intrinsic::amdgcn_readfirstlane:
8171 case Intrinsic::amdgcn_permlane16:
8172 case Intrinsic::amdgcn_permlanex16:
8173 case Intrinsic::amdgcn_permlane64:
8174 case Intrinsic::amdgcn_set_inactive:
8175 case Intrinsic::amdgcn_set_inactive_chain_arg:
8176 case Intrinsic::amdgcn_mov_dpp8:
8177 case Intrinsic::amdgcn_update_dpp:
8178 return legalizeLaneOp(Helper, MI, IID: IntrID);
8179 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8180 return legalizeSBufferPrefetch(Helper, MI);
8181 case Intrinsic::amdgcn_dead: {
8182 // TODO: Use poison instead of undef
8183 for (const MachineOperand &Def : MI.defs())
8184 B.buildUndef(Res: Def);
8185 MI.eraseFromParent();
8186 return true;
8187 }
8188 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8189 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8190 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
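    // These lower to a plain G_LOAD using the memory operand the IRTranslator
    // attached to the intrinsic.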
8191 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8192 B.buildLoad(Res: MI.getOperand(i: 0), Addr: MI.getOperand(i: 2), MMO&: **MI.memoperands_begin());
8193 MI.eraseFromParent();
8194 return true;
8195 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8196 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8197 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
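    // Similarly, these lower to a plain G_STORE with the attached memory
    // operand.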
8198 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8199 B.buildStore(Val: MI.getOperand(i: 2), Addr: MI.getOperand(i: 1), MMO&: **MI.memoperands_begin());
8200 MI.eraseFromParent();
8201 return true;
8202 default: {
8203 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8204 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
8205 return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
8206 return true;
8207 }
8208 }
8209
8210 return true;
8211}
8212