1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
30#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
32#include "llvm/CodeGen/GlobalISel/Utils.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/PseudoSourceValueManager.h"
35#include "llvm/CodeGen/TargetOpcodes.h"
36#include "llvm/IR/DiagnosticInfo.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
49static cl::opt<bool> EnableNewLegality(
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(Val: false),
54 cl::ReallyHidden);
55
// Widest register (in bits) that legalization may form; matches the largest
// SGPR/VGPR tuple size (1024-bit) used elsewhere in this file.
static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
59static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
62 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
69 return LLT::scalar(SizeInBits: Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
110 };
111}
112
113static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
144static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(x: TypeIdx, y: LLT::scalar(SizeInBits: MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
152static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: Ty.getElementType()));
170 };
171}
172
173static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(SizeInBits: 128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
178}
179
180static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
185}
186
187static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(SizeInBits: Size);
194 }
195
196 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
206static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Size: Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up the maximum register size, and
266// multiples of v2s16.
267static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Ty: Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
// Shorthand LLT constants used throughout the legalization rules below.

// Scalar integer types.
constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
constexpr LLT S8 = LLT::scalar(SizeInBits: 8);
constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
constexpr LLT F32 = LLT::scalar(SizeInBits: 32); // TODO: Expected float32
constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
constexpr LLT F64 = LLT::scalar(SizeInBits: 64); // TODO: Expected float64
constexpr LLT S96 = LLT::scalar(SizeInBits: 96);
constexpr LLT S128 = LLT::scalar(SizeInBits: 128);
constexpr LLT S160 = LLT::scalar(SizeInBits: 160);
constexpr LLT S192 = LLT::scalar(SizeInBits: 192);
constexpr LLT S224 = LLT::scalar(SizeInBits: 224);
constexpr LLT S256 = LLT::scalar(SizeInBits: 256);
constexpr LLT S512 = LLT::scalar(SizeInBits: 512);
constexpr LLT S1024 = LLT::scalar(SizeInBits: 1024);
// Widest scalar we ever form (see MaxRegisterSize).
constexpr LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);

// Fixed vectors of 8/16-bit elements.
constexpr LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
constexpr LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
constexpr LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
constexpr LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
constexpr LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
constexpr LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
constexpr LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
constexpr LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);

// TODO: Expected LLT::fixed_vector(2, LLT::float16())
constexpr LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::scalar(SizeInBits: 16));
constexpr LLT V2BF16 = V2F16; // FIXME

// Fixed vectors of 32-bit elements.
constexpr LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
constexpr LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
constexpr LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
constexpr LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
constexpr LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
constexpr LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
constexpr LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
constexpr LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
constexpr LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
constexpr LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
constexpr LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
constexpr LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
constexpr LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);

// Fixed vectors of 64-bit elements.
constexpr LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
constexpr LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
constexpr LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
constexpr LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
constexpr LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
constexpr LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
constexpr LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
constexpr LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);

// Fixed vectors of 128-bit elements.
constexpr LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
constexpr LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);

// Groups consumed by the legality-rule builders and isRegisterClassType.
constexpr std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};

constexpr std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

constexpr std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

constexpr std::initializer_list<LLT> AllS64Vectors = {
    V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

// Union of the vector groups above.
constexpr std::initializer_list<LLT> AllVectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
    V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
    V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
375
376 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
377 is_contained(Set: AllScalarTypes, Element: Ty) ||
378 (ST.useRealTrue16Insts() && Ty == S16) ||
379 is_contained(Set: AllS16Vectors, Element: Ty);
380}
381
382static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Ty: Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
391static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
401static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(Value: MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
415 case AMDGPUAS::PRIVATE_ADDRESS:
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
418 case AMDGPUAS::LOCAL_ADDRESS:
419 return ST.useDS128() ? 128 : 64;
420 case AMDGPUAS::GLOBAL_ADDRESS:
421 case AMDGPUAS::CONSTANT_ADDRESS:
422 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
423 case AMDGPUAS::BUFFER_RESOURCE:
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
// Returns true when a load/store with the queried register type, memory type,
// alignment, and address space is legal as-is (no lowering/splitting needed).
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  // Type index 1 is the pointer operand's type.
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space allows (atomicity factors
  // into the limit for flat addressing).
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    IsAtomic: Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  // Whitelist of memory access widths.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // 96-bit (dwordx3) accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates them.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
                                                 Alignment: Align(AlignBits / 8)))
      return false;
  }

  return true;
}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(Ty: ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// workaround this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
527 if (EnableNewLegality)
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
536 if (hasBufferRsrcWorkaround(Ty))
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
548 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
560 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(EltTy: Ty.getElementType());
567}
568
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(Value: SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): `Opcode` (an opcode value) is passed where the `bool IsLoad`
  // parameter is expected, so any nonzero opcode is treated as a load here —
  // confirm this is intentional.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
             Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
         Fast;
}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
612 AlignInBits: Query.MMODescrs[0].AlignInBits,
613 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
614}
615
/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
///
/// After this runs, MI's operand `Idx` refers to a fresh <4 x s32>-shaped
/// register, and new instructions inserted immediately AFTER MI rebuild the
/// original pointer-typed register from it.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(i: Idx);

  const LLT PointerTy = MRI.getType(Reg: MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(Ty: PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(SizeInBits: 32);

    Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
    std::array<Register, 4> VectorElems;
    // Insert the rebuild sequence right after MI, so MI's (rewritten)
    // operand is defined before the extracts consume it.
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
    // Merge the pieces back into the original pointer-typed register (MO's
    // old register), then point MI's operand at the new vector register.
    B.buildMergeValues(Res: MO, Ops: VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector-of-p8 path: bitcast the wide s32 vector to a vector of s128, then
  // inttoptr it back to the original pointer-vector register.
  Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
  auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
  B.buildIntToPtr(Dst: MO, Src: Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
660static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Reg: Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
673 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
674 }
675 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
676 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
677}
678
679static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(i: Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
685 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
686 return;
687 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
688}
689
690AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
737 // s1 for VCC branches, s32 for SCC branches.
738 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
739
740 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
741 // elements for v3s16
742 getActionDefinitionsBuilder(Opcode: G_PHI)
743 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
744 .legalFor(Types: AllS32Vectors)
745 .legalFor(Types: AllS64Vectors)
746 .legalFor(Types: AddrSpaces64)
747 .legalFor(Types: AddrSpaces32)
748 .legalFor(Types: AddrSpaces128)
749 .legalIf(Predicate: isPointer(TypeIdx: 0))
750 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
751 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
752 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
753 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
754 .scalarize(TypeIdx: 0);
755
756 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
757 // Full set of gfx9 features.
758 if (ST.hasScalarAddSub64()) {
759 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
760 .legalFor(Types: {S64, S32, S16, V2S16})
761 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
762 .scalarize(TypeIdx: 0)
763 .minScalar(TypeIdx: 0, Ty: S16)
764 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
765 .maxScalar(TypeIdx: 0, Ty: S32);
766 } else {
767 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
768 .legalFor(Types: {S32, S16, V2S16})
769 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
770 .scalarize(TypeIdx: 0)
771 .minScalar(TypeIdx: 0, Ty: S16)
772 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
773 .maxScalar(TypeIdx: 0, Ty: S32);
774 }
775
776 if (ST.hasScalarSMulU64()) {
777 getActionDefinitionsBuilder(Opcode: G_MUL)
778 .legalFor(Types: {S64, S32, S16, V2S16})
779 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
780 .scalarize(TypeIdx: 0)
781 .minScalar(TypeIdx: 0, Ty: S16)
782 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
783 .custom();
784 } else {
785 getActionDefinitionsBuilder(Opcode: G_MUL)
786 .legalFor(Types: {S32, S16, V2S16})
787 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
788 .scalarize(TypeIdx: 0)
789 .minScalar(TypeIdx: 0, Ty: S16)
790 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
791 .custom();
792 }
793 assert(ST.hasMad64_32());
794
795 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
796 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
797 .minScalarOrElt(TypeIdx: 0, Ty: S16)
798 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
799 .scalarize(TypeIdx: 0)
800 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
801 .lower();
802 } else if (ST.has16BitInsts()) {
803 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
804 .legalFor(Types: {S32, S16})
805 .minScalar(TypeIdx: 0, Ty: S16)
806 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
807 .maxScalar(TypeIdx: 0, Ty: S32)
808 .scalarize(TypeIdx: 0);
809
810 getActionDefinitionsBuilder(Opcode: G_MUL)
811 .legalFor(Types: {S32, S16})
812 .scalarize(TypeIdx: 0)
813 .minScalar(TypeIdx: 0, Ty: S16)
814 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
815 .custom();
816 assert(ST.hasMad64_32());
817
818 // Technically the saturating operations require clamp bit support, but this
819 // was introduced at the same time as 16-bit operations.
820 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
821 .legalFor(Types: {S32, S16}) // Clamp modifier
822 .minScalar(TypeIdx: 0, Ty: S16)
823 .scalarize(TypeIdx: 0)
824 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
825 .lower();
826
827 // We're just lowering this, but it helps get a better result to try to
828 // coerce to the desired type first.
829 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
830 .minScalar(TypeIdx: 0, Ty: S16)
831 .scalarize(TypeIdx: 0)
832 .lower();
833 } else {
834 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
835 .legalFor(Types: {S32})
836 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
837 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
838 .scalarize(TypeIdx: 0);
839
840 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
841 .legalFor(Types: {S32})
842 .scalarize(TypeIdx: 0)
843 .minScalar(TypeIdx: 0, Ty: S32)
844 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
845
846 if (ST.hasMad64_32())
847 Mul.custom();
848 else
849 Mul.maxScalar(TypeIdx: 0, Ty: S32);
850
851 if (ST.hasIntClamp()) {
852 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
853 .legalFor(Types: {S32}) // Clamp modifier.
854 .scalarize(TypeIdx: 0)
855 .minScalarOrElt(TypeIdx: 0, Ty: S32)
856 .lower();
857 } else {
858 // Clamp bit support was added in VI, along with 16-bit operations.
859 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
860 .minScalar(TypeIdx: 0, Ty: S32)
861 .scalarize(TypeIdx: 0)
862 .lower();
863 }
864
865 // FIXME: DAG expansion gets better results. The widening uses the smaller
866 // range values and goes for the min/max lowering directly.
867 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
868 .minScalar(TypeIdx: 0, Ty: S32)
869 .scalarize(TypeIdx: 0)
870 .lower();
871 }
872
873 getActionDefinitionsBuilder(
874 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
875 .customFor(Types: {S32, S64})
876 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
877 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
878 .scalarize(TypeIdx: 0);
879
880 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
881 .legalFor(Types: {S32})
882 .maxScalar(TypeIdx: 0, Ty: S32);
883
884 if (ST.hasVOP3PInsts()) {
885 Mulh
886 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
887 .lowerFor(Types: {V2S8});
888 }
889
890 Mulh
891 .scalarize(TypeIdx: 0)
892 .lower();
893
894 // Report legal for any types we can handle anywhere. For the cases only legal
895 // on the SALU, RegBankSelect will be able to re-legalize.
896 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
897 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
898 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
899 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
900 .fewerElementsIf(
901 Predicate: all(P0: vectorWiderThan(TypeIdx: 0, Size: 64), P1: scalarOrEltNarrowerThan(TypeIdx: 0, Size: 64)),
902 Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
903 .widenScalarToNextPow2(TypeIdx: 0)
904 .scalarize(TypeIdx: 0);
905
906 getActionDefinitionsBuilder(
907 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
908 .legalFor(Types: {{S32, S1}, {S32, S32}})
909 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
910 .scalarize(TypeIdx: 0);
911
912 getActionDefinitionsBuilder(Opcode: G_BITCAST)
913 // Don't worry about the size constraint.
914 .legalIf(Predicate: all(P0: isRegisterClassType(ST, TypeIdx: 0), P1: isRegisterClassType(ST, TypeIdx: 1)))
915 .lower();
916
917 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
918 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
919 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
920 .legalIf(Predicate: isPointer(TypeIdx: 0))
921 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
922 .widenScalarToNextPow2(TypeIdx: 0);
923
924 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
925 .legalFor(Types: {S32, S64, S16})
926 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
927
928 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
929 .legalIf(Predicate: isRegisterClassType(ST, TypeIdx: 0))
930 // s1 and s16 are special cases because they have legal operations on
931 // them, but don't really occupy registers in the normal way.
932 .legalFor(Types: {S1, S16})
933 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
934 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
935 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
936 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
937 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
938
939 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
940
941 // If the amount is divergent, we have to do a wave reduction to get the
942 // maximum value, so this is expanded during RegBankSelect.
943 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
944 .legalFor(Types: {{PrivatePtr, S32}});
945
946 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
947 .customFor(Types: {PrivatePtr});
948 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
949 .legalFor(Types: {PrivatePtr});
950
951 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
952
953 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
954 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
955
956 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
957
958 auto &FPOpActions = getActionDefinitionsBuilder(
959 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
960 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
961 .legalFor(Types: {S32, S64});
962 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
963 .customFor(Types: {S32, S64});
964 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
965 .customFor(Types: {S32, S64});
966
967 if (ST.has16BitInsts()) {
968 if (ST.hasVOP3PInsts())
969 FPOpActions.legalFor(Types: {S16, V2S16});
970 else
971 FPOpActions.legalFor(Types: {S16});
972
973 TrigActions.customFor(Types: {S16});
974 FDIVActions.customFor(Types: {S16});
975 }
976
977 if (ST.hasPackedFP32Ops()) {
978 FPOpActions.legalFor(Types: {V2S32});
979 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
980 }
981
982 auto &MinNumMaxNumIeee =
983 getActionDefinitionsBuilder(Opcodes: {G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
984
985 if (ST.hasVOP3PInsts()) {
986 MinNumMaxNumIeee.legalFor(Types: FPTypesPK16)
987 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
988 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
989 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
990 .scalarize(TypeIdx: 0);
991 } else if (ST.has16BitInsts()) {
992 MinNumMaxNumIeee.legalFor(Types: FPTypes16).clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64).scalarize(TypeIdx: 0);
993 } else {
994 MinNumMaxNumIeee.legalFor(Types: FPTypesBase)
995 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
996 .scalarize(TypeIdx: 0);
997 }
998
999 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1000 Opcodes: {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1001
1002 if (ST.hasVOP3PInsts()) {
1003 MinNumMaxNum.customFor(Types: FPTypesPK16)
1004 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1005 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1006 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1007 .scalarize(TypeIdx: 0);
1008 } else if (ST.has16BitInsts()) {
1009 MinNumMaxNum.customFor(Types: FPTypes16)
1010 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1011 .scalarize(TypeIdx: 0);
1012 } else {
1013 MinNumMaxNum.customFor(Types: FPTypesBase)
1014 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1015 .scalarize(TypeIdx: 0);
1016 }
1017
1018 if (ST.hasVOP3PInsts())
1019 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1020
1021 FPOpActions
1022 .scalarize(TypeIdx: 0)
1023 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1024
1025 TrigActions
1026 .scalarize(TypeIdx: 0)
1027 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1028
1029 FDIVActions
1030 .scalarize(TypeIdx: 0)
1031 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1032
1033 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
1034 .legalFor(Types: FPTypesPK16)
1035 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1036 .scalarize(TypeIdx: 0)
1037 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1038
1039 if (ST.has16BitInsts()) {
1040 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1041 .legalFor(Types: {S16})
1042 .customFor(Types: {S32, S64})
1043 .scalarize(TypeIdx: 0)
1044 .unsupported();
1045 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1046 .legalFor(Types: {S32, S64, S16})
1047 .scalarize(TypeIdx: 0)
1048 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1049
1050 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1051 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
1052 .scalarize(TypeIdx: 0)
1053 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
1054 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1055 .lower();
1056
1057 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1058 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1059 .scalarize(TypeIdx: 0)
1060 .lower();
1061
1062 getActionDefinitionsBuilder(Opcode: G_FMODF)
1063 .lowerFor(Types: {S16, S32, S64})
1064 .scalarize(TypeIdx: 0)
1065 .lower();
1066 } else {
1067 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1068 .customFor(Types: {S32, S64, S16})
1069 .scalarize(TypeIdx: 0)
1070 .unsupported();
1071
1072
1073 if (ST.hasFractBug()) {
1074 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1075 .customFor(Types: {S64})
1076 .legalFor(Types: {S32, S64})
1077 .scalarize(TypeIdx: 0)
1078 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1079 } else {
1080 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1081 .legalFor(Types: {S32, S64})
1082 .scalarize(TypeIdx: 0)
1083 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1084 }
1085
1086 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1087 .legalFor(Types: {{S32, S32}, {S64, S32}})
1088 .scalarize(TypeIdx: 0)
1089 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1090 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1091 .lower();
1092
1093 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1094 .customFor(Types: {{S32, S32}, {S64, S32}})
1095 .scalarize(TypeIdx: 0)
1096 .minScalar(TypeIdx: 0, Ty: S32)
1097 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1098 .lower();
1099
1100 getActionDefinitionsBuilder(Opcode: G_FMODF)
1101 .lowerFor(Types: {S32, S64})
1102 .scalarize(TypeIdx: 0)
1103 .lower();
1104 }
1105
1106 auto &FPTruncActions = getActionDefinitionsBuilder(Opcode: G_FPTRUNC);
1107 if (ST.hasCvtPkF16F32Inst()) {
1108 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1109 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1110 } else {
1111 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}});
1112 }
1113 FPTruncActions.scalarize(TypeIdx: 0).lower();
1114
1115 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1116 .legalFor(Types: {{S64, S32}, {S32, S16}})
1117 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1118 .scalarize(TypeIdx: 0);
1119
1120 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1121 if (ST.has16BitInsts()) {
1122 FSubActions
1123 // Use actual fsub instruction
1124 .legalFor(Types: {S32, S16})
1125 // Must use fadd + fneg
1126 .lowerFor(Types: {S64, V2S16});
1127 } else {
1128 FSubActions
1129 // Use actual fsub instruction
1130 .legalFor(Types: {S32})
1131 // Must use fadd + fneg
1132 .lowerFor(Types: {S64, S16, V2S16});
1133 }
1134
1135 FSubActions
1136 .scalarize(TypeIdx: 0)
1137 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1138
1139 // Whether this is legal depends on the floating point mode for the function.
1140 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1141 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1142 FMad.customFor(Types: {S32, S16});
1143 else if (ST.hasMadMacF32Insts())
1144 FMad.customFor(Types: {S32});
1145 else if (ST.hasMadF16())
1146 FMad.customFor(Types: {S16});
1147 FMad.scalarize(TypeIdx: 0)
1148 .lower();
1149
1150 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1151 if (ST.has16BitInsts()) {
1152 FRem.customFor(Types: {S16, S32, S64});
1153 } else {
1154 FRem.minScalar(TypeIdx: 0, Ty: S32)
1155 .customFor(Types: {S32, S64});
1156 }
1157 FRem.scalarize(TypeIdx: 0);
1158
1159 // TODO: Do we need to clamp maximum bitwidth?
1160 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1161 .legalIf(Predicate: isScalar(TypeIdx: 0))
1162 .legalFor(Types: {{V2S16, V2S32}})
1163 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1164 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1165 // situations (like an invalid implicit use), we don't want to infinite loop
1166 // in the legalizer.
1167 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1168 .alwaysLegal();
1169
1170 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1171 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1172 {S32, S1}, {S64, S1}, {S16, S1}})
1173 .scalarize(TypeIdx: 0)
1174 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1175 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1176
1177 // TODO: Split s1->s64 during regbankselect for VALU.
1178 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1179 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1180 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1181 .customFor(Types: {{S32, S64}, {S64, S64}});
1182 if (ST.has16BitInsts())
1183 IToFP.legalFor(Types: {{S16, S16}});
1184 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1185 .minScalar(TypeIdx: 0, Ty: S32)
1186 .scalarize(TypeIdx: 0)
1187 .widenScalarToNextPow2(TypeIdx: 1);
1188
1189 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1190 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1191 .customFor(Types: {{S64, S32}, {S64, S64}})
1192 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1193 if (ST.has16BitInsts())
1194 FPToI.legalFor(Types: {{S16, S16}});
1195 else
1196 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1197
1198 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1199 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1200 .scalarize(TypeIdx: 0)
1201 .lower();
1202
1203 // clang-format off
1204 auto &FPToISat = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI_SAT, G_FPTOUI_SAT})
1205 .legalFor(Types: {{S32, S32}, {S32, S64}})
1206 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1207 if (ST.has16BitInsts())
1208 FPToISat.legalFor(Types: {{S16, S16}});
1209
1210 FPToISat.minScalar(TypeIdx: 1, Ty: S32);
1211 FPToISat.minScalar(TypeIdx: 0, Ty: S32)
1212 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1213 .scalarize(TypeIdx: 0)
1214 .lower();
1215 // clang-format on
1216
1217 getActionDefinitionsBuilder(Opcodes: {G_LROUND, G_LLROUND})
1218 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1219 .scalarize(TypeIdx: 0)
1220 .lower();
1221
1222 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1223 .legalFor(Types: {S16, S32})
1224 .scalarize(TypeIdx: 0)
1225 .lower();
1226
1227 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1228 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1229 .scalarize(TypeIdx: 0)
1230 .lower();
1231
1232 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1233 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1234 .scalarize(TypeIdx: 0)
1235 .lower();
1236
1237 if (ST.has16BitInsts()) {
1238 getActionDefinitionsBuilder(
1239 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1240 .legalFor(Types: {S16, S32, S64})
1241 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1242 .scalarize(TypeIdx: 0);
1243 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1244 getActionDefinitionsBuilder(
1245 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1246 .legalFor(Types: {S32, S64})
1247 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1248 .scalarize(TypeIdx: 0);
1249 } else {
1250 getActionDefinitionsBuilder(
1251 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1252 .legalFor(Types: {S32})
1253 .customFor(Types: {S64})
1254 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1255 .scalarize(TypeIdx: 0);
1256 }
1257
1258 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1259 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1260 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1261 .scalarize(TypeIdx: 0)
1262 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1263
1264 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1265 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1266 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1267 .scalarize(TypeIdx: 0);
1268
1269 auto &CmpBuilder =
1270 getActionDefinitionsBuilder(Opcode: G_ICMP)
1271 // The compare output type differs based on the register bank of the output,
1272 // so make both s1 and s32 legal.
1273 //
1274 // Scalar compares producing output in scc will be promoted to s32, as that
1275 // is the allocatable register type that will be needed for the copy from
1276 // scc. This will be promoted during RegBankSelect, and we assume something
1277 // before that won't try to use s32 result types.
1278 //
1279 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1280 // bank.
1281 .legalForCartesianProduct(
1282 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1283 .legalForCartesianProduct(
1284 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1285 if (ST.has16BitInsts()) {
1286 CmpBuilder.legalFor(Types: {{S1, S16}});
1287 }
1288
1289 CmpBuilder
1290 .widenScalarToNextPow2(TypeIdx: 1)
1291 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1292 .scalarize(TypeIdx: 0)
1293 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1294
1295 auto &FCmpBuilder =
1296 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1297 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1298
1299 if (ST.hasSALUFloatInsts())
1300 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1301
1302 FCmpBuilder
1303 .widenScalarToNextPow2(TypeIdx: 1)
1304 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1305 .scalarize(TypeIdx: 0);
1306
1307 // FIXME: fpow has a selection pattern that should move to custom lowering.
1308 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1309 if (ST.has16BitInsts())
1310 ExpOps.customFor(Types: {{S32}, {S16}});
1311 else
1312 ExpOps.customFor(Types: {S32});
1313 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1314 .scalarize(TypeIdx: 0);
1315
1316 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1317 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1318 .lower();
1319
1320 getActionDefinitionsBuilder(Opcode: G_FLOG2)
1321 .legalFor(Pred: ST.has16BitInsts(), Types: {S16})
1322 .customFor(Types: {S32, S16})
1323 .scalarize(TypeIdx: 0)
1324 .lower();
1325
1326 getActionDefinitionsBuilder(Opcode: G_FEXP2)
1327 .legalFor(Pred: ST.has16BitInsts(), Types: {S16})
1328 .customFor(Types: {S32, S64, S16})
1329 .scalarize(TypeIdx: 0)
1330 .lower();
1331
1332 auto &LogOps =
1333 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1334 LogOps.customFor(Types: {S32, S16, S64});
1335 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1336 .scalarize(TypeIdx: 0);
1337
1338 // The 64-bit versions produce 32-bit results, but only on the SALU.
1339 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1340 .legalFor(Types: {{S32, S32}, {S32, S64}})
1341 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1342 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1343 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1344 .scalarize(TypeIdx: 0)
1345 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1346
1347 // If no 16 bit instr is available, lower into different instructions.
1348 if (ST.has16BitInsts())
1349 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1350 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1351 .widenScalarToNextPow2(TypeIdx: 1)
1352 .scalarize(TypeIdx: 0)
1353 .lower();
1354 else
1355 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1356 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1357 .lowerFor(Types: {S1, S16})
1358 .widenScalarToNextPow2(TypeIdx: 1)
1359 .scalarize(TypeIdx: 0)
1360 .lower();
1361
1362 // The hardware instructions return a different result on 0 than the generic
1363 // instructions expect. The hardware produces -1, but these produce the
1364 // bitwidth.
1365 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1366 .scalarize(TypeIdx: 0)
1367 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1368 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1369 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1370 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1371 .custom();
1372
1373 // The 64-bit versions produce 32-bit results, but only on the SALU.
1374 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1375 .legalFor(Types: {{S32, S32}, {S32, S64}})
1376 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1377 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1378 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1379 .scalarize(TypeIdx: 0)
1380 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1381 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1382
1383 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1384 .legalFor(Types: {{S32, S32}, {S32, S64}})
1385 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1386 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1387 .scalarize(TypeIdx: 0)
1388 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1389 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1390
1391 getActionDefinitionsBuilder(Opcode: G_CTLS)
1392 .customFor(Types: {{S32, S32}})
1393 .scalarize(TypeIdx: 0)
1394 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1395 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1396
1397 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1398 // RegBankSelect.
1399 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1400 .legalFor(Types: {S32, S64})
1401 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1402 .scalarize(TypeIdx: 0)
1403 .widenScalarToNextPow2(TypeIdx: 0);
1404
1405 if (ST.has16BitInsts()) {
1406 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1407 .legalFor(Types: {S16, S32, V2S16})
1408 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1409 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1410 // narrowScalar limitation.
1411 .widenScalarToNextPow2(TypeIdx: 0)
1412 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1413 .scalarize(TypeIdx: 0);
1414
1415 if (ST.hasVOP3PInsts()) {
1416 getActionDefinitionsBuilder(Opcode: G_ABS)
1417 .legalFor(Types: {S32, S16, V2S16})
1418 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1419 .minScalar(TypeIdx: 0, Ty: S16)
1420 .widenScalarToNextPow2(TypeIdx: 0)
1421 .scalarize(TypeIdx: 0)
1422 .lower();
1423 if (ST.hasIntMinMax64()) {
1424 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1425 .legalFor(Types: {S32, S16, S64, V2S16})
1426 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1427 .minScalar(TypeIdx: 0, Ty: S16)
1428 .widenScalarToNextPow2(TypeIdx: 0)
1429 .scalarize(TypeIdx: 0)
1430 .lower();
1431 } else {
1432 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1433 .legalFor(Types: {S32, S16, V2S16})
1434 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1435 .minScalar(TypeIdx: 0, Ty: S16)
1436 .widenScalarToNextPow2(TypeIdx: 0)
1437 .scalarize(TypeIdx: 0)
1438 .lower();
1439 }
1440 } else {
1441 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1442 .legalFor(Types: {S32, S16})
1443 .widenScalarToNextPow2(TypeIdx: 0)
1444 .minScalar(TypeIdx: 0, Ty: S16)
1445 .scalarize(TypeIdx: 0)
1446 .lower();
1447 }
1448 } else {
1449 // TODO: Should have same legality without v_perm_b32
1450 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1451 .legalFor(Types: {S32})
1452 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1453 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1454 // narrowScalar limitation.
1455 .widenScalarToNextPow2(TypeIdx: 0)
1456 .maxScalar(TypeIdx: 0, Ty: S32)
1457 .scalarize(TypeIdx: 0)
1458 .lower();
1459
1460 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1461 .legalFor(Types: {S32})
1462 .minScalar(TypeIdx: 0, Ty: S32)
1463 .widenScalarToNextPow2(TypeIdx: 0)
1464 .scalarize(TypeIdx: 0)
1465 .lower();
1466 }
1467
1468 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1469 // List the common cases
1470 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1471 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1472 .scalarize(TypeIdx: 0)
1473 // Accept any address space as long as the size matches
1474 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1475 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1476 Mutation: [](const LegalityQuery &Query) {
1477 return std::pair(
1478 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1479 })
1480 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1481 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1482 });
1483
1484 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1485 // List the common cases
1486 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1487 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1488 .scalarize(TypeIdx: 0)
1489 // Accept any address space as long as the size matches
1490 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1491 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1492 Mutation: [](const LegalityQuery &Query) {
1493 return std::pair(
1494 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1495 })
1496 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1497 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1498 });
1499
1500 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1501 .scalarize(TypeIdx: 0)
1502 .custom();
1503
  // Predicate deciding whether a G_LOAD/G_STORE described by \p Query must be
  // split into narrower memory operations. Returns true when the access is
  // wider than a single instruction can service for its address space, or when
  // its size does not map onto a supported whole number of 32-bit registers.
  // \p IsLoad distinguishes load vs. store limits (forwarded to
  // maxSizeForAddrSpace, which may use different maxima for each).
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    // Vector result wider than the memory type: an extending vector load,
    // which must be broken up.
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    // Split accesses exceeding the per-address-space maximum. Whether the
    // access is atomic is forwarded since that can change the allowed size.
    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      IsAtomic: Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32; // Round up to 32-bit register count.
    if (NumRegs == 3) {
      // 96-bit (dwordx3) accesses exist only on subtargets that support them.
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(Value: NumRegs))
        return true;
    }

    return false;
  };
1535
1536 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1537 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1538 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1539
1540 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1541 // LDS
1542 // TODO: Unsupported flat for SI.
1543
1544 for (unsigned Op : {G_LOAD, G_STORE}) {
1545 const bool IsStore = Op == G_STORE;
1546
1547 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1548 // Explicitly list some common cases.
1549 // TODO: Does this help compile time at all?
1550 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1551 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1552 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1553 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1554 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1555 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1556 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1557 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1558
1559 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1560 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1561 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1562 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1563 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1564 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1565
1566 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1567 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1568 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1569 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1570
1571 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1572 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1573 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1574 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1575 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1576 Actions.legalIf(
1577 Predicate: [=](const LegalityQuery &Query) -> bool {
1578 return isLoadStoreLegal(ST, Query);
1579 });
1580
1581 // The custom pointers (fat pointers, buffer resources) don't work with load
1582 // and store at this level. Fat pointers should have been lowered to
1583 // intrinsics before the translation to MIR.
1584 Actions.unsupportedIf(
1585 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1586
1587 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1588 // ptrtoint. This is needed to account for the fact that we can't have i128
1589 // as a register class for SelectionDAG reasons.
1590 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1591 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1592 });
1593
1594 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1595 // 64-bits.
1596 //
1597 // TODO: Should generalize bitcast action into coerce, which will also cover
1598 // inserting addrspacecasts.
1599 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1600
1601 // Turn any illegal element vectors into something easier to deal
1602 // with. These will ultimately produce 32-bit scalar shifts to extract the
1603 // parts anyway.
1604 //
1605 // For odd 16-bit element vectors, prefer to split those into pieces with
1606 // 16-bit vector parts.
1607 Actions.bitcastIf(
1608 Predicate: [=](const LegalityQuery &Query) -> bool {
1609 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1610 MemTy: Query.MMODescrs[0].MemoryTy);
1611 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1612
1613 if (!IsStore) {
1614 // Widen suitably aligned loads by loading extra bytes. The standard
1615 // legalization actions can't properly express widening memory operands.
1616 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1617 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1618 });
1619 }
1620
1621 // FIXME: load/store narrowing should be moved to lower action
1622 Actions
1623 .narrowScalarIf(
1624 Predicate: [=](const LegalityQuery &Query) -> bool {
1625 return !Query.Types[0].isVector() &&
1626 needToSplitMemOp(Query, Op == G_LOAD);
1627 },
1628 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1629 const LLT DstTy = Query.Types[0];
1630 const LLT PtrTy = Query.Types[1];
1631
1632 const unsigned DstSize = DstTy.getSizeInBits();
1633 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1634
1635 // Split extloads.
1636 if (DstSize > MemSize)
1637 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1638
1639 unsigned MaxSize = maxSizeForAddrSpace(
1640 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1641 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1642 if (MemSize > MaxSize)
1643 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1644
1645 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1646 return std::pair(0, LLT::scalar(SizeInBits: Align));
1647 })
1648 .fewerElementsIf(
1649 Predicate: [=](const LegalityQuery &Query) -> bool {
1650 return Query.Types[0].isVector() &&
1651 needToSplitMemOp(Query, Op == G_LOAD);
1652 },
1653 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1654 const LLT DstTy = Query.Types[0];
1655 const LLT PtrTy = Query.Types[1];
1656
1657 LLT EltTy = DstTy.getElementType();
1658 unsigned MaxSize = maxSizeForAddrSpace(
1659 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1660 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1661
1662 // FIXME: Handle widened to power of 2 results better. This ends
1663 // up scalarizing.
1664 // FIXME: 3 element stores scalarized on SI
1665
1666 // Split if it's too large for the address space.
1667 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1668 if (MemSize > MaxSize) {
1669 unsigned NumElts = DstTy.getNumElements();
1670 unsigned EltSize = EltTy.getSizeInBits();
1671
1672 if (MaxSize % EltSize == 0) {
1673 return std::pair(
1674 0, LLT::scalarOrVector(
1675 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1676 }
1677
1678 unsigned NumPieces = MemSize / MaxSize;
1679
1680 // FIXME: Refine when odd breakdowns handled
1681 // The scalars will need to be re-legalized.
1682 if (NumPieces == 1 || NumPieces >= NumElts ||
1683 NumElts % NumPieces != 0)
1684 return std::pair(0, EltTy);
1685
1686 return std::pair(0,
1687 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1688 }
1689
1690 // FIXME: We could probably handle weird extending loads better.
1691 if (DstTy.getSizeInBits() > MemSize)
1692 return std::pair(0, EltTy);
1693
1694 unsigned EltSize = EltTy.getSizeInBits();
1695 unsigned DstSize = DstTy.getSizeInBits();
1696 if (!isPowerOf2_32(Value: DstSize)) {
1697 // We're probably decomposing an odd sized store. Try to split
1698 // to the widest type. TODO: Account for alignment. As-is it
1699 // should be OK, since the new parts will be further legalized.
1700 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1701 return std::pair(
1702 0, LLT::scalarOrVector(
1703 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1704 }
1705
1706 // May need relegalization for the scalars.
1707 return std::pair(0, EltTy);
1708 })
1709 .minScalar(TypeIdx: 0, Ty: S32)
1710 .narrowScalarIf(Predicate: isTruncStoreToSizePowerOf2(TypeIdx: 0),
1711 Mutation: getScalarTypeFromMemDesc(TypeIdx: 0))
1712 .widenScalarToNextPow2(TypeIdx: 0)
1713 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1714 .lower();
1715 }
1716
1717 // FIXME: Unaligned accesses not lowered.
1718 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1719 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1720 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1721 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1722 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1723 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1724 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1725 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1726 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1727 .legalIf(
1728 Predicate: [=](const LegalityQuery &Query) -> bool {
1729 return isLoadStoreLegal(ST, Query);
1730 });
1731
1732 if (ST.hasFlatAddressSpace()) {
1733 ExtLoads.legalForTypesWithMemDesc(
1734 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1735 }
1736
1737 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1738 // 64-bits.
1739 //
1740 // TODO: Should generalize bitcast action into coerce, which will also cover
1741 // inserting addrspacecasts.
1742 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1743
1744 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1745 .widenScalarToNextPow2(TypeIdx: 0)
1746 .lower();
1747
1748 auto &Atomics = getActionDefinitionsBuilder(
1749 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1750 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1751 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1752 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1753 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1754 {S64, GlobalPtr}, {S64, LocalPtr},
1755 {S32, RegionPtr}, {S64, RegionPtr}});
1756 if (ST.hasFlatAddressSpace()) {
1757 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1758 }
1759
1760 auto &Atomics32 =
1761 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1762 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1763 if (ST.hasFlatAddressSpace()) {
1764 Atomics32.legalFor(Types: {{S32, FlatPtr}});
1765 }
1766
1767 // TODO: v2bf16 operations, and fat buffer pointer support.
1768 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1769 if (ST.hasLDSFPAtomicAddF32()) {
1770 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1771 if (ST.hasLdsAtomicAddF64())
1772 Atomic.legalFor(Types: {{S64, LocalPtr}});
1773 if (ST.hasAtomicDsPkAdd16Insts())
1774 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1775 }
1776 if (ST.hasAtomicFaddInsts())
1777 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1778 if (ST.hasFlatAtomicFaddF32Inst())
1779 Atomic.legalFor(Types: {{S32, FlatPtr}});
1780
1781 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1782 // These are legal with some caveats, and should have undergone expansion in
1783 // the IR in most situations
1784 // TODO: Move atomic expansion into legalizer
1785 Atomic.legalFor(Types: {
1786 {S32, GlobalPtr},
1787 {S64, GlobalPtr},
1788 {S64, FlatPtr}
1789 });
1790 }
1791
1792 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1793 ST.hasAtomicBufferGlobalPkAddF16Insts())
1794 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1795 if (ST.hasAtomicGlobalPkAddBF16Inst())
1796 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1797 if (ST.hasAtomicFlatPkAdd16Insts())
1798 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1799
1800
1801 // Most of the legalization work here is done by AtomicExpand. We could
1802 // probably use a simpler legality rule that just assumes anything is OK.
1803 auto &AtomicFMinFMax =
1804 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1805 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1806
1807 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1808 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1809 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1810 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1811 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1812 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1813 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1814 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1815
1816 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1817 // demarshalling
1818 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1819 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1820 {S32, FlatPtr}, {S64, FlatPtr}})
1821 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1822 {S32, RegionPtr}, {S64, RegionPtr}});
1823 // TODO: Pointer types, any 32-bit or 64-bit vector
1824
1825 // Condition should be s32 for scalar, s1 for vector.
1826 getActionDefinitionsBuilder(Opcode: G_SELECT)
1827 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1828 LocalPtr, FlatPtr, PrivatePtr,
1829 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1830 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1831 Types1: {S1, S32})
1832 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1833 .scalarize(TypeIdx: 1)
1834 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1835 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1836 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1837 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1838 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1839 .scalarize(TypeIdx: 0)
1840 .widenScalarToNextPow2(TypeIdx: 0)
1841 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1842
1843 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1844 // be more flexible with the shift amount type.
1845 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1846 .legalFor(Types: {{S32, S32}, {S64, S32}});
1847 if (ST.has16BitInsts()) {
1848 if (ST.hasVOP3PInsts()) {
1849 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1850 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1851 } else
1852 Shifts.legalFor(Types: {{S16, S16}});
1853
1854 // TODO: Support 16-bit shift amounts for all types
1855 Shifts.widenScalarIf(
1856 Predicate: [=](const LegalityQuery &Query) {
1857 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1858 // 32-bit amount.
1859 const LLT ValTy = Query.Types[0];
1860 const LLT AmountTy = Query.Types[1];
1861 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1862 AmountTy.getSizeInBits() < 16;
1863 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1864 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1865 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1866 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1867 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1868
1869 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1870 .minScalar(TypeIdx: 0, Ty: S16)
1871 .scalarize(TypeIdx: 0)
1872 .lower();
1873 } else {
1874 // Make sure we legalize the shift amount type first, as the general
1875 // expansion for the shifted type will produce much worse code if it hasn't
1876 // been truncated already.
1877 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1878 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1879 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1880
1881 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1882 .minScalar(TypeIdx: 0, Ty: S32)
1883 .scalarize(TypeIdx: 0)
1884 .lower();
1885 }
1886 Shifts.scalarize(TypeIdx: 0);
1887
1888 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1889 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1890 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1891 unsigned IdxTypeIdx = 2;
1892
1893 getActionDefinitionsBuilder(Opcode: Op)
1894 .customIf(Predicate: [=](const LegalityQuery &Query) {
1895 const LLT EltTy = Query.Types[EltTypeIdx];
1896 const LLT VecTy = Query.Types[VecTypeIdx];
1897 const LLT IdxTy = Query.Types[IdxTypeIdx];
1898 const unsigned EltSize = EltTy.getSizeInBits();
1899 const bool isLegalVecType =
1900 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1901 // Address space 8 pointers are 128-bit wide values, but the logic
1902 // below will try to bitcast them to 2N x s64, which will fail.
1903 // Therefore, as an intermediate step, wrap extracts/insertions from a
1904 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1905 // extraction result) in order to produce a vector operation that can
1906 // be handled by the logic below.
1907 if (EltTy.isPointer() && EltSize > 64)
1908 return true;
1909 return (EltSize == 32 || EltSize == 64) &&
1910 VecTy.getSizeInBits() % 32 == 0 &&
1911 VecTy.getSizeInBits() <= MaxRegisterSize &&
1912 IdxTy.getSizeInBits() == 32 &&
1913 isLegalVecType;
1914 })
1915 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1916 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1917 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1918 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1919 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1920 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1921 Mutation: [=](const LegalityQuery &Query) {
1922 // For > 64-bit element types, try to turn this into a
1923 // 64-bit element vector since we may be able to do better
1924 // indexing if this is scalar. If not, fall back to 32.
1925 const LLT EltTy = Query.Types[EltTypeIdx];
1926 const LLT VecTy = Query.Types[VecTypeIdx];
1927 const unsigned DstEltSize = EltTy.getSizeInBits();
1928 const unsigned VecSize = VecTy.getSizeInBits();
1929
1930 const unsigned TargetEltSize =
1931 DstEltSize % 64 == 0 ? 64 : 32;
1932 return std::pair(VecTypeIdx,
1933 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
1934 ScalarSizeInBits: TargetEltSize));
1935 })
1936 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1937 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1938 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1939 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1940 // TODO: Clamp elements for 64-bit vectors?
1941 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
1942 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1943 // It should only be necessary with variable indexes.
1944 // As a last resort, lower to the stack
1945 .lower();
1946 }
1947
1948 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1949 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1950 const LLT &EltTy = Query.Types[1].getElementType();
1951 return Query.Types[0] != EltTy;
1952 });
1953
1954 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1955 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1956 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1957 getActionDefinitionsBuilder(Opcode: Op)
1958 .widenScalarIf(
1959 Predicate: [=](const LegalityQuery &Query) {
1960 const LLT BigTy = Query.Types[BigTyIdx];
1961 return (BigTy.getScalarSizeInBits() < 16);
1962 },
1963 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1964 .widenScalarIf(
1965 Predicate: [=](const LegalityQuery &Query) {
1966 const LLT LitTy = Query.Types[LitTyIdx];
1967 return (LitTy.getScalarSizeInBits() < 16);
1968 },
1969 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1970 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1971 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32)
1972 .customIf(Predicate: [=](const LegalityQuery &Query) {
1973 // Generic lower operates on the full-width value, producing
1974 // shift+trunc/mask sequences. For simple cases where extract/insert
1975 // values are 32-bit aligned, we can instead unmerge/merge and work on
1976 // the 32-bit components. However, we can't check the offset here so
1977 // custom lower function will have to call generic lowering if offset
1978 // is not 32-bit aligned.
1979 const LLT BigTy = Query.Types[BigTyIdx];
1980 const LLT LitTy = Query.Types[LitTyIdx];
1981 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
1982 LitTy.getSizeInBits() % 32 == 0;
1983 })
1984 .lower();
1985 }
1986
1987 auto &BuildVector =
1988 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1989 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1990 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1991 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1992 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1993 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1994 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
1995 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1996
1997 if (ST.hasScalarPackInsts()) {
1998 BuildVector
1999 // FIXME: Should probably widen s1 vectors straight to s32
2000 .minScalarOrElt(TypeIdx: 0, Ty: S16)
2001 .minScalar(TypeIdx: 1, Ty: S16);
2002
2003 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
2004 .legalFor(Types: {V2S16, S32})
2005 .lower();
2006 } else {
2007 BuildVector.customFor(Types: {V2S16, S16});
2008 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
2009
2010 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
2011 .customFor(Types: {V2S16, S32})
2012 .lower();
2013 }
2014
2015 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
2016
2017 // FIXME: Clamp maximum size
2018 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
2019 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2020 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
2021 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
2022 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
2023
2024 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
2025
2026 // Merge/Unmerge
2027 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2028 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2029 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2030
2031 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2032 const LLT Ty = Query.Types[TypeIdx];
2033 if (Ty.isVector()) {
2034 const LLT &EltTy = Ty.getElementType();
2035 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2036 return true;
2037 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
2038 return true;
2039 }
2040 return false;
2041 };
2042
2043 auto &Builder =
2044 getActionDefinitionsBuilder(Opcode: Op)
2045 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2046 .lowerFor(Types: {{S16, V2S16}})
2047 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
2048 const LLT BigTy = Query.Types[BigTyIdx];
2049 return BigTy.getSizeInBits() == 32;
2050 })
2051 // Try to widen to s16 first for small types.
2052 // TODO: Only do this on targets with legal s16 shifts
2053 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
2054 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
2055 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
2056 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
2057 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
2058 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
2059 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
2060 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2061 // not worth considering the multiples of 64 since 2*192 and 2*384
2062 // are not valid.
2063 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
2064 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
2065 // Break up vectors with weird elements into scalars
2066 .fewerElementsIf(
2067 Predicate: [=](const LegalityQuery &Query) {
2068 return notValidElt(Query, LitTyIdx);
2069 },
2070 Mutation: scalarize(TypeIdx: 0))
2071 .fewerElementsIf(
2072 Predicate: [=](const LegalityQuery &Query) {
2073 return notValidElt(Query, BigTyIdx);
2074 },
2075 Mutation: scalarize(TypeIdx: 1))
2076 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
2077
2078 if (Op == G_MERGE_VALUES) {
2079 Builder.widenScalarIf(
2080 // TODO: Use 16-bit shifts if legal for 8-bit values?
2081 Predicate: [=](const LegalityQuery &Query) {
2082 const LLT Ty = Query.Types[LitTyIdx];
2083 return Ty.getSizeInBits() < 32;
2084 },
2085 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
2086 }
2087
2088 Builder.widenScalarIf(
2089 Predicate: [=](const LegalityQuery &Query) {
2090 const LLT Ty = Query.Types[BigTyIdx];
2091 return Ty.getSizeInBits() % 16 != 0;
2092 },
2093 Mutation: [=](const LegalityQuery &Query) {
2094 // Pick the next power of 2, or a multiple of 64 over 128.
2095 // Whichever is smaller.
2096 const LLT &Ty = Query.Types[BigTyIdx];
2097 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2098 if (NewSizeInBits >= 256) {
2099 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2100 if (RoundedTo < NewSizeInBits)
2101 NewSizeInBits = RoundedTo;
2102 }
2103 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2104 })
2105 // Any vectors left are the wrong size. Scalarize them.
2106 .scalarize(TypeIdx: 0)
2107 .scalarize(TypeIdx: 1);
2108 }
2109
2110 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2111 // RegBankSelect.
2112 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2113 .legalFor(Types: {{S32}, {S64}})
2114 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2115
2116 if (ST.hasVOP3PInsts()) {
2117 SextInReg.lowerFor(Types: {{V2S16}})
2118 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2119 // get more vector shift opportunities, since we'll get those when
2120 // expanded.
2121 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2122 } else if (ST.has16BitInsts()) {
2123 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2124 } else {
2125 // Prefer to promote to s32 before lowering if we don't have 16-bit
2126 // shifts. This avoid a lot of intermediate truncate and extend operations.
2127 SextInReg.lowerFor(Types: {{S32}, {S64}});
2128 }
2129
2130 SextInReg
2131 .scalarize(TypeIdx: 0)
2132 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2133 .lower();
2134
2135 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2136 .scalarize(TypeIdx: 0)
2137 .lower();
2138
2139 auto &FSHRActionDefs = getActionDefinitionsBuilder(Opcode: G_FSHR);
2140 FSHRActionDefs.legalFor(Types: {{S32, S32}})
2141 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2142 if (ST.hasVOP3PInsts())
2143 FSHRActionDefs.lowerFor(Types: {{V2S16, V2S16}});
2144 FSHRActionDefs.scalarize(TypeIdx: 0).lower();
2145
2146 if (ST.hasVOP3PInsts()) {
2147 getActionDefinitionsBuilder(Opcode: G_FSHL)
2148 .lowerFor(Types: {{V2S16, V2S16}})
2149 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2150 .scalarize(TypeIdx: 0)
2151 .lower();
2152 } else {
2153 getActionDefinitionsBuilder(Opcode: G_FSHL)
2154 .scalarize(TypeIdx: 0)
2155 .lower();
2156 }
2157
2158 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2159 .legalFor(Types: {S64});
2160
2161 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2162
2163 getActionDefinitionsBuilder(Opcode: G_FENCE)
2164 .alwaysLegal();
2165
2166 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2167 .scalarize(TypeIdx: 0)
2168 .minScalar(TypeIdx: 0, Ty: S32)
2169 .lower();
2170
2171 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2172 .legalFor(Types: {{S32, S32}, {S64, S32}})
2173 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2174 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2175 .widenScalarToNextPow2(TypeIdx: 0)
2176 .scalarize(TypeIdx: 0);
2177
2178 getActionDefinitionsBuilder(
2179 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2180 G_FCOPYSIGN,
2181
2182 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2183 G_READ_REGISTER, G_WRITE_REGISTER,
2184
2185 G_SADDO, G_SSUBO})
2186 .lower();
2187
2188 if (ST.hasIEEEMinimumMaximumInsts()) {
2189 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2190 .legalFor(Types: FPTypesPK16)
2191 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2192 .scalarize(TypeIdx: 0);
2193 } else if (ST.hasVOP3PInsts()) {
2194 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2195 .lowerFor(Types: {V2S16})
2196 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2197 .scalarize(TypeIdx: 0)
2198 .lower();
2199 } else {
2200 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2201 .scalarize(TypeIdx: 0)
2202 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2203 .lower();
2204 }
2205
2206 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2207 .lower();
2208
2209 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2210
2211 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2212 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2213 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2214 .unsupported();
2215
2216 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2217
2218 getActionDefinitionsBuilder(
2219 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2220 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2221 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2222 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2223 .legalFor(Types: AllVectors)
2224 .scalarize(TypeIdx: 1)
2225 .lower();
2226
2227 getLegacyLegalizerInfo().computeTables();
2228 verify(MII: *ST.getInstrInfo());
2229}
2230
/// Central dispatcher for every operation the constructor's rulesets marked
/// as Custom. Routes \p MI to the opcode-specific legalize* helper.
///
/// \param Helper     Provides the MIRBuilder used by the per-opcode helpers.
/// \param MI         The instruction to legalize.
/// \param LocObserver Unused here; forwarded by the generic legalizer.
/// \returns true if the instruction was legalized, false on failure (the
///          \c default case), signalling UnableToLegalize to the caller.
bool AMDGPULegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    return legalizeFroundeven(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed/unsigned int<->fp conversions share one implementation each,
  // distinguished by the Signed flag.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, Signed: true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, Signed: false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, Signed: true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, Signed: false);
  // All four min/max-number variants funnel through one helper.
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT:
    return legalizeExtract(Helper, MI);
  case TargetOpcode::G_INSERT:
    return legalizeInsert(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  // Plain and extending loads share legalizeLoad; stores are separate.
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  // Div, rem, and combined divrem are handled together, split only by
  // signedness.
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // Log/exp family: base-2 ops have dedicated helpers; the natural/base-10
  // variants share a common expansion.
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_CTLS:
    return legalizeCTLS(MI, MRI, B);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
  case TargetOpcode::G_STACKSAVE:
    return legalizeStackSave(MI, B);
  case TargetOpcode::G_GET_FPENV:
    return legalizeGetFPEnv(MI, MRI, B);
  case TargetOpcode::G_SET_FPENV:
    return legalizeSetFPEnv(MI, MRI, B);
  case TargetOpcode::G_TRAP:
    return legalizeTrap(MI, MRI, B);
  case TargetOpcode::G_DEBUGTRAP:
    return legalizeDebugTrap(MI, MRI, B);
  default:
    // Anything not handled above was marked Custom in error, or the helper
    // set is out of sync with the rulesets; report failure to the legalizer.
    return false;
  }

  // Every case (including default) returns, so control never reaches here.
  llvm_unreachable("expected switch to return");
}
2340
// Materialize the high 32 bits of the aperture base for the given segment
// address space (LDS or scratch). This value forms the high half of a flat
// pointer when widening a 32-bit segment pointer. Returns an invalid
// Register on failure (e.g. when a required input argument can't be loaded).
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  // Only the LDS and scratch segments have apertures.
  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must emit extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Copy the full 64-bit aperture register, then keep only the high half.
    Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
    MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
    B.buildCopy(Res: {Dst}, Op: {Register(ApertureRegNo)});
    return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
  }

  Register LoadAddr = MRI.createGenericVirtualRegister(
      Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());

    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

    if (!loadInputValue(DstReg: KernargPtrReg, B,
                        ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // The aperture is a dereferenceable invariant value, so the load can be
    // freely hoisted/rematerialized by later passes.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo: PtrInfo.getWithOffset(O: Offset),
        f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));

    // Pointer address
    B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
                           Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
    // Load address
    return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
  }

  // Pre-COV5 fallback: read the aperture from the amd_queue_t addressed by
  // the queue pointer input argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
      Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

  if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // TODO: Use custom PseudoSourceValue
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));

  B.buildObjectPtrOffset(
      Res: LoadAddr, Op0: QueuePtr,
      Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
  return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
}
2426
2427/// Return true if the value is a known valid address, such that a null check is
2428/// not necessary.
2429static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2430 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2431 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2432 switch (Def->getOpcode()) {
2433 case AMDGPU::G_FRAME_INDEX:
2434 case AMDGPU::G_GLOBAL_VALUE:
2435 case AMDGPU::G_BLOCK_ADDR:
2436 return true;
2437 case AMDGPU::G_CONSTANT: {
2438 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2439 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AS: AddrSpace);
2440 }
2441 default:
2442 return false;
2443 }
2444
2445 return false;
2446}
2447
2448bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2449 MachineInstr &MI, MachineRegisterInfo &MRI,
2450 MachineIRBuilder &B) const {
2451 MachineFunction &MF = B.getMF();
2452
2453 // MI can either be a G_ADDRSPACE_CAST or a
2454 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2455 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2456 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2457 Intrinsic::amdgcn_addrspacecast_nonnull));
2458
2459 const LLT S32 = LLT::scalar(SizeInBits: 32);
2460 Register Dst = MI.getOperand(i: 0).getReg();
2461 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2462 : MI.getOperand(i: 1).getReg();
2463 LLT DstTy = MRI.getType(Reg: Dst);
2464 LLT SrcTy = MRI.getType(Reg: Src);
2465 unsigned DestAS = DstTy.getAddressSpace();
2466 unsigned SrcAS = SrcTy.getAddressSpace();
2467
2468 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2469 // vector element.
2470 assert(!DstTy.isVector());
2471
2472 const AMDGPUTargetMachine &TM
2473 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2474
2475 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2476 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2477 return true;
2478 }
2479
2480 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2481 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2482 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2483 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2484 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2485 ST.hasGloballyAddressableScratch()) {
2486 // flat -> private with globally addressable scratch: subtract
2487 // src_flat_scratch_base_lo.
2488 const LLT S32 = LLT::scalar(SizeInBits: 32);
2489 Register SrcLo = B.buildExtract(Res: S32, Src, Index: 0).getReg(Idx: 0);
2490 Register FlatScratchBaseLo =
2491 B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
2492 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2493 .getReg(Idx: 0);
2494 MRI.setRegClass(Reg: FlatScratchBaseLo, RC: &AMDGPU::SReg_32RegClass);
2495 Register Sub = B.buildSub(Dst: S32, Src0: SrcLo, Src1: FlatScratchBaseLo).getReg(Idx: 0);
2496 return B.buildIntToPtr(Dst, Src: Sub).getReg(Idx: 0);
2497 }
2498
2499 // Extract low 32-bits of the pointer.
2500 return B.buildExtract(Res: Dst, Src, Index: 0).getReg(Idx: 0);
2501 };
2502
2503 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2504 // G_ADDRSPACE_CAST we need to guess.
2505 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2506 castFlatToLocalOrPrivate(Dst);
2507 MI.eraseFromParent();
2508 return true;
2509 }
2510
2511 unsigned NullVal = AMDGPU::getNullPointerValue(AS: DestAS);
2512
2513 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2514 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2515
2516 // Extract low 32-bits of the pointer.
2517 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2518
2519 auto CmpRes =
2520 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2521 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2522
2523 MI.eraseFromParent();
2524 return true;
2525 }
2526
2527 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2528 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2529 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2530 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2531 // Coerce the type of the low half of the result so we can use
2532 // merge_values.
2533 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2534
2535 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2536 ST.hasGloballyAddressableScratch()) {
2537 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2538 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2539 Register AllOnes = B.buildConstant(Res: S32, Val: -1).getReg(Idx: 0);
2540 Register ThreadID = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
2541 ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Res: {S32})
2542 .addUse(RegNo: AllOnes)
2543 .addUse(RegNo: ThreadID)
2544 .getReg(Idx: 0);
2545 if (ST.isWave64()) {
2546 ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_hi, Res: {S32})
2547 .addUse(RegNo: AllOnes)
2548 .addUse(RegNo: ThreadID)
2549 .getReg(Idx: 0);
2550 }
2551 Register ShAmt =
2552 B.buildConstant(Res: S32, Val: 57 - 32 - ST.getWavefrontSizeLog2()).getReg(Idx: 0);
2553 Register SrcHi = B.buildShl(Dst: S32, Src0: ThreadID, Src1: ShAmt).getReg(Idx: 0);
2554 Register CvtPtr =
2555 B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, SrcHi}).getReg(Idx: 0);
2556 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2557 // 64-bit hi:lo value.
2558 Register FlatScratchBase =
2559 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {S64},
2560 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2561 .getReg(Idx: 0);
2562 MRI.setRegClass(Reg: FlatScratchBase, RC: &AMDGPU::SReg_64RegClass);
2563 return B.buildPtrAdd(Res: Dst, Op0: CvtPtr, Op1: FlatScratchBase).getReg(Idx: 0);
2564 }
2565
2566 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2567 if (!ApertureReg.isValid())
2568 return false;
2569
2570 // TODO: Should we allow mismatched types but matching sizes in merges to
2571 // avoid the ptrtoint?
2572 return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
2573 };
2574
2575 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2576 // G_ADDRSPACE_CAST we need to guess.
2577 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2578 castLocalOrPrivateToFlat(Dst);
2579 MI.eraseFromParent();
2580 return true;
2581 }
2582
2583 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2584
2585 auto SegmentNull =
2586 B.buildConstant(Res: SrcTy, Val: AMDGPU::getNullPointerValue(AS: SrcAS));
2587 auto FlatNull = B.buildConstant(Res: DstTy, Val: AMDGPU::getNullPointerValue(AS: DestAS));
2588
2589 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2590 Op1: SegmentNull.getReg(Idx: 0));
2591
2592 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2593
2594 MI.eraseFromParent();
2595 return true;
2596 }
2597
2598 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2599 SrcTy.getSizeInBits() == 64) {
2600 // Truncate.
2601 B.buildExtract(Res: Dst, Src, Index: 0);
2602 MI.eraseFromParent();
2603 return true;
2604 }
2605
2606 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2607 DstTy.getSizeInBits() == 64) {
2608 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2609 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2610 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2611 if (AddrHiVal == 0) {
2612 auto Zext = B.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: PtrLo);
2613 B.buildIntToPtr(Dst, Src: Zext);
2614 } else {
2615 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2616 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2617 }
2618
2619 MI.eraseFromParent();
2620 return true;
2621 }
2622
2623 // Invalid casts are poison.
2624 // TODO: Should return poison
2625 B.buildUndef(Res: Dst);
2626 MI.eraseFromParent();
2627 return true;
2628}
2629
2630bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2631 MachineRegisterInfo &MRI,
2632 MachineIRBuilder &B) const {
2633 Register Src = MI.getOperand(i: 1).getReg();
2634 LLT Ty = MRI.getType(Reg: Src);
2635 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2636
2637 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2638 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2639
2640 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2641 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2642
2643 // TODO: Should this propagate fast-math-flags?
2644 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2645 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2646
2647 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2648 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2649
2650 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2651 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2652 MI.eraseFromParent();
2653 return true;
2654}
2655
2656bool AMDGPULegalizerInfo::legalizeFceil(
2657 MachineInstr &MI, MachineRegisterInfo &MRI,
2658 MachineIRBuilder &B) const {
2659
2660 const LLT S1 = LLT::scalar(SizeInBits: 1);
2661 const LLT S64 = LLT::scalar(SizeInBits: 64);
2662
2663 Register Src = MI.getOperand(i: 1).getReg();
2664 assert(MRI.getType(Src) == S64);
2665
2666 // result = trunc(src)
2667 // if (src > 0.0 && src != result)
2668 // result += 1.0
2669
2670 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2671
2672 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2673 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2674 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2675 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2676 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2677 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2678
2679 // TODO: Should this propagate fast-math-flags?
2680 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2681 MI.eraseFromParent();
2682 return true;
2683}
2684
2685bool AMDGPULegalizerInfo::legalizeFrem(
2686 MachineInstr &MI, MachineRegisterInfo &MRI,
2687 MachineIRBuilder &B) const {
2688 Register DstReg = MI.getOperand(i: 0).getReg();
2689 Register Src0Reg = MI.getOperand(i: 1).getReg();
2690 Register Src1Reg = MI.getOperand(i: 2).getReg();
2691 auto Flags = MI.getFlags();
2692 LLT Ty = MRI.getType(Reg: DstReg);
2693
2694 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2695 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2696 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2697 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2698 MI.eraseFromParent();
2699 return true;
2700}
2701
2702static MachineInstrBuilder extractF64Exponent(Register Hi,
2703 MachineIRBuilder &B) {
2704 const unsigned FractBits = 52;
2705 const unsigned ExpBits = 11;
2706 LLT S32 = LLT::scalar(SizeInBits: 32);
2707
2708 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2709 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2710
2711 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2712 .addUse(RegNo: Hi)
2713 .addUse(RegNo: Const0.getReg(Idx: 0))
2714 .addUse(RegNo: Const1.getReg(Idx: 0));
2715
2716 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2717}
2718
// Lower G_INTRINSIC_TRUNC for f64 by masking off the mantissa bits that are
// fractional for the value's exponent, mirroring the SelectionDAG expansion
// used when no native f64 trunc instruction is available.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  Register Src = MI.getOperand(i: 1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
  Register Hi = Unmerge.getReg(Idx: 1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);

  // Mask covering the full 52-bit mantissa.
  const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(Res: S32, Val: 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});

  // Shifting the mantissa mask right by the exponent leaves ones exactly on
  // the fractional bits; clear those bits from the source.
  auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
  auto Not = B.buildNot(Dst: S64, Src0: Shr);
  auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
  auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);

  // Exponent < 0: |value| < 1, so the result is a signed zero.
  auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
  // Exponent > 51: no fractional bits remain, value is already integral.
  auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);

  auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
  B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
  MI.eraseFromParent();
  return true;
}
2763
// Lower s64 G_SITOFP/G_UITOFP. For an f64 result, convert the halves
// separately and combine with ldexp. For an f32 result, normalize the value
// so its leading bit sits at the top, convert the high dword (with a sticky
// low bit for correct rounding), and scale back with ldexp.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();

  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
  auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);

  if (MRI.getType(Reg: Dst) == S64) {
    // f64 result: hi * 2^32 + lo, where only the high half carries the sign.
    auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
                        : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));

    auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
    auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(Res: S32, Val: 1);

  // Compute how far the value can be shifted left without losing its
  // leading (sign-significant) bit.
  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
    // OppositeSign is -1 when the two halves disagree in sign bit, which
    // caps the usable shift at 31 instead of 32.
    auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
    auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
    auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
    // sffbh = count of leading sign bits in the high half.
    auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
                  .addUse(RegNo: Unmerge.getReg(Idx: 1));
    auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
    ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
  // Normalize, then fold any remaining low bits into a sticky LSB so the
  // final conversion rounds correctly.
  auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
  auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
  auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
  auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
  auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
  // Undo the normalization shift in the floating-point domain.
  auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
  B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
  MI.eraseFromParent();
  return true;
}
2818
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
// Lower G_FPTOSI/G_FPTOUI with an s64 result from an f32/f64 source by
// splitting the truncated value into high/low 32-bit pieces.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();

  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  const LLT SrcLT = MRI.getType(Reg: Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
    Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
  }
  // K0 = 2^-32 (scales the truncated value down to its high dword);
  // K1 = -2^32 (used in the fma to recover the low dword remainder).
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
        Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
        Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
  auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
  // Fma = Trunc - FloorMul * 2^32, i.e. the non-negative low 32-bit part.
  auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);

  // Only the high dword carries the sign for the signed f64 path.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
                                     : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
  auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
               Src1: Sign);
  } else
    B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2890
2891bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2892 MachineInstr &MI) const {
2893 MachineFunction &MF = Helper.MIRBuilder.getMF();
2894 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2895
2896 // With ieee_mode disabled, the instructions have the correct behavior.
2897 if (!MFI->getMode().IEEE)
2898 return true;
2899
2900 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2901}
2902
2903bool AMDGPULegalizerInfo::legalizeExtract(LegalizerHelper &Helper,
2904 MachineInstr &MI) const {
2905 MachineIRBuilder &B = Helper.MIRBuilder;
2906 MachineRegisterInfo &MRI = *B.getMRI();
2907 Register DstReg = MI.getOperand(i: 0).getReg();
2908 Register SrcReg = MI.getOperand(i: 1).getReg();
2909 uint64_t Offset = MI.getOperand(i: 2).getImm();
2910
2911 // Fall back to generic lowering for offset 0 (trivial trunc) and
2912 // non-32-bit-aligned cases which require shift+trunc sequences
2913 // that generic code handles correctly.
2914 if (Offset == 0 || Offset % 32 != 0)
2915 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2916
2917 const LLT DstTy = MRI.getType(Reg: DstReg);
2918 unsigned StartIdx = Offset / 32;
2919 unsigned DstCount = DstTy.getSizeInBits() / 32;
2920 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: SrcReg);
2921
2922 if (DstCount == 1) {
2923 if (DstTy.isPointer())
2924 B.buildIntToPtr(Dst: DstReg, Src: Unmerge.getReg(Idx: StartIdx));
2925 else
2926 MRI.replaceRegWith(FromReg: DstReg, ToReg: Unmerge.getReg(Idx: StartIdx));
2927 } else {
2928 SmallVector<Register, 8> MergeVec;
2929 for (unsigned I = 0; I < DstCount; ++I)
2930 MergeVec.push_back(Elt: Unmerge.getReg(Idx: StartIdx + I));
2931 B.buildMergeLikeInstr(Res: DstReg, Ops: MergeVec);
2932 }
2933
2934 MI.eraseFromParent();
2935 return true;
2936}
2937
2938bool AMDGPULegalizerInfo::legalizeInsert(LegalizerHelper &Helper,
2939 MachineInstr &MI) const {
2940 MachineIRBuilder &B = Helper.MIRBuilder;
2941 MachineRegisterInfo &MRI = *B.getMRI();
2942 Register DstReg = MI.getOperand(i: 0).getReg();
2943 Register SrcReg = MI.getOperand(i: 1).getReg();
2944 Register InsertSrc = MI.getOperand(i: 2).getReg();
2945 uint64_t Offset = MI.getOperand(i: 3).getImm();
2946
2947 unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
2948 const LLT InsertTy = MRI.getType(Reg: InsertSrc);
2949 unsigned InsertSize = InsertTy.getSizeInBits();
2950
2951 // Fall back to generic lowering for non-32-bit-aligned cases which
2952 // require shift+mask sequences that generic code handles correctly.
2953 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2954 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2955
2956 const LLT S32 = LLT::scalar(SizeInBits: 32);
2957 unsigned DstCount = DstSize / 32;
2958 unsigned InsertCount = InsertSize / 32;
2959 unsigned StartIdx = Offset / 32;
2960
2961 auto SrcUnmerge = B.buildUnmerge(Res: S32, Op: SrcReg);
2962
2963 SmallVector<Register, 8> MergeVec;
2964 for (unsigned I = 0; I < StartIdx; ++I)
2965 MergeVec.push_back(Elt: SrcUnmerge.getReg(Idx: I));
2966
2967 if (InsertCount == 1) {
2968 // Merge-like instructions require same source types. Convert pointer
2969 // to scalar when inserting a pointer value into a scalar.
2970 if (InsertTy.isPointer())
2971 InsertSrc = B.buildPtrToInt(Dst: S32, Src: InsertSrc).getReg(Idx: 0);
2972 MergeVec.push_back(Elt: InsertSrc);
2973 } else {
2974 auto InsertUnmerge = B.buildUnmerge(Res: S32, Op: InsertSrc);
2975 for (unsigned I = 0; I < InsertCount; ++I)
2976 MergeVec.push_back(Elt: InsertUnmerge.getReg(Idx: I));
2977 }
2978
2979 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
2980 MergeVec.push_back(Elt: SrcUnmerge.getReg(Idx: I));
2981
2982 B.buildMergeLikeInstr(Res: DstReg, Ops: MergeVec);
2983
2984 MI.eraseFromParent();
2985 return true;
2986}
2987
// Legalize G_EXTRACT_VECTOR_ELT: rewrite wide-pointer element types through
// integer vectors, and lower constant-index extracts to unmerge+copy.
// Dynamic indices are left for instruction selection (register indexing).
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Vec = MI.getOperand(i: 1).getReg();

  LLT VecTy = MRI.getType(Reg: Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, introduce an intermediate
  // vector of integers using ptrtoint (and inttoptr on the output) in order to
  // drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);

    auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
    auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
    B.buildIntToPtr(Dst, Src: IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < VecTy.getNumElements()) {
    // In-bounds constant index: split the vector and copy out the element.
    auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
    B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
  } else {
    // Out-of-bounds extracts produce an undefined value.
    B.buildUndef(Res: Dst);
  }

  MI.eraseFromParent();
  return true;
}
3038
// Legalize G_INSERT_VECTOR_ELT: rewrite wide-pointer element types through
// integer vectors, and lower constant-index inserts to unmerge+merge with
// the element substituted. Dynamic indices are left for instruction
// selection (register indexing).
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Vec = MI.getOperand(i: 1).getReg();
  Register Ins = MI.getOperand(i: 2).getReg();

  LLT VecTy = MRI.getType(Reg: Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, make the pointer vector
  // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
  // new value, and then inttoptr the result vector back. This will then allow
  // the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);

    auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
    auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
    auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
                                                 Idx: MI.getOperand(i: 3));
    B.buildIntToPtr(Dst, Src: IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    // In-bounds constant index: split out every element, replace the one at
    // IdxVal with the inserted value, and rebuild the vector.
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
    B.buildUnmerge(Res: SrcRegs, Op: Vec);

    SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
    B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
  } else {
    // Out-of-bounds inserts produce an undefined vector.
    B.buildUndef(Res: Dst);
  }

  MI.eraseFromParent();
  return true;
}
3099
3100bool AMDGPULegalizerInfo::legalizeSinCos(
3101 MachineInstr &MI, MachineRegisterInfo &MRI,
3102 MachineIRBuilder &B) const {
3103
3104 Register DstReg = MI.getOperand(i: 0).getReg();
3105 Register SrcReg = MI.getOperand(i: 1).getReg();
3106 LLT Ty = MRI.getType(Reg: DstReg);
3107 unsigned Flags = MI.getFlags();
3108
3109 Register TrigVal;
3110 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
3111 if (ST.hasTrigReducedRange()) {
3112 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
3113 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
3114 .addUse(RegNo: MulVal.getReg(Idx: 0))
3115 .setMIFlags(Flags)
3116 .getReg(Idx: 0);
3117 } else
3118 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
3119
3120 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3121 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3122 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
3123 .addUse(RegNo: TrigVal)
3124 .setMIFlags(Flags);
3125 MI.eraseFromParent();
3126 return true;
3127}
3128
// Materialize the address of \p GV into \p DstReg using pc-relative
// addressing.
//
// \p GAFlags selects the relocation kind (e.g. MO_NONE, MO_REL32,
// MO_GOTPCREL32); GAFlags + 1 is the corresponding @hi relocation and
// GAFlags + 2 the 64-bit-literal variant. For a 32-bit \p PtrTy the 64-bit
// pc-relative result is built in a temporary and the low half extracted.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.

  LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);

  // The pseudo always defines a 64-bit result; only reuse DstReg when the
  // destination itself is 64-bit.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);

  if (ST.has64BitLiterals()) {
    // Single relocation using a 64-bit literal (GAFlags + 2 form).
    assert(GAFlags != SIInstrInfo::MO_NONE);

    MachineInstrBuilder MIB =
        B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(RegNo: PCReg);
    MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 2);
  } else {
    MachineInstrBuilder MIB =
        B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(RegNo: PCReg);

    MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
    if (GAFlags == SIInstrInfo::MO_NONE)
      MIB.addImm(Val: 0);
    else
      // Second operand carries the @hi half of the relocation.
      MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
  }

  if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
    B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
  return true;
}
3188
// Emit a ABS32_LO / ABS32_HI relocation stub.
//
// Materializes the absolute address of \p GV into \p DstReg via S_MOV_B32
// moves carrying MO_ABS32_LO/MO_ABS32_HI relocations (or a single S_MOV_B64
// with MO_ABS64 when 64-bit literals are available). Care is taken to only
// write DstReg directly when no register class has been assigned to it, so
// existing constraints are respected.
void AMDGPULegalizerInfo::buildAbsGlobalAddress(
    Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
    MachineRegisterInfo &MRI) const {
  bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;

  // Fast path: one 64-bit move with an absolute 64-bit relocation.
  if (RequiresHighHalf && ST.has64BitLiterals()) {
    if (!MRI.getRegClassOrNull(Reg: DstReg))
      MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_64RegClass);
    B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
        .addDef(RegNo: DstReg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS64);
    return;
  }

  LLT S32 = LLT::scalar(SizeInBits: 32);

  // Use the destination directly, if and only if we store the lower address
  // part only and we don't have a register class being set.
  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
                        ? DstReg
                        : MRI.createGenericVirtualRegister(Ty: S32);

  if (!MRI.getRegClassOrNull(Reg: AddrLo))
    MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);

  // Write the lower half.
  B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
      .addDef(RegNo: AddrLo)
      .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);

  // If required, write the upper half as well.
  if (RequiresHighHalf) {
    assert(PtrTy.getSizeInBits() == 64 &&
           "Must provide a 64-bit pointer type!");

    Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
    MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);

    B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
        .addDef(RegNo: AddrHi)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);

    // Use the destination directly, if and only if we don't have a register
    // class being set.
    Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
                           ? DstReg
                           : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));

    if (!MRI.getRegClassOrNull(Reg: AddrDst))
      MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});

    // If we created a new register for the destination, cast the result into
    // the final output.
    if (AddrDst != DstReg)
      B.buildCast(Dst: DstReg, Src: AddrDst);
  } else if (AddrLo != DstReg) {
    // If we created a new register for the destination, cast the result into
    // the final output.
    B.buildCast(Dst: DstReg, Src: AddrLo);
  }
}
3253
// Legalize G_GLOBAL_VALUE based on the address space and how the symbol can
// be reached: LDS globals become constant offsets into the group segment,
// PAL/Mesa targets use absolute relocations, and everything else is either a
// direct pc-relative address or a load from the GOT.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS globals are only valid when reachable from a kernel entry point
    // (module LDS and named barriers are handled specially elsewhere).
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      const Function &Fn = MF.getFunction();
      Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildTrap();
      B.buildUndef(Res: DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      // Leave the G_GLOBAL_VALUE in place, but mark it for an absolute
      // low-32 relocation so selection can handle it.
      MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(F: MF.getFunction(), GV: GVar);
        LLT S32 = LLT::scalar(SizeInBits: 32);
        // The dynamic LDS block starts right after all static allocations,
        // i.e. at groupstaticsize.
        auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
        B.buildIntToPtr(Dst: DstReg, Src: Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Static LDS: the address is just the variable's offset within the
    // group segment.
    B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(), GV: GVar));
    MI.eraseFromParent();
    return true;
  }

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Symbol resolvable at link time with a plain pc-relative fixup.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
    MI.eraseFromParent();
    return true;
  }

  // Symbol needing a REL32 relocation pair.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise, go through the GOT: compute the GOT slot address
  // pc-relatively and load the final address from it.
  LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo::getGOT(MF),
      f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemTy: LoadTy, base_alignment: Align(8));

  buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
    B.buildExtract(Res: DstReg, Src: Load, Index: 0);
  } else
    B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);

  MI.eraseFromParent();
  return true;
}
3359
3360static LLT widenToNextPowerOf2(LLT Ty) {
3361 if (Ty.isVector())
3362 return Ty.changeElementCount(
3363 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3364 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3365}
3366
// Custom legalization for loads:
//  * Promotes 32-bit constant-address-space pointers to the 64-bit constant
//    address space.
//  * Applies the buffer-resource (v4i32) result-type workaround.
//  * Widens non-power-of-2 G_LOADs to the next power of 2 when alignment
//    allows, truncating/extracting the result back to the requested type.
// Returns true if the instruction was changed; false to let the generic
// legalizer proceed.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(i: 1).getReg();
  LLT PtrTy = MRI.getType(Reg: PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Rewrite the pointer operand through an addrspacecast to the 64-bit
    // constant address space.
    LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
    auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
    Observer.changedInstr(MI);
    return true;
  }

  // The remaining transforms only apply to plain G_LOAD (not ext-loads).
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(i: 0).getReg();
  LLT ValTy = MRI.getType(Reg: ValReg);

  if (hasBufferRsrcWorkaround(Ty: ValTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    return true;
  }

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, MemRefs: {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(Ty: ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate down to the requested width.
      WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
      B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ST, Ty: ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
        B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
        B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
3455
3456bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3457 MachineInstr &MI) const {
3458 MachineIRBuilder &B = Helper.MIRBuilder;
3459 MachineRegisterInfo &MRI = *B.getMRI();
3460 GISelChangeObserver &Observer = Helper.Observer;
3461
3462 Register DataReg = MI.getOperand(i: 0).getReg();
3463 LLT DataTy = MRI.getType(Reg: DataReg);
3464
3465 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3466 Observer.changingInstr(MI);
3467 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3468 Observer.changedInstr(MI);
3469 return true;
3470 }
3471 return false;
3472}
3473
// Legalize G_FMAD. It stays legal only when the function's FP mode flushes
// denormals (preserve-sign) for the given type; otherwise it is lowered to
// the generic mul+add sequence via LegalizerHelper::lowerFMad.
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // TODO: Type is expected to be LLT::float32()/LLT::float16()
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(SizeInBits: 32) &&
      MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
    return true;
  if (Ty == LLT::scalar(SizeInBits: 16) &&
      MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
    return true;

  // Lower with a local builder and a dummy observer; lowerFMad inserts the
  // replacement instructions itself.
  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}
3498
// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target-specific
// G_AMDGPU_ATOMIC_CMPXCHG, which packs the new and compare values into a
// single two-element vector operand.
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register PtrReg = MI.getOperand(i: 1).getReg();
  Register CmpVal = MI.getOperand(i: 2).getReg();
  Register NewVal = MI.getOperand(i: 3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(Reg: CmpVal);
  LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);

  // Note the lane order: new value in element 0, compare value in element 1.
  Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);

  B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .addDef(RegNo: DstReg)
      .addUse(RegNo: PtrReg)
      .addUse(RegNo: PackedVal)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}
3523
/// Return true if it's known that \p Src can never be an f32 denormal value.
/// Used to skip the denormal-scaling sequences around log/exp expansions.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
                                       Register Src) {
  const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    // Results of these hardware intrinsics are treated as never denormal.
    switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  case TargetOpcode::G_FSQRT:
    return true;
  case TargetOpcode::G_FFREXP: {
    // Only the first result (the mantissa) is known never-denormal.
    if (DefMI->getOperand(i: 0).getReg() == Src)
      return true;
    break;
  }
  case TargetOpcode::G_FPEXT: {
    // No f16 value widens to an f32 denormal.
    return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
  }
  default:
    return false;
  }

  return false;
}
3559
3560static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3561 return Flags & MachineInstr::FmAfn;
3562}
3563
3564static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3565 unsigned Flags) {
3566 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3567 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3568 DenormalMode::PreserveSign;
3569}
3570
// If \p Src may be an f32 denormal and denormals aren't flushed, multiply it
// by 2^32 so the hardware log sees a normalized value.
//
// Returns {scaled input, i1 "was scaled" predicate}, or a pair of null
// registers when no handling is needed. Callers compensate the final result
// by subtracting 32 * log_base(2) when the predicate is set.
std::pair<Register, Register>
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
                                       unsigned Flags) const {
  if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
    return {};

  const LLT F32 = LLT::scalar(SizeInBits: 32);
  // Only scale when the input is below the smallest normalized f32.
  auto SmallestNormal = B.buildFConstant(
      Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
  auto IsLtSmallestNormal =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);

  auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
  auto One = B.buildFConstant(Res: F32, Val: 1.0);
  auto ScaleFactor =
      B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
  auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);

  return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
}
3591
// Lower G_FLOG2 to the amdgcn_log intrinsic. f16 is promoted through f32
// (no f16 value is denormal after extension); f32 optionally gets the
// denormal scale/compensate sequence from getScaledLogInput.
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(SizeInBits: 16)) {
    const LLT F32 = LLT::scalar(SizeInBits: 32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
    auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
                    .addUse(RegNo: Ext.getReg(Idx: 0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    // No denormal handling required; use the hardware log directly.
    B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
        .addUse(RegNo: Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
                  .addUse(RegNo: ScaledInput)
                  .setMIFlags(Flags);

  // Undo the 2^32 input scale: log2(x * 2^32) - 32 == log2(x).
  auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto ResultOffset =
      B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
  B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}
3641
3642static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3643 Register Z, unsigned Flags) {
3644 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3645 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3646}
3647
// Expand G_FLOG / G_FLOG10 around the hardware log2 (amdgcn_log).
//
// f16 (and any afn-flagged op) takes the fast path through
// legalizeFlogUnsafe. Otherwise, for f32, the result is log2(x) * K with
// K = ln(2) (or ln(2)/ln(10) for log10) split into high/low parts for
// accuracy, plus denormal input scaling and a passthrough for non-finite
// log2 results.
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(Reg: X);

  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT F16 = LLT::scalar(SizeInBits: 16);

  if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn)) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.
    bool PromoteToF32 =
        Ty == F16 && (!MI.getFlag(Flag: MachineInstr::FmAfn) || !ST.has16BitInsts());
    if (PromoteToF32) {
      // Compute in f32 and truncate back; keeps accuracy without 16-bit
      // instructions.
      Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
      auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
      legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
      B.buildFPTrunc(Res: Dst, Op: LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);

  Register R;
  if (ST.hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    auto NewFlags = Flags & ~(MachineInstr::FmContract);
    R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags: NewFlags).getReg(Idx: 0);
    auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags: NewFlags);
    auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags: NewFlags);
    auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags: NewFlags);
    R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags: NewFlags).getReg(Idx: 0);
  } else {
    // Without fast FMA, split y into high (masked) and low parts and combine
    // the partial products with separate mul/add pairs.
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);

    // Keep only the high mantissa bits of y for the exact product.
    auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
    auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
    auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    auto NewFlags = Flags & ~(MachineInstr::FmContract);
    auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags: NewFlags);

    Register Mad0 =
        getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags: NewFlags);
    Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags: NewFlags);
    R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags: NewFlags);
  }

  const bool IsFiniteOnly =
      MI.getFlag(Flag: MachineInstr::FmNoNans) && MI.getFlag(Flag: MachineInstr::FmNoInfs);

  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    // Pass the raw log2 result through unchanged for inf/nan inputs.
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
    auto IsFinite =
        B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
    R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
  }

  if (ScaledInput) {
    // Compensate the 2^32 input scale: subtract 32 * log_base(2).
    auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
    auto ShiftK =
        B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
    B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
  } else {
    B.buildCopy(Res: Dst, Op: R);
  }

  MI.eraseFromParent();
  return true;
}
3758
3759bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3760 Register Src, bool IsLog10,
3761 unsigned Flags) const {
3762 const double Log2BaseInverted =
3763 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3764
3765 LLT Ty = B.getMRI()->getType(Reg: Dst);
3766
3767 if (Ty == LLT::scalar(SizeInBits: 32)) {
3768 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3769 if (ScaledInput) {
3770 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3771 .addUse(RegNo: Src)
3772 .setMIFlags(Flags);
3773 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3774 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3775 auto ResultOffset =
3776 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3777 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3778
3779 if (ST.hasFastFMAF32())
3780 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3781 else {
3782 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3783 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3784 }
3785
3786 return true;
3787 }
3788 }
3789
3790 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3791 ? B.buildFLog2(Dst: Ty, Src, Flags)
3792 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3793 .addUse(RegNo: Src)
3794 .setMIFlags(Flags);
3795 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3796 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3797 return true;
3798}
3799
// Lower G_FEXP2 to the amdgcn_exp2 intrinsic. f64 goes through the extended
// expansion, f16 is promoted through f32, and f32 optionally gets a
// denormal-result sequence: shift small inputs up by 64 and multiply the
// result by 2^-64 afterwards.
bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  const LLT F16 = LLT::scalar(SizeInBits: 16);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT F64 = LLT::scalar(SizeInBits: 64);

  if (Ty == F64)
    return legalizeFEXPF64(MI, B);

  if (Ty == F16) {
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
    auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
                    .addUse(RegNo: Ext.getReg(Idx: 0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
    // Denormal results don't matter; use the hardware exp2 directly.
    B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
        .addUse(RegNo: Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
  auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
                                  Op1: RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
  auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: AddInput.getReg(Idx: 0))
                  .setMIFlags(Flags);

  // Undo the +64 input offset: exp2(x + 64) * 2^-64 == exp2(x).
  auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
  auto One = B.buildFConstant(Res: Ty, Val: 1.0);
  auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
  B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}
3861
3862static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3863 const SrcOp &Src, unsigned Flags) {
3864 LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
3865
3866 if (Ty == LLT::scalar(SizeInBits: 32)) {
3867 return B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Dst})
3868 .addUse(RegNo: Src.getReg())
3869 .setMIFlags(Flags);
3870 }
3871 return B.buildFExp2(Dst, Src, Flags);
3872}
3873
3874bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3875 Register Dst, Register X,
3876 unsigned Flags,
3877 bool IsExp10) const {
3878 LLT Ty = B.getMRI()->getType(Reg: X);
3879
3880 // exp(x) -> exp2(M_LOG2E_F * x);
3881 // exp10(x) -> exp2(log2(10) * x);
3882 auto Const = B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3883 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Const, Flags);
3884 buildExp(B, Dst, Src: Mul, Flags);
3885 return true;
3886}
3887
// Fast expansion of G_FEXP: exp(x) = exp2(x * log2(e)).
//
// When f32 denormal results must be handled, inputs below the threshold are
// shifted up by 64 before the exp2 (exp(x + 64) = exp(x) * e^64) and the
// result is rescaled by e^-64 (0x1.969d48p-93f).
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Reg: Dst);
  LLT F32 = LLT::scalar(SizeInBits: 32);

  if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
    return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
  }

  auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
  auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);

  auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
  auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: ExpInput.getReg(Idx: 0))
                  .setMIFlags(Flags);

  // e^-64, compensating the +64 input offset on the scaled path.
  auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
  B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
  return true;
}
3916
3917bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
3918 Register Dst, Register X,
3919 unsigned Flags) const {
3920 LLT Ty = B.getMRI()->getType(Reg: Dst);
3921 LLT F32 = LLT::scalar(SizeInBits: 32);
3922
3923 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3924 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3925 auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
3926 auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);
3927
3928 auto Mul1 = B.buildFMul(Dst: Ty, Src0: X, Src1: K1, Flags);
3929 auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
3930 auto Mul0 = B.buildFMul(Dst: Ty, Src0: X, Src1: K0, Flags);
3931 auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
3932 B.buildFMul(Dst, Src0: Exp2_0, Src1: Exp2_1, Flags);
3933 return true;
3934 }
3935
3936 // bool s = x < -0x1.2f7030p+5f;
3937 // x += s ? 0x1.0p+5f : 0.0f;
3938 // exp10 = exp2(x * 0x1.a92000p+1f) *
3939 // exp2(x * 0x1.4f0978p-11f) *
3940 // (s ? 0x1.9f623ep-107f : 1.0f);
3941
3942 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.2f7030p+5f);
3943 auto NeedsScaling =
3944 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold);
3945
3946 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+5f);
3947 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3948 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X);
3949
3950 auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
3951 auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);
3952
3953 auto Mul1 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K1, Flags);
3954 auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
3955 auto Mul0 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K0, Flags);
3956 auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
3957
3958 auto MulExps = B.buildFMul(Dst: Ty, Src0: Exp2_0, Src1: Exp2_1, Flags);
3959 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.9f623ep-107f);
3960 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: MulExps, Src1: ResultScaleFactor, Flags);
3961
3962 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: MulExps);
3963 return true;
3964}
3965
// This expansion gives a result slightly better than 1ulp.
//
// Lowers f64 G_FEXP2 / G_FEXP10 / G_FEXP via argument reduction: split the
// scaled input into an integer part Dn and a reduced argument T (formed with
// paired high/low constants for extra precision), evaluate a polynomial
// approximation of exp(T) with a Horner FMA chain, and reconstruct the result
// as ldexp(P, (int)Dn). Out-of-range inputs are clamped to +inf / 0.0 at the
// end.
bool AMDGPULegalizerInfo::legalizeFEXPF64(MachineInstr &MI,
                                          MachineIRBuilder &B) const {

  Register X = MI.getOperand(i: 1).getReg();
  LLT S64 = LLT::scalar(SizeInBits: 64);
  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  // TODO: Check if reassoc is safe. There is an output change in exp2 and
  // exp10, which slightly increases ulp.
  unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;

  // Dn = integer part of the reduction, F = reduced fraction,
  // T = argument fed to the polynomial (F is only materialized where needed).
  Register Dn, F, T;

  if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
    // Dn = rint(X)
    Dn = B.buildFRint(Dst: S64, Src0: X, Flags).getReg(Idx: 0);
    // F = X - Dn
    F = B.buildFSub(Dst: S64, Src0: X, Src1: Dn, Flags).getReg(Idx: 0);
    // T = F*C1 + F*C2
    // C1/C2 are a high/low split of ln(2), so T = F * ln(2) with extra
    // precision and exp(T) = 2^F.
    auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.62e42fefa39efp-1));
    auto C2 = B.buildFConstant(Res: S64, Val: APFloat(0x1.abc9e3b39803fp-56));
    auto Mul2 = B.buildFMul(Dst: S64, Src0: F, Src1: C2, Flags).getReg(Idx: 0);
    T = B.buildFMA(Dst: S64, Src0: F, Src1: C1, Src2: Mul2, Flags).getReg(Idx: 0);

  } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
    // Dn = rint(x * log2(10))
    auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.a934f0979a371p+1));
    auto Mul = B.buildFMul(Dst: S64, Src0: X, Src1: C1, Flags).getReg(Idx: 0);
    Dn = B.buildFRint(Dst: S64, Src0: Mul, Flags).getReg(Idx: 0);

    // F = x - Dn * log10(2), using a high (C3) / low (C2) split of log10(2)
    // so the subtraction retains extra precision.
    auto NegDn = B.buildFNeg(Dst: S64, Src0: Dn, Flags).getReg(Idx: 0);
    auto C2 = B.buildFConstant(Res: S64, Val: APFloat(-0x1.9dc1da994fd21p-59));
    auto C3 = B.buildFConstant(Res: S64, Val: APFloat(0x1.34413509f79ffp-2));
    auto Inner = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C3, Src2: X, Flags).getReg(Idx: 0);
    F = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C2, Src2: Inner, Flags).getReg(Idx: 0);

    // T = F * ln(10), again as a high (C4) / low (C5) product, so that
    // exp(T) = 10^F.
    auto C4 = B.buildFConstant(Res: S64, Val: APFloat(0x1.26bb1bbb55516p+1));
    auto C5 = B.buildFConstant(Res: S64, Val: APFloat(-0x1.f48ad494ea3e9p-53));
    auto MulF = B.buildFMul(Dst: S64, Src0: F, Src1: C5, Flags).getReg(Idx: 0);
    T = B.buildFMA(Dst: S64, Src0: F, Src1: C4, Src2: MulF, Flags).getReg(Idx: 0);

  } else { // G_FEXP
    // Dn = rint(x * log2(e))
    auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.71547652b82fep+0));
    auto Mul = B.buildFMul(Dst: S64, Src0: X, Src1: C1, Flags).getReg(Idx: 0);
    Dn = B.buildFRint(Dst: S64, Src0: Mul, Flags).getReg(Idx: 0);

    // T = x - Dn * ln(2), using a high (C3) / low (C2) split of ln(2).
    auto NegDn = B.buildFNeg(Dst: S64, Src0: Dn, Flags).getReg(Idx: 0);
    auto C2 = B.buildFConstant(Res: S64, Val: APFloat(0x1.abc9e3b39803fp-56));
    auto C3 = B.buildFConstant(Res: S64, Val: APFloat(0x1.62e42fefa39efp-1));
    auto Inner = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C3, Src2: X, Flags).getReg(Idx: 0);
    T = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C2, Src2: Inner, Flags).getReg(Idx: 0);
  }

  // Horner FMA chain for the polynomial P approximating exp(T); the two
  // trailing FMAs with 1.0 fold in the leading 1 + T*(1 + ...) terms of the
  // series.
  auto P = B.buildFConstant(Res: S64, Val: 0x1.ade156a5dcb37p-26);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.28af3fca7ab0cp-22),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.71dee623fde64p-19),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.a01997c89e6b0p-16),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.a01a014761f6ep-13),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.6c16c1852b7b0p-10),
                 Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.1111111122322p-7), Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.55555555502a1p-5), Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.5555555555511p-3), Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.000000000000bp-1), Flags);

  auto One = B.buildFConstant(Res: S64, Val: 1.0);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: One, Flags);
  P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: One, Flags);

  // Z = FLDEXP(P, (int)Dn)
  auto DnInt = B.buildFPTOSI(Dst: S32, Src0: Dn);
  auto Z = B.buildFLdexp(Dst: S64, Src0: P, Src1: DnInt, Flags);

  if (!(Flags & MachineInstr::FmNoInfs)) {
    // Overflow guard: if X <= 1024.0 then Z else +inf
    // (unordered <= keeps NaN inputs on the computed path, so NaN
    // propagates through Z).
    auto CondHi = B.buildFCmp(Pred: CmpInst::FCMP_ULE, Res: S1, Op0: X,
                              Op1: B.buildFConstant(Res: S64, Val: APFloat(1024.0)));
    auto PInf = B.buildFConstant(Res: S64, Val: APFloat::getInf(Sem: APFloat::IEEEdouble()));
    Z = B.buildSelect(Res: S64, Tst: CondHi, Op0: Z, Op1: PInf, Flags);
  }

  // Underflow guard: if X >= -1075.0 then Z else 0.0
  // (unordered >= likewise keeps NaN inputs on the computed path).
  auto CondLo = B.buildFCmp(Pred: CmpInst::FCMP_UGE, Res: S1, Op0: X,
                            Op1: B.buildFConstant(Res: S64, Val: APFloat(-1075.0)));
  auto Zero = B.buildFConstant(Res: S64, Val: APFloat(0.0));
  B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: CondLo, Op0: Z, Op1: Zero, Flags);

  MI.eraseFromParent();
  return true;
}
4062
// Lower G_FEXP / G_FEXP10 for f16 and f32 (f64 is forwarded to
// legalizeFEXPF64). With afn the cheap exp2-based expansions are used;
// otherwise f16 is promoted to f32 (nothing in half becomes a denormal in
// f32), and f32 uses an extended-precision reduction around v_exp_f32 with
// explicit underflow/overflow clamping.
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  const unsigned Flags = MI.getFlags();
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT Ty = MRI.getType(Reg: Dst);

  const LLT F64 = LLT::scalar(SizeInBits: 64);

  if (Ty == F64)
    return legalizeFEXPF64(MI, B);

  const LLT F16 = LLT::scalar(SizeInBits: 16);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

  if (Ty == F16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(MF, Flags)) {
      // TODO: Does this really require fast?
      IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
              : legalizeFExpUnsafe(B, Dst, X, Flags);
      MI.eraseFromParent();
      return true;
    }

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
    legalizeFExpUnsafeImpl(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags, IsExp10);
    B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(MF, Flags)) {
    IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
            : legalizeFExpUnsafe(B, Dst, X, Flags);
    MI.eraseFromParent();
    return true;
  }

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
  Register PH, PL;

  // Compute PH + PL ~= X * log2(e) (or X * log2(10) for exp10) as an
  // extended-precision product.
  if (ST.hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
    // FMA recovers the rounding error of the multiply into PL.
    auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
    auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);

    auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
  } else {
    // Without fast f32 FMA, split X into a high part (low 12 bits masked off)
    // and a low remainder, then accumulate the cross products with mads.
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
    auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
    auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);

    auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);

    auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);

    Register Mad0 =
        getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
    PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
  }

  // E = integer part of the scaled input; A = remaining fraction.
  auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
  auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
  auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);

  // Result = exp2(A) * 2^E, the latter applied via ldexp.
  auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
                  .addUse(RegNo: A.getReg(Idx: 0))
                  .setMIFlags(Flags);
  auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);

  // Inputs below this threshold underflow to zero.
  auto UnderflowCheckConst =
      B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
  auto Underflow =
      B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);

  R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);

  // With ninf, the overflow clamp to +inf can be skipped.
  if (!(Flags & MachineInstr::FmNoInfs)) {
    auto OverflowCheckConst =
        B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

    auto Overflow =
        B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
    R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
  }

  B.buildCopy(Res: Dst, Op: R);
  MI.eraseFromParent();
  return true;
}
4213
4214bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
4215 MachineIRBuilder &B) const {
4216 Register Dst = MI.getOperand(i: 0).getReg();
4217 Register Src0 = MI.getOperand(i: 1).getReg();
4218 Register Src1 = MI.getOperand(i: 2).getReg();
4219 unsigned Flags = MI.getFlags();
4220 LLT Ty = B.getMRI()->getType(Reg: Dst);
4221 const LLT F16 = LLT::scalar(SizeInBits: 16); // TODO: Expected LLT::float16()
4222 const LLT F32 = LLT::scalar(SizeInBits: 32); // TODO: Expected LLT::float32()
4223
4224 if (Ty == F32) {
4225 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
4226 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4227 .addUse(RegNo: Log.getReg(Idx: 0))
4228 .addUse(RegNo: Src1)
4229 .setMIFlags(Flags);
4230 B.buildFExp2(Dst, Src: Mul, Flags);
4231 } else if (Ty == F16) {
4232 // There's no f16 fmul_legacy, so we need to convert for it.
4233 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
4234 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
4235 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
4236 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4237 .addUse(RegNo: Ext0.getReg(Idx: 0))
4238 .addUse(RegNo: Ext1.getReg(Idx: 0))
4239 .setMIFlags(Flags);
4240 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
4241 } else
4242 return false;
4243
4244 MI.eraseFromParent();
4245 return true;
4246}
4247
4248// Find a source register, ignoring any possible source modifiers.
4249static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4250 Register ModSrc = OrigSrc;
4251 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
4252 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
4253 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4254 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4255 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4256 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4257 return ModSrc;
4258}
4259
4260bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4261 MachineRegisterInfo &MRI,
4262 MachineIRBuilder &B) const {
4263
4264 const LLT S1 = LLT::scalar(SizeInBits: 1);
4265 const LLT F64 = LLT::scalar(SizeInBits: 64); // TODO: Expected float64
4266 Register Dst = MI.getOperand(i: 0).getReg();
4267 Register OrigSrc = MI.getOperand(i: 1).getReg();
4268 unsigned Flags = MI.getFlags();
4269 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4270 "this should not have been custom lowered");
4271
4272 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4273 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4274 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4275 // V_FRACT bug is:
4276 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4277 //
4278 // Convert floor(x) to (x - fract(x))
4279
4280 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
4281 .addUse(RegNo: OrigSrc)
4282 .setMIFlags(Flags);
4283
4284 // Give source modifier matching some assistance before obscuring a foldable
4285 // pattern.
4286
4287 // TODO: We can avoid the neg on the fract? The input sign to fract
4288 // shouldn't matter?
4289 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4290
4291 auto Const =
4292 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
4293
4294 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
4295
4296 // We don't need to concern ourselves with the snan handling difference, so
4297 // use the one which will directly select.
4298 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4299 if (MFI->getMode().IEEE)
4300 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
4301 else
4302 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
4303
4304 Register CorrectedFract = Min;
4305 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
4306 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
4307 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
4308 }
4309
4310 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
4311 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
4312
4313 MI.eraseFromParent();
4314 return true;
4315}
4316
4317// Turn an illegal packed v2s16 build vector into bit operations.
4318// TODO: This should probably be a bitcast action in LegalizerHelper.
4319bool AMDGPULegalizerInfo::legalizeBuildVector(
4320 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4321 Register Dst = MI.getOperand(i: 0).getReg();
4322 const LLT S32 = LLT::scalar(SizeInBits: 32);
4323 const LLT S16 = LLT::scalar(SizeInBits: 16);
4324 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4325
4326 Register Src0 = MI.getOperand(i: 1).getReg();
4327 Register Src1 = MI.getOperand(i: 2).getReg();
4328
4329 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4330 assert(MRI.getType(Src0) == S32);
4331 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
4332 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
4333 }
4334
4335 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
4336 B.buildBitcast(Dst, Src: Merge);
4337
4338 MI.eraseFromParent();
4339 return true;
4340}
4341
// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
//
// Source and accumulation registers must all be 32-bits.
//
// TODO: When the multiply is uniform, we should produce a code sequence
// that is better suited to instruction selection on the SALU. Instead of
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
                                        MutableArrayRef<Register> Accum,
                                        ArrayRef<Register> Src0,
                                        ArrayRef<Register> Src1,
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelValueTracking &VT = *Helper.getValueTracking();

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  // Lazily-materialized zero constants, shared across the whole expansion.
  Register Zero32;
  Register Zero64;

  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
    return Zero64;
  };

  // Record which 32-bit input parts are known to be zero so their partial
  // products can be skipped entirely.
  SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
  for (unsigned i = 0; i < Src0.size(); ++i) {
    Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
    Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
  }

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          if (!LocalAccum) {
            // A lone carry into an empty accumulator is just its zext.
            LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          // Sum all but the last carry into CarryAccum; the last is folded
          // into the final uadde below.
          CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
                    .getReg(Idx: 0);
          }

          if (!LocalAccum) {
            // Accumulator was empty; synthesize a zero so the final add still
            // works, but there is no caller-visible carry-out in this case.
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        auto Add =
            B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
        LocalAccum = Add.getReg(Idx: 0);
        return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //              + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
          -> Carry {
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Skip multiplication if one of the operands is 0
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
            if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
              LocalAccum[0] = Mul.getReg(Idx: 0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
              } else {
                // Consume one pending carry "for free" as part of this add.
                LocalAccum[0] =
                    B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
                        .getReg(Idx: 0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          // Seed the 64-bit accumulator from whatever 32-bit accumulator
          // state exists; HaveSmallAccum notes when the high half is known
          // zero so the first MAD's carry-out can be ignored.
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
                                    SrcOps: {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(Idx: 0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
            HaveSmallAccum = false;

            ++j0;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
          LocalAccum[0] = Unmerge.getReg(Idx: 0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(Idx: 1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0 -1
  //                                      ------
  //   Carries from previous iteration:     e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        // Compute the odd-aligned sum into temporaries, then add it into
        // Accum with an explicit carry chain (SeparateOddCarry).
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(N: IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          // First iteration has no incoming SeparateOddCarry yet.
          if (!IsHighest)
            Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
          else
            Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
                            CarryIn: SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
                                 CarryIn: Lo->getOperand(i: 1).getReg());
          Accum[2 * i] = Hi.getReg(Idx: 0);
          SeparateOddCarry = Hi.getReg(Idx: 1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(Elt: CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(Elt: CarryOut);
      }
    }
  }
}
4599
// Custom narrowing of wide multiplies using wide multiply-add instructions.
//
// TODO: If the multiply is followed by an addition, we should attempt to
// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
                                      MachineInstr &MI) const {
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register DstReg = MI.getOperand(i: 0).getReg();
  Register Src0 = MI.getOperand(i: 1).getReg();
  Register Src1 = MI.getOperand(i: 2).getReg();

  LLT Ty = MRI.getType(Reg: DstReg);
  assert(Ty.isScalar());

  unsigned Size = Ty.getSizeInBits();
  // Keep 64-bit multiplies intact on subtargets with a native vector
  // 64-bit mul.
  if (ST.hasVectorMulU64() && Size == 64)
    return true;

  unsigned NumParts = Size / 32;
  assert((Size % 32) == 0);
  assert(NumParts >= 2);

  // Whether to use MAD_64_32 for partial products whose high half is
  // discarded. This avoids some ADD instructions but risks false dependency
  // stalls on some subtargets in some cases.
  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;

  // Whether to compute odd-aligned partial products separately. This is
  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
  // in an even-aligned VGPR.
  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  // Split both operands into 32-bit pieces, multiply via buildMultiply, and
  // recombine the 32-bit result pieces.
  LLT S32 = LLT::scalar(SizeInBits: 32);
  SmallVector<Register, 2> Src0Parts, Src1Parts;
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
    Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
  }
  B.buildUnmerge(Res: Src0Parts, Op: Src0);
  B.buildUnmerge(Res: Src1Parts, Op: Src1);

  SmallVector<Register, 2> AccumRegs(NumParts);
  buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
  MI.eraseFromParent();
  return true;
}
4654
4655// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4656// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4657// case with a single min instruction instead of a compare+select.
4658bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4659 MachineRegisterInfo &MRI,
4660 MachineIRBuilder &B) const {
4661 Register Dst = MI.getOperand(i: 0).getReg();
4662 Register Src = MI.getOperand(i: 1).getReg();
4663 LLT DstTy = MRI.getType(Reg: Dst);
4664 LLT SrcTy = MRI.getType(Reg: Src);
4665
4666 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4667 ? AMDGPU::G_AMDGPU_FFBH_U32
4668 : AMDGPU::G_AMDGPU_FFBL_B32;
4669 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4670 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4671
4672 MI.eraseFromParent();
4673 return true;
4674}
4675
4676bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4677 MachineRegisterInfo &MRI,
4678 MachineIRBuilder &B) const {
4679 Register Dst = MI.getOperand(i: 0).getReg();
4680 Register Src = MI.getOperand(i: 1).getReg();
4681 LLT SrcTy = MRI.getType(Reg: Src);
4682 TypeSize NumBits = SrcTy.getSizeInBits();
4683
4684 assert(NumBits < 32u);
4685
4686 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4687 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4688 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4689 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4690 B.buildTrunc(Res: Dst, Op: Ctlz);
4691 MI.eraseFromParent();
4692 return true;
4693}
4694
4695bool AMDGPULegalizerInfo::legalizeCTLS(MachineInstr &MI,
4696 MachineRegisterInfo &MRI,
4697 MachineIRBuilder &B) const {
4698 Register Dst = MI.getOperand(i: 0).getReg();
4699 Register Src = MI.getOperand(i: 1).getReg();
4700 LLT SrcTy = MRI.getType(Reg: Src);
4701 const LLT S32 = LLT::scalar(SizeInBits: 32);
4702 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4703 unsigned BitWidth = SrcTy.getSizeInBits();
4704
4705 auto Sffbh = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32}).addUse(RegNo: Src);
4706 auto Clamped = B.buildUMin(Dst: S32, Src0: Sffbh, Src1: B.buildConstant(Res: S32, Val: BitWidth));
4707 B.buildSub(Dst, Src0: Clamped, Src1: B.buildConstant(Res: S32, Val: 1));
4708 MI.eraseFromParent();
4709 return true;
4710}
4711
4712// Check that this is a G_XOR x, -1
4713static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4714 if (MI.getOpcode() != TargetOpcode::G_XOR)
4715 return false;
4716 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4717 return ConstVal == -1;
4718}
4719
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// The condition output of a control-flow intrinsic must feed exactly one
// G_BRCOND in the same block, possibly through a single-use not
// (G_XOR x, -1) — in that case \p Negated is set and the xor is erased.
// \p Br is set to the G_BR following the conditional branch (if any), and
// \p UncondBrTarget to the block reached when the conditional branch is not
// taken (the G_BR's target, or the layout successor on fallthrough).
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(i: 0).getReg();
  // The condition must have exactly one non-debug use.
  if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);

  // Look through a single-use logical not of the condition.
  if (isNot(MRI, MI: *UseMI)) {
    Register NegatedCond = UseMI->getOperand(i: 0).getReg();
    if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(MI&: *UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
  if (Next == Parent->end()) {
    // Falls through to the next block in layout order.
    MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(i: 0).getMBB();
  }

  return UseMI;
}
4762
// Copy a preloaded argument described by \p Arg from its physical register
// into \p DstReg, inserting the function live-in if necessary. If the
// argument is packed into a sub-range of the register (Arg->isMasked()), the
// relevant bits are shifted down and masked out.
void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
                                              MachineIRBuilder &B,
                                              const ArgDescriptor *Arg,
                                              const TargetRegisterClass *ArgRC,
                                              LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
                                             RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(SizeInBits: 32);
    const unsigned Mask = Arg->getMask();
    // Amount to shift right so the field's low bit lands at bit 0.
    const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);

    Register AndMaskSrc = LiveIn;

    // TODO: Avoid clearing the high bits if we know workitem id y/z are always
    // 0.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
      AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
    }

    // Mask is given pre-shift, so shift it down to match the shifted value.
    B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
  } else {
    B.buildCopy(Res: DstReg, Op: LiveIn);
  }
}
4794
// Legalize a workgroup-id intrinsic for one dimension. Without cluster
// support this is a plain read of the preloaded workgroup id. With clusters,
// the hardware-provided "workgroup id" is really the cluster id, so the
// global workgroup id must be reconstructed from the cluster id, the cluster
// size, and the workgroup's id within the cluster.
bool AMDGPULegalizerInfo::legalizeWorkGroupId(
    MachineInstr &MI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!ST.hasClusters()) {
    // No clusters: the preloaded value already is the workgroup id.
    if (!loadInputValue(DstReg, B, ArgType: WorkGroupIdPV))
      return false;
    MI.eraseFromParent();
    return true;
  }

  // Clusters are supported. Return the global position in the grid. If clusters
  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.

  // WorkGroupIdXYZ = ClusterId == 0 ?
  //                    ClusterIdXYZ :
  //                    ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  Register ClusterIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
  if (!loadInputValue(DstReg: ClusterIdXYZ, B, ArgType: WorkGroupIdPV) ||
      !loadInputValue(DstReg: ClusterWorkGroupIdXYZ, B, ArgType: ClusterWorkGroupIdPV) ||
      !loadInputValue(DstReg: ClusterMaxIdXYZ, B, ArgType: ClusterMaxIdPV))
    return false;

  // Cluster size is the max id plus one.
  auto One = B.buildConstant(Res: S32, Val: 1);
  auto ClusterSizeXYZ = B.buildAdd(Dst: S32, Src0: ClusterMaxIdXYZ, Src1: One);
  auto GlobalIdXYZ = B.buildAdd(Dst: S32, Src0: ClusterWorkGroupIdXYZ,
                                Src1: B.buildMul(Dst: S32, Src0: ClusterIdXYZ, Src1: ClusterSizeXYZ));

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  switch (MFI->getClusterDims().getKind()) {
  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
  case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
    // Clusters are known to be in use: always use the reconstructed id.
    B.buildCopy(Res: DstReg, Op: GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
    // Known to launch without clusters: the preloaded value is the id.
    B.buildCopy(Res: DstReg, Op: ClusterIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
    // Not known at compile time whether clusters are in use: read the cluster
    // id field from the hardware status register and select at runtime.
    using namespace AMDGPU::Hwreg;
    unsigned ClusterIdField = HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4);
    Register ClusterId = MRI.createGenericVirtualRegister(Ty: S32);
    MRI.setRegClass(Reg: ClusterId, RC: &AMDGPU::SReg_32RegClass);
    B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
        .addDef(RegNo: ClusterId)
        .addImm(Val: ClusterIdField);
    auto Zero = B.buildConstant(Res: S32, Val: 0);
    auto NoClusters =
        B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: ClusterId, Op1: Zero);
    B.buildSelect(Res: DstReg, Tst: NoClusters, Op0: ClusterIdXYZ, Op1: GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  }

  llvm_unreachable("nothing should reach here");
}
4862
// Materialize the preloaded input value \p ArgType into \p DstReg. On targets
// with architected SGPRs (for compute / amdgpu_gfx calling conventions) the
// workgroup and cluster ids live in fixed TTMP registers and are described
// here directly; otherwise the argument descriptor comes from the function
// info. Returns false if the value cannot be handled yet.
bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg = nullptr;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  // Fixed register assignments for architected-SGPR targets.
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      Reg: AMDGPU::TTMP7,
      Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
  // Cluster workgroup ids and max ids are packed into 4-bit fields of TTMP6.
  const ArgDescriptor ClusterWorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
  const ArgDescriptor ClusterWorkGroupIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
  const ArgDescriptor ClusterWorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
  const ArgDescriptor ClusterWorkGroupMaxIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
  const ArgDescriptor ClusterWorkGroupMaxIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);

  // Helper to fold the input to a compile-time constant.
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(Res: DstReg, Val: N);
    return true;
  };

  if (ST.hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
    // With fixed cluster dimensions, several of these values are known
    // constants and need not be read from registers at all.
    bool HasFixedDims = ClusterDims.isFixedDims();

    switch (ArgType) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Arg = &WorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Arg = &WorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Arg = &WorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
      // A dimension of extent 1 always has id 0.
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
      // With fixed dims the max id is simply the dimension minus one.
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Arg = &ClusterWorkGroupMaxIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Arg = &ClusterWorkGroupMaxIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Arg = &ClusterWorkGroupMaxIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
      Arg = &ClusterWorkGroupMaxFlatID;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(SizeInBits: 32);
      break;
    default:
      break;
    }
  }

  // Everything else (or non-architected-SGPR targets): look up the descriptor
  // recorded in the machine function info.
  if (!Arg)
    std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
      return LoadConstant(0);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(Res: DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
  return true;
}
4996
4997bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4998 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4999 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5000 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
5001 return false;
5002
5003 MI.eraseFromParent();
5004 return true;
5005}
5006
5007static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
5008 int64_t C) {
5009 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
5010 MI.eraseFromParent();
5011 return true;
5012}
5013
// Legalize a workitem-id intrinsic for dimension \p Dim. If the dimension is
// known to have extent 1 the id folds to 0. Otherwise the preloaded value is
// read; for unpacked ids an AssertZext records the known upper bound.
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, C: 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);

  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(Res: DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    // Load into a temporary and assert the number of meaningful low bits so
    // later combines can exploit the known zero high bits.
    Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
    if (!loadInputValue(DstReg: TmpReg, B, ArgType))
      return false;
    B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
  }

  MI.eraseFromParent();
  return true;
}
5053
5054MachinePointerInfo
5055AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
5056 // This isn't really a constant pool but close enough.
5057 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
5058 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
5059 return PtrInfo;
5060}
5061
5062Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
5063 int64_t Offset) const {
5064 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
5065 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
5066
5067 // TODO: If we passed in the base kernel offset we could have a better
5068 // alignment than 4, but we don't really need it.
5069 if (!loadInputValue(DstReg: KernArgReg, B,
5070 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5071 llvm_unreachable("failed to find kernarg segment ptr");
5072
5073 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
5074 return B.buildObjectPtrOffset(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
5075}
5076
/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
///
/// \param Offset byte offset of the parameter within the kernarg segment.
/// \param Alignment currently unused; the load is emitted with align 4.
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  Register DstReg = MI.getOperand(i: 0).getReg();

  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
         "unexpected kernarg parameter type");

  // Kernel arguments are constant for the duration of the kernel, so the
  // load is dereferenceable and invariant.
  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
  B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment: Align(4),
              MMOFlags: MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}
5096
5097bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
5098 MachineRegisterInfo &MRI,
5099 MachineIRBuilder &B) const {
5100 Register Dst = MI.getOperand(i: 0).getReg();
5101 LLT DstTy = MRI.getType(Reg: Dst);
5102 LLT S16 = LLT::scalar(SizeInBits: 16);
5103 LLT S32 = LLT::scalar(SizeInBits: 32);
5104 LLT S64 = LLT::scalar(SizeInBits: 64);
5105
5106 if (DstTy == S16)
5107 return legalizeFDIV16(MI, MRI, B);
5108 if (DstTy == S32)
5109 return legalizeFDIV32(MI, MRI, B);
5110 if (DstTy == S64)
5111 return legalizeFDIV64(MI, MRI, B);
5112
5113 return false;
5114}
5115
// Expand 32-bit unsigned division/remainder of \p X by \p Y. Either of
// \p DstDivReg / \p DstRemReg may be a null Register, in which case that
// result is not produced.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y). The estimate is computed in float via the
  // hardware reciprocal, scaled by 2^32 - 512 (0x4f7ffffe) to keep it a
  // slight underestimate, then converted back to an integer.
  auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
  auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
  auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
  auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
  auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);

  // One round of UNR (unsigned Newton-Raphson): z += z * mulhi(z, -y * z).
  auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
  auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
  Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));

  // Quotient/remainder estimate. The estimate may be up to 2 too small, so
  // two correction rounds follow.
  auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
  auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(Res: S32, Val: 1);
  auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
  if (DstDivReg)
    Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
  R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
  if (DstDivReg)
    B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);

  if (DstRemReg)
    B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
}
5158
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  // Split the 64-bit value into its low and high halves.
  auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);

  auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
  auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));

  // Recombine the halves as a float: hi * 2^32 + lo.
  auto Mad = B.buildFMAD(
      Dst: S32, Src0: CvtHi, // 2**32
      Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);

  auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
  // Scale by 0x5f7ffffc (just under 2^64) so the reciprocal is expressed as
  // a 64-bit fixed-point value.
  auto Mul1 = B.buildFMul(
      Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 = B.buildFMul(
      Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);

  // -(2**32): subtract the high part back out to isolate the low 32 bits.
  auto Mad2 = B.buildFMAD(
      Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
      Src2: Mul1);

  auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
  auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);

  return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
}
5203
// Expand 64-bit unsigned division/remainder of \p Numer by \p Denom. Either
// of \p DstDivReg / \p DstRemReg may be a null Register, in which case that
// result is not produced. The expansion refines a float-based reciprocal
// estimate with two Newton-Raphson rounds carried out in 64-bit integer
// arithmetic (built from 32-bit add/sub-with-carry chains), then applies up
// to two final quotient/remainder corrections.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  Register RcpLo, RcpHi;

  // Initial 64-bit fixed-point reciprocal estimate of the denominator.
  std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);

  auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(Res: S64, Val: 0);
  auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);

  // First Newton-Raphson round: rcp += mulhi(rcp, -denom * rcp).
  auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
  auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);

  // 64-bit add via 32-bit add-with-carry.
  auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
  auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});

  // Second Newton-Raphson round on the refined estimate.
  auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
  auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);

  auto Zero32 = B.buildConstant(Res: S32, Val: 0);
  auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
  auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
  Register NumerLo = UnmergeNumer.getReg(Idx: 0);
  Register NumerHi = UnmergeNumer.getReg(Idx: 1);

  // Quotient estimate and first remainder: r = numer - denom * q.
  auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
  auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
  Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
  auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
  // Mid value without the borrow, used for the chained subtractions below.
  auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
  Register DenomLo = UnmergeDenom.getReg(Idx: 0);
  Register DenomHi = UnmergeDenom.getReg(Idx: 1);

  // C3 = -1 if remainder >= denom (64-bit compare assembled from halves).
  auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
  auto C1 = B.buildSExt(Res: S32, Op: CmpHi);

  auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
  auto C2 = B.buildSExt(Res: S32, Op: CmpLo);

  auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
  auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...  (first correction: q + 1, r - denom)
  auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
  auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
  auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
  auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(Res: S64, Val: 1);
  auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);

  // C6 = -1 if the once-corrected remainder is still >= denom.
  auto C4 =
      B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
  auto C5 =
      B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
  auto C6 = B.buildSelect(
      Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);

  // if (C6 != 0)  (second correction: q + 2, r - 2*denom)
  auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
  auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);

  auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
  auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
  auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient/remainder based on how many corrections fired.
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
    B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
                  Op0: Sel1, Op1: MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
    B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
                  Op0: Sel2, Op1: Sub1);
  }
}
5315
5316bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5317 MachineRegisterInfo &MRI,
5318 MachineIRBuilder &B) const {
5319 Register DstDivReg, DstRemReg;
5320 switch (MI.getOpcode()) {
5321 default:
5322 llvm_unreachable("Unexpected opcode!");
5323 case AMDGPU::G_UDIV: {
5324 DstDivReg = MI.getOperand(i: 0).getReg();
5325 break;
5326 }
5327 case AMDGPU::G_UREM: {
5328 DstRemReg = MI.getOperand(i: 0).getReg();
5329 break;
5330 }
5331 case AMDGPU::G_UDIVREM: {
5332 DstDivReg = MI.getOperand(i: 0).getReg();
5333 DstRemReg = MI.getOperand(i: 1).getReg();
5334 break;
5335 }
5336 }
5337
5338 const LLT S64 = LLT::scalar(SizeInBits: 64);
5339 const LLT S32 = LLT::scalar(SizeInBits: 32);
5340 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5341 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
5342 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5343 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5344
5345 if (Ty == S32)
5346 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
5347 else if (Ty == S64)
5348 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
5349 else
5350 return false;
5351
5352 MI.eraseFromParent();
5353 return true;
5354}
5355
// Expand signed division/remainder by taking absolute values, performing the
// unsigned expansion, and restoring the signs: the quotient's sign is the
// xor of the operand signs, the remainder takes the sign of the dividend.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();

  // sign = x >> (bits - 1): 0 for non-negative, -1 for negative.
  auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
  auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);

  // abs(x) = (x + sign) ^ sign.
  LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
  RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);

  LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
  RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);

  // Select the result registers by opcode; temporaries hold the unsigned
  // results before sign restoration.
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(i: 0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(i: 0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(i: 0).getReg();
    DstRemReg = MI.getOperand(i: 1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);

  // Restore signs: negate-if-needed via (x ^ sign) - sign.
  if (DstDivReg) {
    auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
    auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
    B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
    B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
  }

  MI.eraseFromParent();
  return true;
}
5423
// Lower fdiv to the fast hardware reciprocal when fast-math flags permit:
// 1/x and -1/x fold directly to rcp (f16 always; other types need afn),
// and the general case x/y becomes x * rcp(y).
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Reg: Res);

  bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);

  if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
    // For a constant +/-1.0 numerator, f16 rcp is accurate enough to use
    // unconditionally; other types require the afn flag.
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
      return false;

    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    // the CI documentation has a worst case error of 1 ulp.
    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    // use it as long as we aren't trying to use denormals.
    //
    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(V: 1.0)) {
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
          .addUse(RegNo: RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(V: -1.0)) {
      auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
      B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
          .addUse(RegNo: FNeg.getReg(Idx: 0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
                              !MI.getFlag(Flag: MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
                 .addUse(RegNo: RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);

  MI.eraseFromParent();
  return true;
}
5483
// Lower f64 fdiv to a reciprocal refined by Newton-Raphson when the afn
// fast-math flag permits inaccurate results: r = rcp(y) is refined twice
// (r = r + r*(1 - y*r)), then x/y = x*r plus one final residual correction.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  Register Y = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Reg: Res);

  bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
  auto One = B.buildFConstant(Res: ResTy, Val: 1.0);

  auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
               .addUse(RegNo: Y)
               .setMIFlags(Flags);

  // Two Newton-Raphson iterations: tmp = 1 - y*r; r = r + tmp*r.
  auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
  R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);

  auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
  R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);

  // Final correction on the product: res = ret + (x - y*ret)*r.
  auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
  auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);

  B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
  MI.eraseFromParent();
  return true;
}
5518
// Legalize f16 G_FDIV. First try the cheap rcp-based expansion; otherwise
// compute the quotient in f32 precision and produce the final f16 value with
// V_DIV_FIXUP_F16, following the hardware algorithm documented below.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(SizeInBits: 16);
  LLT S32 = LLT::scalar(SizeInBits: 32);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // Promote both operands to f32 and form the initial quotient estimate
  // q = n * rcp(d).
  auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
  auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
  auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
  auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
                 .addUse(RegNo: RHSExt.getReg(Idx: 0))
                 .setMIFlags(Flags);
  auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
  MachineInstrBuilder Err;
  // Refinement steps (err = n - d*q; q += err*rcp). Use MAD on subtargets
  // that have the mad/mac f32 instructions, FMA elsewhere.
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
    Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
    Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
  } else {
    Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
    Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
    Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
  }
  // Final correction: keep only the sign and exponent of the scaled error
  // term (0xff800000 clears the 23 f32 mantissa bits) before adding it back.
  auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
  Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
  Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
  auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
  // V_DIV_FIXUP_F16 produces the final result from the estimate and the
  // original operands (the "q = touchup(q, d, n)" step above).
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RegNo: RDst.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5577
// Encoding of the FP32 (single-precision) denormal-mode bit field within the
// MODE hardware register (offset 4, width 2). Used with S_GETREG/S_SETREG to
// save, restore, and toggle SP denormal handling around the FDIV expansions.
static constexpr unsigned SPDenormModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
5580
5581// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5582// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5583static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5584 const GCNSubtarget &ST,
5585 SIModeRegisterDefaults Mode) {
5586 // Set SP denorm mode to this value.
5587 unsigned SPDenormMode =
5588 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5589
5590 if (ST.hasDenormModeInst()) {
5591 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5592 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5593
5594 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5595 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
5596 .addImm(Val: NewDenormModeValue);
5597
5598 } else {
5599 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
5600 .addImm(Val: SPDenormMode)
5601 .addImm(Val: SPDenormModeBitField);
5602 }
5603}
5604
// Legalize f32 G_FDIV with full precision using the
// div_scale / div_fmas / div_fixup sequence. The FMA refinement is run with
// FP32 denormals enabled: when the function's default mode flushes denormals,
// the mode is toggled around the core sequence (and, for a dynamic denormal
// mode, saved and restored via S_GETREG/S_SETREG).
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based expansion when fast-math flags permit it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  auto One = B.buildFConstant(Res: S32, Val: 1.0f);

  // div_scale produces the scaled denominator (imm 0) and numerator (imm 1);
  // the extra S1 result of the numerator form feeds div_fmas below.
  auto DenominatorScaled =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
          .addUse(RegNo: LHS)
          .addUse(RegNo: RHS)
          .addImm(Val: 0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
          .addUse(RegNo: LHS)
          .addUse(RegNo: RHS)
          .addImm(Val: 1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
                       .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);

  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    // With a dynamic denorm mode the value to restore is unknown at compile
    // time, so save the current SP denorm field before enabling denormals.
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
      B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
          .addDef(RegNo: SavedSPDenormMode)
          .addImm(Val: SPDenormModeBitField);
    }
    toggleSPDenormMode(Enable: true, B, ST, Mode);
  }

  // Newton-Raphson refinement of the reciprocal and quotient.
  auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
  auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
  auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
  auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
  auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);

  // Restore the previous FP32 denormal mode.
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
          .addReg(RegNo: SavedSPDenormMode)
          .addImm(Val: SPDenormModeBitField);
    } else
      toggleSPDenormMode(Enable: false, B, ST, Mode);
  }

  // div_fmas consumes the scale condition bit; div_fixup patches up
  // special-case operands and produces the final quotient.
  auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
                  .addUse(RegNo: Fma4.getReg(Idx: 0))
                  .addUse(RegNo: Fma1.getReg(Idx: 0))
                  .addUse(RegNo: Fma3.getReg(Idx: 0))
                  .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RegNo: Fmas.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5691
// Legalize f64 G_FDIV with full precision: scale the operands with
// div_scale, refine the rcp estimate with FMA-based Newton-Raphson steps,
// then combine with div_fmas / div_fixup. Tries the fast rcp expansion first.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(i: 0).getReg();
  Register LHS = MI.getOperand(i: 1).getReg();
  Register RHS = MI.getOperand(i: 2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(SizeInBits: 64);
  LLT S1 = LLT::scalar(SizeInBits: 1);

  auto One = B.buildFConstant(Res: S64, Val: 1.0);

  // Scaled denominator (imm 0) and numerator (imm 1).
  auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
                       .addUse(RegNo: LHS)
                       .addUse(RegNo: RHS)
                       .addImm(Val: 0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);

  auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
                 .addUse(RegNo: DivScale0.getReg(Idx: 0))
                 .setMIFlags(Flags);

  // Refine the reciprocal: e = 1 - d*r; r = r + r*e (and a second error term).
  auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
  auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
  auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);

  auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
                       .addUse(RegNo: LHS)
                       .addUse(RegNo: RHS)
                       .addImm(Val: 1)
                       .setMIFlags(Flags);

  // Quotient estimate and its residual against the scaled numerator.
  auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
  auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
  auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(SizeInBits: 32);

    // Reconstruct the scale flag by comparing the high 32-bit words of the
    // original operands with those of the div_scale results.
    auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
    auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
    auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);

    auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
                              Op1: Scale1Unmerge.getReg(Idx: 1));
    auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
                              Op1: Scale0Unmerge.getReg(Idx: 1));
    Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
  } else {
    // Use div_scale's own condition output directly.
    Scale = DivScale1.getReg(Idx: 1);
  }

  auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
                  .addUse(RegNo: Fma4.getReg(Idx: 0))
                  .addUse(RegNo: Fma3.getReg(Idx: 0))
                  .addUse(RegNo: Mul.getReg(Idx: 0))
                  .addUse(RegNo: Scale)
                  .setMIFlags(Flags);

  // Final fixup of special-case operands.
  B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
      .addUse(RegNo: Fmas.getReg(Idx: 0))
      .addUse(RegNo: RHS)
      .addUse(RegNo: LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5772
// Legalize G_FFREXP (split a float into mantissa and exponent) onto the
// amdgcn frexp_mant / frexp_exp intrinsics. The exponent is produced at the
// instruction's natural width (i16 for f16, i32 otherwise) and then
// sign-extended or truncated to the destination type.
bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res0 = MI.getOperand(i: 0).getReg();
  Register Res1 = MI.getOperand(i: 1).getReg();
  Register Val = MI.getOperand(i: 2).getReg();
  uint16_t Flags = MI.getFlags();

  LLT Ty = MRI.getType(Reg: Res0);
  LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);

  auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
                  .addUse(RegNo: Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
                 .addUse(RegNo: Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    // On subtargets with the fract bug, patch up non-finite inputs: return
    // the input itself as the mantissa and 0 as the exponent.
    auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
    auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
    auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
    Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
    Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
  }

  B.buildCopy(Res: Res0, Op: Mant);
  B.buildSExtOrTrunc(Res: Res1, Op: Exp);

  MI.eraseFromParent();
  return true;
}
5807
5808bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5809 MachineRegisterInfo &MRI,
5810 MachineIRBuilder &B) const {
5811 Register Res = MI.getOperand(i: 0).getReg();
5812 Register LHS = MI.getOperand(i: 2).getReg();
5813 Register RHS = MI.getOperand(i: 3).getReg();
5814 uint16_t Flags = MI.getFlags();
5815
5816 LLT S32 = LLT::scalar(SizeInBits: 32);
5817 LLT S1 = LLT::scalar(SizeInBits: 1);
5818
5819 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5820 const APFloat C0Val(1.0f);
5821
5822 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5823 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5824 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5825
5826 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5827 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5828
5829 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5830
5831 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5832 .addUse(RegNo: Mul0.getReg(Idx: 0))
5833 .setMIFlags(Flags);
5834
5835 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5836
5837 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5838
5839 MI.eraseFromParent();
5840 return true;
5841}
5842
5843bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5844 MachineRegisterInfo &MRI,
5845 MachineIRBuilder &B) const {
5846 // Bypass the correct expansion a standard promotion through G_FSQRT would
5847 // get. The f32 op is accurate enough for the f16 cas.
5848 unsigned Flags = MI.getFlags();
5849 assert(!ST.has16BitInsts());
5850 const LLT F32 = LLT::scalar(SizeInBits: 32);
5851 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5852 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5853 .addUse(RegNo: Ext.getReg(Idx: 0))
5854 .setMIFlags(Flags);
5855 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5856 MI.eraseFromParent();
5857 return true;
5858}
5859
// Legalize f32 G_FSQRT. With approximate-function semantics the hardware
// sqrt instruction is used directly. Otherwise inputs below 2^-96 are scaled
// up by 2^32 (and the result scaled back down by 2^-16, since sqrt halves
// the exponent). When denormal handling is required the hardware result is
// corrected by at most one ulp by testing the two neighboring values;
// otherwise an rsq-based refinement sequence is used.
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(i: 0).getReg();
  Register X = MI.getOperand(i: 1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT F32 = LLT::scalar(SizeInBits: 32);
  const LLT I32 = LLT::scalar(SizeInBits: 32);

  // Fast path: the hardware instruction alone is acceptable under afn.
  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
        .addUse(RegNo: X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // Scale small inputs (x < 2^-96) up by 2^32 before the sqrt.
  auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
  auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
  auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
  if (needsDenormHandlingF32(MF, Src: X, Flags)) {
    // Start from the hardware estimate s and nudge it by one ulp if a
    // neighboring value fits the residual test better. Adding/subtracting 1
    // on the bit pattern steps to the adjacent float value (the sqrt result
    // is non-negative here).
    B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
        .addUse(RegNo: SqrtX.getReg(Idx: 0))
        .setMIFlags(Flags);

    auto NegOne = B.buildConstant(Res: I32, Val: -1);
    auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);

    // Residual of the next-lower candidate: x - nextDown * s.
    auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);

    auto PosOne = B.buildConstant(Res: I32, Val: 1);
    auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);

    // Residual of the next-higher candidate: x - nextUp * s.
    auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);

    auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);

    // Choose next-down when its residual is <= 0, else next-up when its
    // residual is > 0, otherwise keep the hardware estimate.
    SqrtS =
        B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);

    auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
    SqrtS =
        B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
  } else {
    // No denormal concerns: refine s = x * rsq(x) with FMA-based correction
    // steps using h = rsq(x) / 2.
    auto SqrtR =
        B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
    B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);

    auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
    auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
    auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
    auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
    SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
    SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
    auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
    auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
    SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
  }

  // Undo the input scaling on the result (2^-16 == sqrt(2^-32)).
  auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);

  // sqrt(+-0) = +-0 and sqrt(+inf) = +inf: pass the input through unchanged.
  auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
  B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}
5940
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT F64 = LLT::scalar(SizeInBits: 64);

  Register Dst = MI.getOperand(i: 0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(i: 1).getReg();
  unsigned Flags = MI.getFlags();

  Register SqrtX = X;
  Register Scaling, ZeroInt;
  // Without 'afn', inputs below 2^-767 are scaled up by 2^256 so the rsq
  // estimate stays accurate; the result is compensated by ldexp(-128) below
  // (sqrt halves the exponent).
  if (!MI.getFlag(Flag: MachineInstr::FmAfn)) {
    auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);

    ZeroInt = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant).getReg(Idx: 0);

    // Scale up input if it is too small.
    auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
    auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
    SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags).getReg(Idx: 0);
  }

  // y0 = rsq(x), then the first Goldschmidt iteration from the comment above.
  auto SqrtY = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX);

  auto Half = B.buildFConstant(Res: F64, Val: 0.5);
  auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
  auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);

  auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
  auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);

  auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
  auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);

  // Second iteration: d0 = x - g1*g1; g2 = d0*h1 + g1.
  auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
  auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);

  auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);

  Register SqrtRet = SqrtS2.getReg(Idx: 0);
  // Without 'afn', perform the final refinement step and undo the input
  // scaling.
  if (!MI.getFlag(Flag: MachineInstr::FmAfn)) {
    auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
    auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
    auto SqrtD2 = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);

    // Scale down the result.
    auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
    auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
    SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtD2, Src1: ScaleDown, Flags).getReg(Idx: 0);
  }

  Register IsZeroOrInf;
  if (MI.getFlag(Flag: MachineInstr::FmNoInfs)) {
    // With no-infs only zero needs the pass-through below; a plain compare
    // against 0.0 is cheaper than is.fpclass.
    auto ZeroFP = B.buildFConstant(Res: F64, Val: 0.0);
    IsZeroOrInf = B.buildFCmp(Pred: FCmpInst::FCMP_OEQ, Res: S1, Op0: SqrtX, Op1: ZeroFP).getReg(Idx: 0);
  } else {
    IsZeroOrInf = B.buildIsFPClass(Res: S1, Src: SqrtX, Mask: fcZero | fcPosInf).getReg(Idx: 0);
  }

  // TODO: Check for DAZ and expand to subnormals

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
6033
6034bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
6035 MachineRegisterInfo &MRI,
6036 MachineIRBuilder &B) const {
6037 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
6038 if (Ty == LLT::scalar(SizeInBits: 32))
6039 return legalizeFSQRTF32(MI, MRI, B);
6040 if (Ty == LLT::scalar(SizeInBits: 64))
6041 return legalizeFSQRTF64(MI, MRI, B);
6042 if (Ty == LLT::scalar(SizeInBits: 16))
6043 return legalizeFSQRTF16(MI, MRI, B);
6044 return false;
6045}
6046
6047// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6048// FIXME: Why do we handle this one but not other removed instructions?
6049//
6050// Reciprocal square root. The clamp prevents infinite results, clamping
6051// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6052// +-max_float.
// Expand llvm.amdgcn.rsq.clamp as rsq followed by clamping the result to
// [-max_float, +max_float], using the IEEE or non-IEEE min/max variants
// depending on the function's mode.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Pre-VI subtargets: nothing to do (reporting the intrinsic as legal;
  // see the comment above this function).
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(i: 0).getReg();
  Register Src = MI.getOperand(i: 2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Reg: Dst);

  // Only f32 and f64 are handled.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(SizeInBits: 32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(SizeInBits: 64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
                 .addUse(RegNo: Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp the rsq result to +max_float from above, then -max_float from
  // below.
  auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
                            B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
6095
6096// TODO: Fix pointer type handling
// Legalize lane intrinsics (readlane/readfirstlane/writelane, permlane
// variants, set.inactive, DPP ops) whose value type is not directly
// supported: the value is split into 32-bit pieces (or 64-bit pieces for
// update.dpp on subtargets with a 64-bit DPP ALU and a legal DPP control),
// one lane op is emitted per piece, and the pieces are merged back.
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                         MachineInstr &MI,
                                         Intrinsic::ID IID) const {

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  // Emit a single piece-sized lane op, forwarding whatever extra operands
  // the particular intrinsic expects.
  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(Idx: 0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // permlane16 carries two extra register sources and two immediates
      // (fi, bound_ctrl) taken directly from the original instruction.
      Register Src3 = MI.getOperand(i: 5).getReg();
      int64_t Src4 = MI.getOperand(i: 6).getImm();
      int64_t Src5 = MI.getOperand(i: 7).getImm();
      return LaneOp.addUse(RegNo: Src1)
          .addUse(RegNo: Src2)
          .addUse(RegNo: Src3)
          .addImm(Val: Src4)
          .addImm(Val: Src5)
          .getReg(Idx: 0);
    }
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
    case Intrinsic::amdgcn_update_dpp:
      // update.dpp: old value plus four DPP control immediates.
      return LaneOp.addUse(RegNo: Src1)
          .addImm(Val: MI.getOperand(i: 4).getImm())
          .addImm(Val: MI.getOperand(i: 5).getImm())
          .addImm(Val: MI.getOperand(i: 6).getImm())
          .addImm(Val: MI.getOperand(i: 7).getImm())
          .getReg(Idx: 0);
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(i: 0).getReg();
  Register Src0 = MI.getOperand(i: 2).getReg();
  // Collect the extra register operands used by this intrinsic, if any.
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(i: 3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(i: 4).getReg();
    }
  }

  LLT Ty = MRI.getType(Reg: DstReg);
  unsigned Size = Ty.getSizeInBits();

  // Piece width: 64-bit pieces are only usable for update.dpp on subtargets
  // with a 64-bit DPP ALU and a DPP control legal for it.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(ST, DC: MI.getOperand(i: 4).getImm()))
    SplitSize = 64;

  if (Size == SplitSize) {
    // Already legal
    return true;
  }

  // Sub-32-bit values: widen with anyext, operate at 32 bits, truncate back.
  if (Size < 32) {
    Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(Res: DstReg, Op: LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % SplitSize != 0)
    return false;

  // Pick the type of each piece. For vectors, keep element typing when the
  // elements tile the piece exactly; otherwise fall back to scalar pieces
  // plus a final bitcast.
  LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
    } else {
      // Handle all other cases via S32/S64 pieces
      NeedsBitcast = true;
    }
  }

  SmallVector<Register, 4> PartialRes;
  unsigned NumParts = Size / SplitSize;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);

  // One lane op per piece.
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(Idx: i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(Idx: i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(Idx: i);

    PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  // Reassemble the destination value from the pieces.
  if (NeedsBitcast)
    B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
                                 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
  else
    B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);

  MI.eraseFromParent();
  return true;
}
6239
6240bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
6241 MachineRegisterInfo &MRI,
6242 MachineIRBuilder &B) const {
6243 uint64_t Offset =
6244 ST.getTargetLowering()->getImplicitParameterOffset(
6245 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
6246 LLT DstTy = MRI.getType(Reg: DstReg);
6247 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
6248
6249 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
6250 if (!loadInputValue(DstReg: KernargPtrReg, B,
6251 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6252 return false;
6253
6254 B.buildObjectPtrOffset(Res: DstReg, Op0: KernargPtrReg,
6255 Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
6256 return true;
6257}
6258
6259/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6260/// bits of the pointer and replace them with the stride argument, then
6261/// merge_values everything together. In the common case of a raw buffer (the
6262/// stride component is 0), we can just AND off the upper half.
// Build a buffer resource descriptor from a 64-bit pointer plus stride,
// num_records, and flags operands (see the comment above this function for
// the overall scheme).
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(i: 0).getReg();
  Register Pointer = MI.getOperand(i: 2).getReg();
  Register Stride = MI.getOperand(i: 3).getReg();
  Register NumRecords = MI.getOperand(i: 4).getReg();
  Register Flags = MI.getOperand(i: 5).getReg();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S64 = LLT::scalar(SizeInBits: 64);

  // Emit the expansion after the current instruction.
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    Register Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    LLT PtrIntTy = LLT::scalar(SizeInBits: MRI.getType(Reg: Pointer).getSizeInBits());
    auto PointerInt = B.buildPtrToInt(Dst: PtrIntTy, Src: Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(Res: S64, Op: PointerInt);
    auto NumRecordsLHS = B.buildShl(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 57));
    Register LowHalf = B.buildOr(Dst: S64, Src0: ExtPointer, Src1: NumRecordsLHS).getReg(Idx: 0);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    auto NumRecordsRHS = B.buildLShr(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 7));
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: B.buildConstant(Res: S32, Val: 12));
    auto ExtShiftedStride =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedStride.getReg(Idx: 0)});
    auto ShiftedFlags = B.buildShl(Dst: S32, Src0: Flags, Src1: B.buildConstant(Res: S32, Val: 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedFlags.getReg(Idx: 0)});
    auto CombinedFields = B.buildOr(Dst: S64, Src0: NumRecordsRHS, Src1: ExtShiftedStride);
    Register HighHalf =
        B.buildOr(Dst: S64, Src0: CombinedFields, Src1: ExtShiftedFlags).getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, HighHalf});
  } else {
    // Legacy layout: {base_lo, base_hi[15:0] | stride << 16, num_records,
    // flags}. The upper 16 bits of the pointer are replaced by the stride.
    NumRecords = B.buildTrunc(Res: S32, Op: NumRecords).getReg(Idx: 0);
    auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
    auto LowHalf = Unmerge.getReg(Idx: 0);
    auto HighHalf = Unmerge.getReg(Idx: 1);

    auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
    auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
    auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
    auto NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  return true;
}
6319
/// Lower llvm.amdgcn.implicitarg.ptr. In non-entry functions the pointer is a
/// preloaded argument; in entry functions it is materialized directly.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  // Only erase MI on success; on failure legalization is reported as failed.
  MI.eraseFromParent();
  return true;
}
6336
6337bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6338 MachineRegisterInfo &MRI,
6339 MachineIRBuilder &B) const {
6340 Function &F = B.getMF().getFunction();
6341 std::optional<uint32_t> KnownSize =
6342 AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(F);
6343 if (KnownSize.has_value())
6344 B.buildConstant(Res: DstReg, Val: *KnownSize);
6345 return false;
6346}
6347
/// Lower llvm.amdgcn.lds.kernel.id. In non-entry functions the id is a
/// preloaded argument; in entry functions it must come from known metadata
/// (see getLDSKernelId).
bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }

  Register DstReg = MI.getOperand(i: 0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  // Only erase MI on success; on failure legalization is reported as failed.
  MI.eraseFromParent();
  return true;
}
6365
/// Lower llvm.amdgcn.is.shared / llvm.amdgcn.is.private: test whether the
/// high 32 bits of the 64-bit flat pointer (operand 2; operand 1 is the
/// intrinsic ID) fall in the aperture of \p AddrSpace.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  auto Unmerge = B.buildUnmerge(Res: S32, Op: MI.getOperand(i: 2).getReg());
  Register Hi32 = Unmerge.getReg(Idx: 1);

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
      ST.hasGloballyAddressableScratch()) {
    // With globally addressable scratch, private addresses are recognized by
    // comparing against the flat scratch base rather than a fixed aperture.
    Register FlatScratchBaseHi =
        B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
                     SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(Idx: 0);
    MRI.setRegClass(Reg: FlatScratchBaseHi, RC: &AMDGPU::SReg_32RegClass);
    // Test bits 63..58 against the aperture address: XOR < 2^26 holds exactly
    // when the top 6 bits of the high dwords are equal.
    Register XOR = B.buildXor(Dst: S32, Src0: Hi32, Src1: FlatScratchBaseHi).getReg(Idx: 0);
    B.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: MI.getOperand(i: 0), Op0: XOR,
                Op1: B.buildConstant(Res: S32, Val: 1u << 26));
  } else {
    // Otherwise a pointer is in the segment iff its high dword equals the
    // segment's aperture value.
    Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
    B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
  }
  MI.eraseFromParent();
  return true;
}
6392
6393// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6394// offset (the offset that is included in bounds checking and swizzling, to be
6395// split between the instruction's voffset and immoffset fields) and soffset
6396// (the offset that is excluded from bounds checking and swizzling, to go in
6397// the instruction's soffset field). This function takes the first kind of
6398// offset and figures out how to split it between voffset and immoffset.
/// \return {voffset register, immoffset} such that voffset + immoffset equals
/// \p OrigOffset and immoffset fits the instruction's immediate field.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
  // being added, so we can only safely match a 32-bit addition with no unsigned
  // overflow.
  bool CheckNUW = ST.hasGFX1250Insts();
  std::tie(args&: BaseReg, args&: ImmOffset) = AMDGPU::getBaseWithConstantOffset(
      MRI, Reg: OrigOffset, /*KnownBits=*/ValueTracking: nullptr, CheckNUW);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(Reg: BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Negative overflow: move the whole constant into the vgpr side instead.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
    } else {
      auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
      BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
    }
  }

  // The voffset operand is not optional; materialize a zero if needed.
  if (!BaseReg)
    BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);

  return std::pair(BaseReg, ImmOffset);
}
6448
6449/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each s16 element occupies the low half of its own
    // 32-bit register.
    auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
        .getReg(Idx: 0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Workaround: widen the packed data to a whole number of dwords, padding
    // with undef, so the store uses the buggy subtarget's expected layout.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
      PackedRegs.push_back(Elt: Reg);
      PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
      return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
          .getReg(Idx: 0);
    }

    if (StoreVT.getNumElements() == 3) {
      // v3s16 is not a whole number of dwords; pad to v6s16 then view as v3s32.
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
      PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
      Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
      return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
      auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
      PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
      return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
          .getReg(Idx: 0);
    }

    llvm_unreachable("invalid data type");
  }

  if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
    // Round an odd element count up to a whole number of dwords.
    Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
        .getReg(Idx: 0);
  }
  return Reg;
}
6512
/// Legalize the value operand of a buffer store: cast rsrc types, bitcast
/// awkward element types, widen sub-dword scalars, and repack d16 data.
/// \return the (possibly rewritten) value register to store.
Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
                                                 Register VData, LLT MemTy,
                                                 bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(Reg: VData);

  const LLT S16 = LLT::scalar(SizeInBits: 16);

  // Fixup buffer resources themselves needing to be v4i128.
  if (hasBufferRsrcWorkaround(Ty))
    return castBufferRsrcToV4I32(Pointer: VData, B);

  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
  }
  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      // Repack 16-bit vector data for format stores on subtargets that need
      // a different register layout (see handleD16VData).
      if (IsFormat)
        return handleD16VData(B, MRI&: *MRI, Reg: VData);
    }
  }

  return VData;
}
6544
/// Lower raw/struct buffer store intrinsics (optionally typed or format) to
/// the generic G_AMDGPU_BUFFER_STORE* pseudos with split voffset/immoffset.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              LegalizerHelper &Helper,
                                              bool IsTyped,
                                              bool IsFormat) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VData = MI.getOperand(i: 1).getReg();
  LLT Ty = MRI.getType(Reg: VData);
  LLT EltTy = Ty.getScalarType();
  // d16 only applies to format stores of 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();
  LLT MemTy = MMO->getMemoryType();

  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);

  castBufferRsrcArgToV4I32(MI, B, Idx: 2);
  Register RSrc = MI.getOperand(i: 2).getReg();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex operand; the pseudo still needs one.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();

  // Split the offset into the parts the MUBUF immediate field can and cannot
  // hold.
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Plain stores pick a width from the memory size (sub-dword cases use the
    // byte/short pseudos).
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opcode: Opc)
    .addUse(RegNo: VData)              // vdata
    .addUse(RegNo: RSrc)               // rsrc
    .addUse(RegNo: VIndex)             // vindex
    .addUse(RegNo: VOffset)            // voffset
    .addUse(RegNo: SOffset)            // soffset
    .addImm(Val: ImmOffset);          // offset(imm)

  if (IsTyped)
    MIB.addImm(Val: Format);

  MIB.addImm(Val: AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6635
/// Emit one generic buffer-load pseudo \p Opc with the standard operand
/// order (vdata def, rsrc, vindex, voffset, soffset, imm offset,
/// [format], aux, idxen, MMO).
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opcode: Opc)
    .addDef(RegNo: LoadDstReg)         // vdata
    .addUse(RegNo: RSrc)               // rsrc
    .addUse(RegNo: VIndex)             // vindex
    .addUse(RegNo: VOffset)            // voffset
    .addUse(RegNo: SOffset)            // soffset
    .addImm(Val: ImmOffset);          // offset(imm)

  // Only the typed (tbuffer) variants carry a format immediate.
  if (IsTyped)
    MIB.addImm(Val: Format);

  MIB.addImm(Val: AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);
}
6656
/// Lower raw/struct buffer load intrinsics (optionally typed/format, with or
/// without TFE status result) to the generic G_AMDGPU_BUFFER_LOAD* pseudos,
/// repacking the result where the register layout differs from the IR type.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  Register Dst = MI.getOperand(i: 0).getReg();

  // TFE variants define a second result carrying the load status.
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(i: 1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
  Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex; the pseudo still needs a zero.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Reg: Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Observer.changingInstr(MI);
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(i: 0).getReg();
    // The cast may have moved the insert point; re-anchor before MI.
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Observer.changingInstr(MI);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    // Plain loads pick a width from the memory type.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // TFE loads return the value dwords plus one trailing status dword; load
    // into a wide temporary and unmerge the pieces.
    unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
      B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
      B.buildTrunc(Res: Dst, Op: ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
      LoadElts.push_back(Elt: StatusDst);
      B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
      // Drop the status element again; the rest forms the value result.
      LoadElts.truncate(N: NumValueDWords);
      B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Sub-dword result: load into an s32 and truncate after MI.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: Dst, Op: LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked d16: each element arrives in its own dword; repack to s16s.
    LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
    B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
  } else {
    buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
6817
/// Map a raw/struct (and *_ptr) buffer atomic intrinsic ID to the
/// corresponding generic G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode. All four
/// intrinsic flavors of each operation share one pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
6914
/// Lower raw/struct buffer atomic intrinsics to the generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudos, splitting the offset and normalizing the
/// vindex operand.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  // cmpswap carries one extra source operand (the compare value).
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(i: 0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself
  Register VData = MI.getOperand(i: 2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(i: 3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
  Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex; the pseudo still needs a zero.
    VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
      .addDef(RegNo: Dst)
      .addUse(RegNo: VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(RegNo: CmpVal);

  MIB.addUse(RegNo: RSrc)               // rsrc
     .addUse(RegNo: VIndex)             // vindex
     .addUse(RegNo: VOffset)            // voffset
     .addUse(RegNo: SOffset)            // soffset
     .addImm(Val: ImmOffset)           // offset(imm)
     .addImm(Val: AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
     .addImm(Val: HasVIndex ? -1 : 0)  // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6979
6980/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6981/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // First condition group: this operand stays unpacked (it is 32-bit data
    // in the current mode), so it contributes one whole dword.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(Reg: AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
                .getReg(Idx: 0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
        PackedAddrs.push_back(Elt: AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
                .getReg(Idx: 0));
      } else {
        // Pack this operand with the next one into a single v2s16 and skip
        // the consumed neighbor.
        PackedAddrs.push_back(
            Elt: B.buildBuildVector(
                 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
                 .getReg(Idx: 0));
        ++I;
      }
    }
  }
}
7040
7041/// Convert from separate vaddr components to a single vector address register,
7042/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  (void)S32; // Only needed for the assert below in asserts builds.
  // Collect the still-present (register) address components; some may already
  // have been dropped and replaced by non-register operands.
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(Elt: SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Replace the first address operand with the packed vector; a single
    // component needs no packing.
    auto VAddr =
        B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
    MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
  }

  // Null out the remaining address operands ($noreg) now that their values
  // live in the packed vector.
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
7069
7070/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7071///
7072/// Depending on the subtarget, load/store with 16-bit element data need to be
7073/// rewritten to use the low half of 32-bit registers, or directly use a packed
7074/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7075/// registers.
7076///
7077/// We don't want to directly select image instructions just yet, but also want
7078/// to exposes all register repacking to the legalizer/combiners. We also don't
7079/// want a selected instruction entering RegBankSelect. In order to avoid
7080/// defining a multitude of intermediate image instructions, directly hack on
7081/// the intrinsic's arguments. In cases like a16 addresses, this requires
7082/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  // Two explicit defs means the TFE status word is returned alongside the
  // data.
  const unsigned NumDefs = MI.getNumExplicitDefs();
  // Skip past the defs and the intrinsic ID operand to reach the arguments.
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);

  unsigned DMask = 0;
  Register VData;
  LLT Ty;

  // The data register: the first def for returning operations, otherwise
  // operand 1 (the store source).
  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
    VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(Reg: VData);
  }

  // Packed 16-bit atomics keep their vector data layout, so they must not be
  // treated as D16 below.
  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      // Gather4 always returns 4 components regardless of dmask.
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(Value: DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(Res: MI.getOperand(i: 0));
      MI.eraseFromParent();
      return true;
    }
  }

  // Every mutation below happens between changingInstr and changedInstr; the
  // scope guard fires changedInstr on all early returns.
  Observer.changingInstr(MI);
  scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  else if (BaseOpcode->NoReturn)
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(Opcode: NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(i: 2).getReg();
    LLT Ty = MRI->getType(Reg: VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(i: 3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
      auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
      MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
      // The second data operand is now unused.
      MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  // Subtarget limits for the non-sequential address (NSA) encoding.
  const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
    // instructions expect VGPR_32
    SmallVector<Register, 4> PackedRegs;

    packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);

    // See also below in the non-a16 branch
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // Pack registers that would go over NSAMaxSize into last VAddr register
      LLT PackedAddrTy =
          LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
      auto Concat = B.buildConcatVectors(
          Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
      PackedRegs.resize(N: NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      // Non-NSA: all address components collapse into one vector register.
      LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
      auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
      PackedRegs[0] = Concat.getReg(Idx: 0);
      PackedRegs.resize(N: 1);
    }

    // Rewrite the vaddr operands: the first NumPacked slots take the packed
    // registers, and any remaining slots become $noreg.
    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // Partial NSA is allowed on GFX11+ where the final register is a contiguous
    // set of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      // Only the overflow beyond NSAMaxSize gets packed into the last slot.
      convertImageAddrToPacked(B, MI,
                               DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      // Non-NSA: pack all address components into one vector register.
      convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
                               NumVAddrs: Intr->NumVAddrs);
    }
  }

  // Append an immediate recording the a16/g16 state as the final operand so
  // later passes don't have to recompute it from the types.
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));

  if (BaseOpcode->NoReturn) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    // D16 vector stores may need their data repacked for the subtarget's
    // register layout.
    Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
    if (RepackedReg != VData) {
      MI.getOperand(i: 1).setReg(RepackedReg);
    }

    return true;
  }

  // Everything below repacks the load result into the expected return type.
  Register DstReg = MI.getOperand(i: 0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomic instructions are using DMask to specify how many bits
  // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
  // DMaskLanes for image atomic has default value '0'.
  // We must be sure that atomic variants (especially packed) will not be
  // truncated from v2s16 or v4s16 to s16 type.
  //
  // ChangeElementCount will be needed for image load where Ty is always scalar.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    // Unpacked D16: one 32-bit register per 16-bit element.
    RoundedTy =
        LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
    TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
    RegTy = S32;
  } else {
    // Packed layout: round the total size up to a whole number of dwords.
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
    TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);

  MI.getOperand(i: 0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(i: 1).getReg();
    if (MRI->getType(Reg: Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(OpNo: 1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
    B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(N: NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
    }
  }

  // Append NumElts copies of an implicit-def of Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Elt: Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
        NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
      } else {
        return false;
      }
    }

    // Trim or pad the result vector to match the destination element count.
    if (MRI->getType(Reg: DstReg).getNumElements() <
        MRI->getType(Reg: NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
  return true;
}
7481
bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(i: 0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(Reg: OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
    // destination register.
    Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
    // The cast may have moved the insert point; restore it to before MI.
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  // Rewrite result types that should be loaded as plain integers (see
  // shouldBitcastLoadStoreType) via a dst bitcast.
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opcode: Opc));
  MI.removeOperand(OpNo: 1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: MemSize, BaseAlignment: MemAlign);
  MI.addMemOperand(MF, MO: MMO);
  // For the subword case, truncate the 32-bit load result back down to the
  // original narrow destination.
  if (Dst != OrigDst) {
    MI.getOperand(i: 0).setReg(Dst);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: OrigDst, Op: Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
    else
      Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
  }

  Observer.changedInstr(MI);
  return true;
}
7553
7554bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7555 MachineInstr &MI) const {
7556 MachineIRBuilder &B = Helper.MIRBuilder;
7557 GISelChangeObserver &Observer = Helper.Observer;
7558 Observer.changingInstr(MI);
7559 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7560 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
7561 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
7562 Observer.changedInstr(MI);
7563 return true;
7564}
7565
7566// TODO: Move to selection
7567bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7568 MachineRegisterInfo &MRI,
7569 MachineIRBuilder &B) const {
7570 if (!ST.hasTrapHandler() ||
7571 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7572 return legalizeTrapEndpgm(MI, MRI, B);
7573
7574 return ST.supportsGetDoorbellID() ?
7575 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7576}
7577
7578bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7579 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7580 const DebugLoc &DL = MI.getDebugLoc();
7581 MachineBasicBlock &BB = B.getMBB();
7582 MachineFunction *MF = BB.getParent();
7583
7584 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
7585 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
7586 .addImm(Val: 0);
7587 MI.eraseFromParent();
7588 return true;
7589 }
7590
7591 // We need a block split to make the real endpgm a terminator. We also don't
7592 // want to break phis in successor blocks, so we can't just delete to the
7593 // end of the block.
7594 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
7595 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7596 MF->push_back(MBB: TrapBB);
7597 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
7598 .addImm(Val: 0);
7599 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
7600 .addMBB(MBB: TrapBB);
7601
7602 BB.addSuccessor(Succ: TrapBB);
7603 MI.eraseFromParent();
7604 return true;
7605}
7606
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(SizeInBits: 64);

  // The trap handler ABI takes the queue pointer in the SGPR0:SGPR1 pair (see
  // the reference link below).
  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));

    if (!loadInputValue(DstReg: KernargPtrReg, B,
                        ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo: PtrInfo.getWithOffset(O: Offset),
        f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
    B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
                           Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
    // Load address
    Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
    // Copy the loaded queue pointer into SGPR0:SGPR1 and emit the trap with
    // the register pair as an implicit use.
    B.buildCopy(Res: SGPR01, Op: Temp);
    B.buildInstr(Opcode: AMDGPU::S_TRAP)
        .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(Res: SGPR01, Op: LiveIn);
  B.buildInstr(Opcode: AMDGPU::S_TRAP)
      .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(RegNo: SGPR01, Flags: RegState::Implicit);

  MI.eraseFromParent();
  return true;
}
7666
7667bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7668 MachineRegisterInfo &MRI,
7669 MachineIRBuilder &B) const {
7670 // We need to simulate the 's_trap 2' instruction on targets that run in
7671 // PRIV=1 (where it is treated as a nop).
7672 if (ST.hasPrivEnabledTrap2NopBug()) {
7673 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7674 DL: MI.getDebugLoc());
7675 MI.eraseFromParent();
7676 return true;
7677 }
7678
7679 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7680 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7681 MI.eraseFromParent();
7682 return true;
7683}
7684
7685bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7686 MachineRegisterInfo &MRI,
7687 MachineIRBuilder &B) const {
7688 // Is non-HSA path or trap-handler disabled? Then, report a warning
7689 // accordingly
7690 if (!ST.hasTrapHandler() ||
7691 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7692 Function &Fn = B.getMF().getFunction();
7693 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7694 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7695 } else {
7696 // Insert debug-trap instruction
7697 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7698 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7699 }
7700
7701 MI.eraseFromParent();
7702 return true;
7703}
7704
bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(SizeInBits: 16);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);

  // Intrinsic arguments start at operand 2 (operand 1 is presumably the
  // intrinsic ID, as with the other intrinsic legalizations in this file).
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register NodePtr = MI.getOperand(i: 2).getReg();
  Register RayExtent = MI.getOperand(i: 3).getReg();
  Register RayOrigin = MI.getOperand(i: 4).getReg();
  Register RayDir = MI.getOperand(i: 5).getReg();
  Register RayInvDir = MI.getOperand(i: 6).getReg();
  Register TDescr = MI.getOperand(i: 7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    Function &Fn = B.getMF().getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
  // A16: 16-bit ray direction elements. Is64: 64-bit node pointer.
  const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  // NSA is mandatory on GFX12+, and otherwise usable only when the address
  // count fits the subtarget's NSA limit.
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  // Base opcode table indexed by [Is64][IsA16].
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                   MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                   MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  }
  assert(Opcode != -1);

  // Build the flattened vaddr operand list for the chosen layout.
  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    // GFX11+ NSA: each 3-component value is repacked as a single v3s32.
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
      auto Merged = B.buildMergeLikeInstr(
          Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
      Ops.push_back(Elt: Merged.getReg(Idx: 0));
    };

    Ops.push_back(Elt: NodePtr);
    Ops.push_back(Elt: RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      // A16 interleaves inv_dir and dir halves into three 32-bit lanes.
      auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          Res: V3S32,
          Ops: {B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
                                                 UnmergeRayDir.getReg(Idx: 0)}))
               .getReg(Idx: 0),
           B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
                                                 UnmergeRayDir.getReg(Idx: 1)}))
               .getReg(Idx: 0),
           B.buildBitcast(
               Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
                                                 UnmergeRayDir.getReg(Idx: 2)}))
               .getReg(Idx: 0)});
      Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    // Pre-GFX11 layout: every 32-bit lane is a separate vaddr operand.
    if (Is64) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
      Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
    } else {
      Ops.push_back(Elt: NodePtr);
    }
    Ops.push_back(Elt: RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
      Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
      Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      // Six 16-bit dir/inv_dir elements are packed pairwise into three s32s.
      auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
      Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
      Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
      B.buildMergeLikeInstr(Res: R1,
                            Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
      B.buildMergeLikeInstr(
          Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
      B.buildMergeLikeInstr(
          Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
      Ops.push_back(Elt: R1);
      Ops.push_back(Elt: R2);
      Ops.push_back(Elt: R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
    Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
    Ops.clear();
    Ops.push_back(Elt: MergedOps);
  }

  // Emit the target instruction: dst, MIMG opcode, vaddr operands, texture
  // descriptor, and an a16 flag immediate; memory refs are carried over.
  auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
    .addDef(RegNo: DstReg)
    .addImm(Val: Opcode);

  for (Register R : Ops) {
    MIB.addUse(RegNo: R);
  }

  MIB.addUse(RegNo: TDescr)
     .addImm(Val: IsA16 ? 1 : 0)
     .cloneMemRefs(OtherMI: MI);

  MI.eraseFromParent();
  return true;
}
7856
bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);

  // Three defs (intersection result, ray origin, ray direction), then the
  // intrinsic arguments starting at operand 4 (operand 3 is presumably the
  // intrinsic ID).
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register DstOrigin = MI.getOperand(i: 1).getReg();
  Register DstDir = MI.getOperand(i: 2).getReg();
  Register NodePtr = MI.getOperand(i: 4).getReg();
  Register RayExtent = MI.getOperand(i: 5).getReg();
  Register InstanceMask = MI.getOperand(i: 6).getReg();
  Register RayOrigin = MI.getOperand(i: 7).getReg();
  Register RayDir = MI.getOperand(i: 8).getReg();
  Register Offsets = MI.getOperand(i: 9).getReg();
  Register TDescr = MI.getOperand(i: 10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {
    Function &Fn = B.getMF().getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    return false;
  }

  // Distinguish the BVH8 variant from the dual variant by intrinsic ID.
  bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
                Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
  int Opcode = AMDGPU::getMIMGOpcode(
      BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
  assert(Opcode != -1);

  // The ray extent and the (extended) instance mask share one v2s32 operand.
  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});

  B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
      .addDef(RegNo: DstReg)
      .addDef(RegNo: DstOrigin)
      .addDef(RegNo: DstDir)
      .addImm(Val: Opcode)
      .addUse(RegNo: NodePtr)
      .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
      .addUse(RegNo: RayOrigin)
      .addUse(RegNo: RayDir)
      .addUse(RegNo: Offsets)
      .addUse(RegNo: TDescr)
      .cloneMemRefs(OtherMI: MI);

  MI.eraseFromParent();
  return true;
}
7910
7911bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7912 MachineIRBuilder &B) const {
7913 const SITargetLowering *TLI = ST.getTargetLowering();
7914 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7915 Register DstReg = MI.getOperand(i: 0).getReg();
7916 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7917 MI.eraseFromParent();
7918 return true;
7919}
7920
7921bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7922 MachineIRBuilder &B) const {
7923 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7924 if (!ST.hasArchitectedSGPRs())
7925 return false;
7926 LLT S32 = LLT::scalar(SizeInBits: 32);
7927 Register DstReg = MI.getOperand(i: 0).getReg();
7928 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7929 auto LSB = B.buildConstant(Res: S32, Val: 25);
7930 auto Width = B.buildConstant(Res: S32, Val: 5);
7931 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7932 MI.eraseFromParent();
7933 return true;
7934}
7935
7936bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7937 MachineIRBuilder &B,
7938 AMDGPU::Hwreg::Id HwReg,
7939 unsigned LowBit,
7940 unsigned Width) const {
7941 MachineRegisterInfo &MRI = *B.getMRI();
7942 Register DstReg = MI.getOperand(i: 0).getReg();
7943 if (!MRI.getRegClassOrNull(Reg: DstReg))
7944 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32RegClass);
7945 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
7946 .addDef(RegNo: DstReg)
7947 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width));
7948 MI.eraseFromParent();
7949 return true;
7950}
7951
7952static constexpr unsigned FPEnvModeBitField =
7953 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
7954
7955static constexpr unsigned FPEnvTrapBitField =
7956 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
7957
7958bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7959 MachineRegisterInfo &MRI,
7960 MachineIRBuilder &B) const {
7961 Register Src = MI.getOperand(i: 0).getReg();
7962 if (MRI.getType(Reg: Src) != S64)
7963 return false;
7964
7965 auto ModeReg =
7966 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7967 /*HasSideEffects=*/true, /*isConvergent=*/false)
7968 .addImm(Val: FPEnvModeBitField);
7969 auto TrapReg =
7970 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7971 /*HasSideEffects=*/true, /*isConvergent=*/false)
7972 .addImm(Val: FPEnvTrapBitField);
7973 B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
7974 MI.eraseFromParent();
7975 return true;
7976}
7977
7978bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7979 MachineRegisterInfo &MRI,
7980 MachineIRBuilder &B) const {
7981 Register Src = MI.getOperand(i: 0).getReg();
7982 if (MRI.getType(Reg: Src) != S64)
7983 return false;
7984
7985 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7986 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7987 /*HasSideEffects=*/true, /*isConvergent=*/false)
7988 .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
7989 .addReg(RegNo: Unmerge.getReg(Idx: 0));
7990 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7991 /*HasSideEffects=*/true, /*isConvergent=*/false)
7992 .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
7993 .addReg(RegNo: Unmerge.getReg(Idx: 1));
7994 MI.eraseFromParent();
7995 return true;
7996}
7997
7998bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7999 MachineInstr &MI) const {
8000 MachineIRBuilder &B = Helper.MIRBuilder;
8001 MachineRegisterInfo &MRI = *B.getMRI();
8002
8003 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
8004 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
8005 switch (IntrID) {
8006 case Intrinsic::sponentry:
8007 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8008 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8009 // that we can remove this cast.
8010 const LLT S32 = LLT::scalar(SizeInBits: 32);
8011 Register TmpReg = MRI.createGenericVirtualRegister(Ty: S32);
8012 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_SPONENTRY).addDef(RegNo: TmpReg);
8013
8014 Register DstReg = MI.getOperand(i: 0).getReg();
8015 B.buildIntToPtr(Dst: DstReg, Src: TmpReg);
8016 MI.eraseFromParent();
8017 } else {
8018 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8019 Size: 1, SPOffset: 0, /*IsImmutable=*/false);
8020 B.buildFrameIndex(Res: MI.getOperand(i: 0), Idx: FI);
8021 MI.eraseFromParent();
8022 }
8023 return true;
8024 case Intrinsic::amdgcn_if:
8025 case Intrinsic::amdgcn_else: {
8026 MachineInstr *Br = nullptr;
8027 MachineBasicBlock *UncondBrTarget = nullptr;
8028 bool Negated = false;
8029 if (MachineInstr *BrCond =
8030 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8031 const SIRegisterInfo *TRI
8032 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8033
8034 Register Def = MI.getOperand(i: 1).getReg();
8035 Register Use = MI.getOperand(i: 3).getReg();
8036
8037 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
8038
8039 if (Negated)
8040 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
8041
8042 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
8043 if (IntrID == Intrinsic::amdgcn_if) {
8044 B.buildInstr(Opcode: AMDGPU::SI_IF)
8045 .addDef(RegNo: Def)
8046 .addUse(RegNo: Use)
8047 .addMBB(MBB: UncondBrTarget);
8048 } else {
8049 B.buildInstr(Opcode: AMDGPU::SI_ELSE)
8050 .addDef(RegNo: Def)
8051 .addUse(RegNo: Use)
8052 .addMBB(MBB: UncondBrTarget);
8053 }
8054
8055 if (Br) {
8056 Br->getOperand(i: 0).setMBB(CondBrTarget);
8057 } else {
8058 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8059 // since we're swapping branch targets it needs to be reinserted.
8060 // FIXME: IRTranslator should probably not do this
8061 B.buildBr(Dest&: *CondBrTarget);
8062 }
8063
8064 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
8065 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
8066 MI.eraseFromParent();
8067 BrCond->eraseFromParent();
8068 return true;
8069 }
8070
8071 return false;
8072 }
8073 case Intrinsic::amdgcn_loop: {
8074 MachineInstr *Br = nullptr;
8075 MachineBasicBlock *UncondBrTarget = nullptr;
8076 bool Negated = false;
8077 if (MachineInstr *BrCond =
8078 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8079 const SIRegisterInfo *TRI
8080 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8081
8082 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
8083 Register Reg = MI.getOperand(i: 2).getReg();
8084
8085 if (Negated)
8086 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
8087
8088 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
8089 B.buildInstr(Opcode: AMDGPU::SI_LOOP)
8090 .addUse(RegNo: Reg)
8091 .addMBB(MBB: UncondBrTarget);
8092
8093 if (Br)
8094 Br->getOperand(i: 0).setMBB(CondBrTarget);
8095 else
8096 B.buildBr(Dest&: *CondBrTarget);
8097
8098 MI.eraseFromParent();
8099 BrCond->eraseFromParent();
8100 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
8101 return true;
8102 }
8103
8104 return false;
8105 }
8106 case Intrinsic::amdgcn_addrspacecast_nonnull:
8107 return legalizeAddrSpaceCast(MI, MRI, B);
8108 case Intrinsic::amdgcn_make_buffer_rsrc:
8109 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8110 case Intrinsic::amdgcn_kernarg_segment_ptr:
8111 if (!AMDGPU::isKernel(F: B.getMF().getFunction())) {
8112 // This only makes sense to call in a kernel, so just lower to null.
8113 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
8114 MI.eraseFromParent();
8115 return true;
8116 }
8117
8118 return legalizePreloadedArgIntrin(
8119 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8120 case Intrinsic::amdgcn_implicitarg_ptr:
8121 return legalizeImplicitArgPtr(MI, MRI, B);
8122 case Intrinsic::amdgcn_workitem_id_x:
8123 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
8124 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
8125 case Intrinsic::amdgcn_workitem_id_y:
8126 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
8127 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
8128 case Intrinsic::amdgcn_workitem_id_z:
8129 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
8130 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
8131 case Intrinsic::amdgcn_workgroup_id_x:
8132 return legalizeWorkGroupId(
8133 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
8134 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
8135 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
8136 case Intrinsic::amdgcn_workgroup_id_y:
8137 return legalizeWorkGroupId(
8138 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
8139 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
8140 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
8141 case Intrinsic::amdgcn_workgroup_id_z:
8142 return legalizeWorkGroupId(
8143 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
8144 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
8145 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
8146 case Intrinsic::amdgcn_cluster_id_x:
8147 return ST.hasClusters() &&
8148 legalizePreloadedArgIntrin(MI, MRI, B,
8149 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8150 case Intrinsic::amdgcn_cluster_id_y:
8151 return ST.hasClusters() &&
8152 legalizePreloadedArgIntrin(MI, MRI, B,
8153 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8154 case Intrinsic::amdgcn_cluster_id_z:
8155 return ST.hasClusters() &&
8156 legalizePreloadedArgIntrin(MI, MRI, B,
8157 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8158 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8159 return ST.hasClusters() &&
8160 legalizePreloadedArgIntrin(
8161 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
8162 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8163 return ST.hasClusters() &&
8164 legalizePreloadedArgIntrin(
8165 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
8166 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8167 return ST.hasClusters() &&
8168 legalizePreloadedArgIntrin(
8169 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
8170 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8171 return ST.hasClusters() &&
8172 legalizeConstHwRegRead(MI, B, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4);
8173 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8174 return ST.hasClusters() &&
8175 legalizePreloadedArgIntrin(
8176 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
8177 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8178 return ST.hasClusters() &&
8179 legalizePreloadedArgIntrin(
8180 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
8181 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8182 return ST.hasClusters() &&
8183 legalizePreloadedArgIntrin(
8184 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
8185 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8186 return ST.hasClusters() &&
8187 legalizePreloadedArgIntrin(
8188 MI, MRI, B,
8189 ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
8190 case Intrinsic::amdgcn_wave_id:
8191 return legalizeWaveID(MI, B);
8192 case Intrinsic::amdgcn_lds_kernel_id:
8193 return legalizePreloadedArgIntrin(MI, MRI, B,
8194 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8195 case Intrinsic::amdgcn_dispatch_ptr:
8196 return legalizePreloadedArgIntrin(MI, MRI, B,
8197 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
8198 case Intrinsic::amdgcn_queue_ptr:
8199 return legalizePreloadedArgIntrin(MI, MRI, B,
8200 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
8201 case Intrinsic::amdgcn_implicit_buffer_ptr:
8202 return legalizePreloadedArgIntrin(
8203 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8204 case Intrinsic::amdgcn_dispatch_id:
8205 return legalizePreloadedArgIntrin(MI, MRI, B,
8206 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
8207 case Intrinsic::r600_read_ngroups_x:
8208 // TODO: Emit error for hsa
8209 return legalizeKernargMemParameter(MI, B,
8210 Offset: SI::KernelInputOffsets::NGROUPS_X);
8211 case Intrinsic::r600_read_ngroups_y:
8212 return legalizeKernargMemParameter(MI, B,
8213 Offset: SI::KernelInputOffsets::NGROUPS_Y);
8214 case Intrinsic::r600_read_ngroups_z:
8215 return legalizeKernargMemParameter(MI, B,
8216 Offset: SI::KernelInputOffsets::NGROUPS_Z);
8217 case Intrinsic::r600_read_local_size_x:
8218 // TODO: Could insert G_ASSERT_ZEXT from s16
8219 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
8220 case Intrinsic::r600_read_local_size_y:
8221 // TODO: Could insert G_ASSERT_ZEXT from s16
8222 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
8223 // TODO: Could insert G_ASSERT_ZEXT from s16
8224 case Intrinsic::r600_read_local_size_z:
8225 return legalizeKernargMemParameter(MI, B,
8226 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
8227 case Intrinsic::amdgcn_fdiv_fast:
8228 return legalizeFDIVFastIntrin(MI, MRI, B);
8229 case Intrinsic::amdgcn_is_shared:
8230 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
8231 case Intrinsic::amdgcn_is_private:
8232 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
8233 case Intrinsic::amdgcn_wavefrontsize: {
8234 B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
8235 MI.eraseFromParent();
8236 return true;
8237 }
8238 case Intrinsic::amdgcn_s_buffer_load:
8239 return legalizeSBufferLoad(Helper, MI);
8240 case Intrinsic::amdgcn_raw_buffer_store:
8241 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8242 case Intrinsic::amdgcn_struct_buffer_store:
8243 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8244 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
8245 case Intrinsic::amdgcn_raw_buffer_store_format:
8246 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8247 case Intrinsic::amdgcn_struct_buffer_store_format:
8248 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8249 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
8250 case Intrinsic::amdgcn_raw_tbuffer_store:
8251 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8252 case Intrinsic::amdgcn_struct_tbuffer_store:
8253 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8254 return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
8255 case Intrinsic::amdgcn_raw_buffer_load:
8256 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8257 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8258 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8259 case Intrinsic::amdgcn_struct_buffer_load:
8260 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8261 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8262 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8263 return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
8264 case Intrinsic::amdgcn_raw_buffer_load_format:
8265 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8266 case Intrinsic::amdgcn_struct_buffer_load_format:
8267 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8268 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
8269 case Intrinsic::amdgcn_raw_tbuffer_load:
8270 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8271 case Intrinsic::amdgcn_struct_tbuffer_load:
8272 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8273 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
8274 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8275 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8276 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8277 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8278 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8279 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8280 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8281 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8282 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8284 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8285 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8286 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8287 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8288 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8289 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8290 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8291 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8292 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8293 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8294 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8295 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8296 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8297 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8298 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8299 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8300 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8301 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8302 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8303 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8304 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8305 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8306 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8308 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8309 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8310 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8311 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8312 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8313 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8314 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8315 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8316 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8317 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8318 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8319 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8320 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8321 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8322 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8323 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8324 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8325 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8326 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8327 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8328 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8329 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8330 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8331 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8332 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8333 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8334 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8335 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8336 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8337 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8338 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8339 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8340 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8341 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8342 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8343 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8344 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8345 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8346 return legalizeBufferAtomic(MI, B, IID: IntrID);
8347 case Intrinsic::amdgcn_rsq_clamp:
8348 return legalizeRsqClampIntrinsic(MI, MRI, B);
8349 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8350 return legalizeBVHIntersectRayIntrinsic(MI, B);
8351 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8352 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8353 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
8354 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8355 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8356 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8357 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8358 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8359 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8360 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8361 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8362 Register Index = MI.getOperand(i: 5).getReg();
8363 LLT S64 = LLT::scalar(SizeInBits: 64);
8364 LLT IndexArgTy = MRI.getType(Reg: Index);
8365 if (IndexArgTy != S64) {
8366 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: S64, Src: Index)
8367 : B.buildAnyExt(Res: S64, Op: Index);
8368 MI.getOperand(i: 5).setReg(NewIndex.getReg(Idx: 0));
8369 }
8370 return true;
8371 }
8372 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8373 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8374 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8376 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8377 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8378 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8379 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8380 Register Index = MI.getOperand(i: 5).getReg();
8381 LLT S32 = LLT::scalar(SizeInBits: 32);
8382 if (MRI.getType(Reg: Index) != S32)
8383 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
8384 return true;
8385 }
8386 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8387 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8388 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8389 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8390 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8391 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8392 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8393 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8394 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8395 Register Index = MI.getOperand(i: 7).getReg();
8396 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8397 ? LLT::scalar(SizeInBits: 64)
8398 : LLT::scalar(SizeInBits: 32);
8399 LLT IndexArgTy = MRI.getType(Reg: Index);
8400 if (IndexArgTy != IdxTy) {
8401 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: IdxTy, Src: Index)
8402 : B.buildAnyExt(Res: IdxTy, Op: Index);
8403 MI.getOperand(i: 7).setReg(NewIndex.getReg(Idx: 0));
8404 }
8405 return true;
8406 }
8407
8408 case Intrinsic::amdgcn_fmed3: {
8409 GISelChangeObserver &Observer = Helper.Observer;
8410
8411 // FIXME: This is to workaround the inability of tablegen match combiners to
8412 // match intrinsics in patterns.
8413 Observer.changingInstr(MI);
8414 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
8415 MI.removeOperand(OpNo: 1);
8416 Observer.changedInstr(MI);
8417 return true;
8418 }
8419 case Intrinsic::amdgcn_readlane:
8420 case Intrinsic::amdgcn_writelane:
8421 case Intrinsic::amdgcn_readfirstlane:
8422 case Intrinsic::amdgcn_permlane16:
8423 case Intrinsic::amdgcn_permlanex16:
8424 case Intrinsic::amdgcn_permlane64:
8425 case Intrinsic::amdgcn_set_inactive:
8426 case Intrinsic::amdgcn_set_inactive_chain_arg:
8427 case Intrinsic::amdgcn_mov_dpp8:
8428 case Intrinsic::amdgcn_update_dpp:
8429 return legalizeLaneOp(Helper, MI, IID: IntrID);
8430 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8431 return legalizeSBufferPrefetch(Helper, MI);
8432 case Intrinsic::amdgcn_dead: {
8433 // TODO: Use poison instead of undef
8434 for (const MachineOperand &Def : MI.defs())
8435 B.buildUndef(Res: Def);
8436 MI.eraseFromParent();
8437 return true;
8438 }
8439 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8440 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8441 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8442 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8443 B.buildLoad(Res: MI.getOperand(i: 0), Addr: MI.getOperand(i: 2), MMO&: **MI.memoperands_begin());
8444 MI.eraseFromParent();
8445 return true;
8446 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8447 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8448 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8449 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8450 B.buildStore(Val: MI.getOperand(i: 2), Addr: MI.getOperand(i: 1), MMO&: **MI.memoperands_begin());
8451 MI.eraseFromParent();
8452 return true;
8453 case Intrinsic::amdgcn_flat_load_monitor_b32:
8454 case Intrinsic::amdgcn_flat_load_monitor_b64:
8455 case Intrinsic::amdgcn_flat_load_monitor_b128:
8456 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8457 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8458 .add(MO: MI.getOperand(i: 0))
8459 .add(MO: MI.getOperand(i: 2))
8460 .addMemOperand(MMO: *MI.memoperands_begin());
8461 MI.eraseFromParent();
8462 return true;
8463 case Intrinsic::amdgcn_global_load_monitor_b32:
8464 case Intrinsic::amdgcn_global_load_monitor_b64:
8465 case Intrinsic::amdgcn_global_load_monitor_b128:
8466 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8467 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8468 .add(MO: MI.getOperand(i: 0))
8469 .add(MO: MI.getOperand(i: 2))
8470 .addMemOperand(MMO: *MI.memoperands_begin());
8471 MI.eraseFromParent();
8472 return true;
8473 default: {
8474 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8475 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
8476 return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
8477 return true;
8478 }
8479 }
8480
8481 return true;
8482}
8483