1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
30#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
32#include "llvm/CodeGen/GlobalISel/Utils.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/PseudoSourceValueManager.h"
35#include "llvm/CodeGen/TargetOpcodes.h"
36#include "llvm/IR/DiagnosticInfo.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
49static cl::opt<bool> EnableNewLegality(
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(Val: false),
54 cl::ReallyHidden);
55
// Upper bound, in bits, used by the register-size checks in this file.
static constexpr unsigned MaxRegisterSize{1024};
57
58// Round the number of elements to the next power of two elements
59static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
62 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
69 return LLT::scalar(SizeInBits: Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
110 };
111}
112
113static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
144static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(x: TypeIdx, y: LLT::scalar(SizeInBits: MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
152static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: Ty.getElementType()));
170 };
171}
172
173static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(SizeInBits: 128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
178}
179
180static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
185}
186
187static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(SizeInBits: Size);
194 }
195
196 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
206static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Size: Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up the maximum register size, and
266// multiples of v2s16.
267static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Ty: Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
297constexpr LLT S8 = LLT::scalar(SizeInBits: 8);
298constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
299constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
300constexpr LLT F32 = LLT::scalar(SizeInBits: 32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
302constexpr LLT F64 = LLT::scalar(SizeInBits: 64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(SizeInBits: 96);
304constexpr LLT S128 = LLT::scalar(SizeInBits: 128);
305constexpr LLT S160 = LLT::scalar(SizeInBits: 160);
306constexpr LLT S192 = LLT::scalar(SizeInBits: 192);
307constexpr LLT S224 = LLT::scalar(SizeInBits: 224);
308constexpr LLT S256 = LLT::scalar(SizeInBits: 256);
309constexpr LLT S512 = LLT::scalar(SizeInBits: 512);
310constexpr LLT S1024 = LLT::scalar(SizeInBits: 1024);
311constexpr LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
312
313constexpr LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
314constexpr LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
315constexpr LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
316constexpr LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
317constexpr LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
318constexpr LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
319constexpr LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
320constexpr LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
323constexpr LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::scalar(SizeInBits: 16));
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
327constexpr LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
328constexpr LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
329constexpr LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
330constexpr LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
331constexpr LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
332constexpr LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
333constexpr LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
334constexpr LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
335constexpr LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
336constexpr LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
337constexpr LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
338constexpr LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
341constexpr LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
342constexpr LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
343constexpr LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
344constexpr LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
345constexpr LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
346constexpr LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
347constexpr LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
350constexpr LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
353 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
356 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
359 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
360 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
363 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
364
365constexpr std::initializer_list<LLT> AllVectors{
366 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
367 V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
368 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
369 V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
375
376 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
377 is_contained(Set: AllScalarTypes, Element: Ty) ||
378 (ST.useRealTrue16Insts() && Ty == S16) ||
379 is_contained(Set: AllS16Vectors, Element: Ty);
380}
381
382static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Ty: Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
391static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
401static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(Value: MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
415 case AMDGPUAS::PRIVATE_ADDRESS:
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
418 case AMDGPUAS::LOCAL_ADDRESS:
419 return ST.useDS128() ? 128 : 64;
420 case AMDGPUAS::GLOBAL_ADDRESS:
421 case AMDGPUAS::CONSTANT_ADDRESS:
422 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
423 case AMDGPUAS::BUFFER_RESOURCE:
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
452 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 IsAtomic: Query.MMODescrs[0].Ordering !=
473 AtomicOrdering::NotAtomic))
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
500 Alignment: Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(Ty: ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// workaround this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
527 if (EnableNewLegality)
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
536 if (hasBufferRsrcWorkaround(Ty))
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
548 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
560 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(EltTy: Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note this case when the memory access itself
571/// changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(Value: SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
600 return TLI->allowsMisalignedMemoryAccessesImpl(
601 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
602 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
612 AlignInBits: Query.MMODescrs[0].AlignInBits,
613 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
619static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(i: Idx);
622
623 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
626 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(SizeInBits: 32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
642 B.buildMergeValues(Res: MO, Ops: VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
647 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
649 B.buildIntToPtr(Dst: MO, Src: Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
660static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Reg: Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
673 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
674 }
675 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
676 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
677}
678
679static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(i: Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
685 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
686 return;
687 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
688}
689
690AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const std::initializer_list<LLT> FPTypesPK16_64 = {S32, S64, S16, V2S16,
736 V2S64};
737
738 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
739
740 getActionDefinitionsBuilder(Opcode: G_BR).alwaysLegal();
741
742 // s1 for VCC branches, s32 for SCC branches.
743 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
744
745 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
746 // elements for v3s16
747 getActionDefinitionsBuilder(Opcode: G_PHI)
748 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
749 .legalFor(Types: AllS32Vectors)
750 .legalFor(Types: AllS64Vectors)
751 .legalFor(Types: AddrSpaces64)
752 .legalFor(Types: AddrSpaces32)
753 .legalFor(Types: AddrSpaces128)
754 .legalIf(Predicate: isPointer(TypeIdx: 0))
755 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
756 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
757 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
758 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
759 .scalarize(TypeIdx: 0);
760
761 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
762 // Full set of gfx9 features.
763 if (ST.hasPackedU64Ops()) {
764 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
765 .legalFor(Types: {S64, S32, S16, V2S16, V2S64})
766 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
767 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S64, NumElts: 2)
768 .scalarize(TypeIdx: 0)
769 .minScalar(TypeIdx: 0, Ty: S16)
770 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
771 .maxScalar(TypeIdx: 0, Ty: S32);
772 } else if (ST.hasScalarAddSub64()) {
773 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
774 .legalFor(Types: {S64, S32, S16, V2S16})
775 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
776 .scalarize(TypeIdx: 0)
777 .minScalar(TypeIdx: 0, Ty: S16)
778 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
779 .maxScalar(TypeIdx: 0, Ty: S32);
780 } else {
781 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
782 .legalFor(Types: {S32, S16, V2S16})
783 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
784 .scalarize(TypeIdx: 0)
785 .minScalar(TypeIdx: 0, Ty: S16)
786 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
787 .maxScalar(TypeIdx: 0, Ty: S32);
788 }
789
790 if (ST.hasScalarSMulU64()) {
791 getActionDefinitionsBuilder(Opcode: G_MUL)
792 .legalFor(Types: {S64, S32, S16, V2S16})
793 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
794 .scalarize(TypeIdx: 0)
795 .minScalar(TypeIdx: 0, Ty: S16)
796 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
797 .custom();
798 } else {
799 getActionDefinitionsBuilder(Opcode: G_MUL)
800 .legalFor(Types: {S32, S16, V2S16})
801 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
802 .scalarize(TypeIdx: 0)
803 .minScalar(TypeIdx: 0, Ty: S16)
804 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
805 .custom();
806 }
807 assert(ST.hasMad64_32());
808
809 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
810 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
811 .minScalarOrElt(TypeIdx: 0, Ty: S16)
812 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
813 .scalarize(TypeIdx: 0)
814 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
815 .lower();
816 } else if (ST.has16BitInsts()) {
817 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
818 .legalFor(Types: {S32, S16})
819 .minScalar(TypeIdx: 0, Ty: S16)
820 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
821 .maxScalar(TypeIdx: 0, Ty: S32)
822 .scalarize(TypeIdx: 0);
823
824 getActionDefinitionsBuilder(Opcode: G_MUL)
825 .legalFor(Types: {S32, S16})
826 .scalarize(TypeIdx: 0)
827 .minScalar(TypeIdx: 0, Ty: S16)
828 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
829 .custom();
830 assert(ST.hasMad64_32());
831
832 // Technically the saturating operations require clamp bit support, but this
833 // was introduced at the same time as 16-bit operations.
834 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
835 .legalFor(Types: {S32, S16}) // Clamp modifier
836 .minScalar(TypeIdx: 0, Ty: S16)
837 .scalarize(TypeIdx: 0)
838 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
839 .lower();
840
841 // We're just lowering this, but it helps get a better result to try to
842 // coerce to the desired type first.
843 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
844 .minScalar(TypeIdx: 0, Ty: S16)
845 .scalarize(TypeIdx: 0)
846 .lower();
847 } else {
848 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
849 .legalFor(Types: {S32})
850 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
851 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
852 .scalarize(TypeIdx: 0);
853
854 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
855 .legalFor(Types: {S32})
856 .scalarize(TypeIdx: 0)
857 .minScalar(TypeIdx: 0, Ty: S32)
858 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
859
860 if (ST.hasMad64_32())
861 Mul.custom();
862 else
863 Mul.maxScalar(TypeIdx: 0, Ty: S32);
864
865 if (ST.hasIntClamp()) {
866 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
867 .legalFor(Types: {S32}) // Clamp modifier.
868 .scalarize(TypeIdx: 0)
869 .minScalarOrElt(TypeIdx: 0, Ty: S32)
870 .lower();
871 } else {
872 // Clamp bit support was added in VI, along with 16-bit operations.
873 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
874 .minScalar(TypeIdx: 0, Ty: S32)
875 .scalarize(TypeIdx: 0)
876 .lower();
877 }
878
879 // FIXME: DAG expansion gets better results. The widening uses the smaller
880 // range values and goes for the min/max lowering directly.
881 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
882 .minScalar(TypeIdx: 0, Ty: S32)
883 .scalarize(TypeIdx: 0)
884 .lower();
885 }
886
887 getActionDefinitionsBuilder(
888 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
889 .customFor(Types: {S32, S64})
890 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
891 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
892 .scalarize(TypeIdx: 0);
893
894 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
895 .legalFor(Types: {S32})
896 .maxScalar(TypeIdx: 0, Ty: S32);
897
898 if (ST.hasVOP3PInsts()) {
899 Mulh
900 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
901 .lowerFor(Types: {V2S8});
902 }
903
904 Mulh
905 .scalarize(TypeIdx: 0)
906 .lower();
907
908 // Report legal for any types we can handle anywhere. For the cases only legal
909 // on the SALU, RegBankSelect will be able to re-legalize.
910 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
911 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
912 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
913 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
914 .fewerElementsIf(
915 Predicate: all(P0: vectorWiderThan(TypeIdx: 0, Size: 64), P1: scalarOrEltNarrowerThan(TypeIdx: 0, Size: 64)),
916 Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
917 .widenScalarToNextPow2(TypeIdx: 0)
918 .scalarize(TypeIdx: 0);
919
920 getActionDefinitionsBuilder(
921 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
922 .legalFor(Types: {{S32, S1}, {S32, S32}})
923 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
924 .scalarize(TypeIdx: 0);
925
926 getActionDefinitionsBuilder(Opcode: G_BITCAST)
927 // Don't worry about the size constraint.
928 .legalIf(Predicate: all(P0: isRegisterClassType(ST, TypeIdx: 0), P1: isRegisterClassType(ST, TypeIdx: 1)))
929 .lower();
930
931 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
932 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
933 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
934 .legalIf(Predicate: isPointer(TypeIdx: 0))
935 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
936 .widenScalarToNextPow2(TypeIdx: 0);
937
938 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
939 .legalFor(Types: {S32, S64, S16})
940 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
941
942 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
943 .legalIf(Predicate: isRegisterClassType(ST, TypeIdx: 0))
944 // s1 and s16 are special cases because they have legal operations on
945 // them, but don't really occupy registers in the normal way.
946 .legalFor(Types: {S1, S16})
947 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
948 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
949 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
950 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
951 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
952
953 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
954
955 // If the amount is divergent, we have to do a wave reduction to get the
956 // maximum value, so this is expanded during RegBankSelect.
957 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
958 .legalFor(Types: {{PrivatePtr, S32}});
959
960 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
961 .customFor(Types: {PrivatePtr});
962 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
963 .legalFor(Types: {PrivatePtr});
964
965 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
966
967 getActionDefinitionsBuilder(Opcodes: {G_GET_ROUNDING, G_SET_ROUNDING}).legalFor(Types: {S32});
968
969 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
970 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
971
972 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
973
974 auto &FPOpActions = getActionDefinitionsBuilder(
975 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
976 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
977 .legalFor(Types: {S32, S64});
978 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
979 .customFor(Types: {S32, S64});
980 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
981 .customFor(Types: {S32, S64});
982
983 if (ST.has16BitInsts()) {
984 if (ST.hasVOP3PInsts())
985 FPOpActions.legalFor(Types: {S16, V2S16});
986 else
987 FPOpActions.legalFor(Types: {S16});
988
989 TrigActions.customFor(Types: {S16});
990 FDIVActions.customFor(Types: {S16});
991 }
992
993 if (ST.hasPackedFP32Ops()) {
994 FPOpActions.legalFor(Types: {V2S32});
995 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
996 }
997
// When the subtarget reports packed FP64 ops, make V2S64 legal for the
// FPOpActions opcodes and cap s64-element vectors at 2 elements. This is
// registered once; an identical second copy of this block was redundant.
998 if (ST.hasPackedFP64Ops()) {
999 FPOpActions.legalFor(Types: {V2S64});
1000 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S64, NumElts: 2);
1001 }
1007
1008 auto &MinNumMaxNumIeee =
1009 getActionDefinitionsBuilder(Opcodes: {G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
1010
1011 if (ST.hasVOP3PInsts()) {
1012 MinNumMaxNumIeee.legalFor(Types: FPTypesPK16)
1013 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1014 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1015 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1016 .scalarize(TypeIdx: 0);
1017 } else if (ST.has16BitInsts()) {
1018 MinNumMaxNumIeee.legalFor(Types: FPTypes16).clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64).scalarize(TypeIdx: 0);
1019 } else {
1020 MinNumMaxNumIeee.legalFor(Types: FPTypesBase)
1021 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1022 .scalarize(TypeIdx: 0);
1023 }
1024
1025 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1026 Opcodes: {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1027
1028 if (ST.hasPackedFP64Ops()) {
1029 MinNumMaxNum.customFor(Types: FPTypesPK16_64)
1030 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1031 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1032 .clampMaxNumElements(TypeIdx: 0, EltTy: S64, MaxElements: 2)
1033 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1034 .scalarize(TypeIdx: 0);
1035 } else if (ST.hasVOP3PInsts()) {
1036 MinNumMaxNum.customFor(Types: FPTypesPK16)
1037 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1038 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1039 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1040 .scalarize(TypeIdx: 0);
1041 } else if (ST.has16BitInsts()) {
1042 MinNumMaxNum.customFor(Types: FPTypes16)
1043 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1044 .scalarize(TypeIdx: 0);
1045 } else {
1046 MinNumMaxNum.customFor(Types: FPTypesBase)
1047 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1048 .scalarize(TypeIdx: 0);
1049 }
1050
1051 if (ST.hasVOP3PInsts())
1052 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1053
1054 FPOpActions
1055 .scalarize(TypeIdx: 0)
1056 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1057
1058 TrigActions
1059 .scalarize(TypeIdx: 0)
1060 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1061
1062 FDIVActions
1063 .scalarize(TypeIdx: 0)
1064 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
1065
1066 auto &FNegAbs = getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS});
1067 FNegAbs.legalFor(Types: FPTypesPK16)
1068 .legalFor(Pred: ST.hasPackedFP32Ops(), Types: {V2S32})
1069 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1070 if (ST.hasPackedFP32Ops())
1071 FNegAbs.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
1072 FNegAbs.scalarize(TypeIdx: 0).clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1073
1074 if (ST.has16BitInsts()) {
1075 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1076 .legalFor(Types: {S16})
1077 .customFor(Types: {S32, S64})
1078 .scalarize(TypeIdx: 0)
1079 .unsupported();
1080 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1081 .legalFor(Types: {S32, S64, S16})
1082 .scalarize(TypeIdx: 0)
1083 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1084
1085 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1086 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
1087 .scalarize(TypeIdx: 0)
1088 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
1089 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1090 .lower();
1091
1092 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1093 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1094 .scalarize(TypeIdx: 0)
1095 .lower();
1096
1097 getActionDefinitionsBuilder(Opcode: G_FMODF)
1098 .lowerFor(Types: {S16, S32, S64})
1099 .scalarize(TypeIdx: 0)
1100 .lower();
1101 } else {
1102 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1103 .customFor(Types: {S32, S64, S16})
1104 .scalarize(TypeIdx: 0)
1105 .unsupported();
1106
1107
1108 if (ST.hasFractBug()) {
1109 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1110 .customFor(Types: {S64})
1111 .legalFor(Types: {S32, S64})
1112 .scalarize(TypeIdx: 0)
1113 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1114 } else {
1115 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1116 .legalFor(Types: {S32, S64})
1117 .scalarize(TypeIdx: 0)
1118 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1119 }
1120
1121 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1122 .legalFor(Types: {{S32, S32}, {S64, S32}})
1123 .scalarize(TypeIdx: 0)
1124 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1125 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1126 .lower();
1127
1128 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1129 .customFor(Types: {{S32, S32}, {S64, S32}})
1130 .scalarize(TypeIdx: 0)
1131 .minScalar(TypeIdx: 0, Ty: S32)
1132 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1133 .lower();
1134
1135 getActionDefinitionsBuilder(Opcode: G_FMODF)
1136 .lowerFor(Types: {S32, S64})
1137 .scalarize(TypeIdx: 0)
1138 .lower();
1139 }
1140
1141 auto &FPTruncActions = getActionDefinitionsBuilder(Opcode: G_FPTRUNC);
1142 if (ST.hasCvtPkF16F32Inst()) {
1143 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1144 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1145 } else {
1146 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}});
1147 }
1148 FPTruncActions.scalarize(TypeIdx: 0).lower();
1149
1150 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1151 .legalFor(Types: {{S64, S32}, {S32, S16}})
1152 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1153 .scalarize(TypeIdx: 0);
1154
1155 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1156 if (ST.has16BitInsts()) {
1157 FSubActions
1158 // Use actual fsub instruction
1159 .legalFor(Types: {S32, S16})
1160 // Must use fadd + fneg
1161 .lowerFor(Types: {S64, V2S16});
1162 } else {
1163 FSubActions
1164 // Use actual fsub instruction
1165 .legalFor(Types: {S32})
1166 // Must use fadd + fneg
1167 .lowerFor(Types: {S64, S16, V2S16});
1168 }
1169
1170 if (ST.hasPackedFP32Ops())
1171 FSubActions.lowerFor(Types: {V2S32}).clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2);
1172
1173 FSubActions
1174 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1175 .scalarize(TypeIdx: 0)
1176 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1177
1178 // Whether this is legal depends on the floating point mode for the function.
1179 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1180 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1181 FMad.customFor(Types: {S32, S16});
1182 else if (ST.hasMadMacF32Insts())
1183 FMad.customFor(Types: {S32});
1184 else if (ST.hasMadF16())
1185 FMad.customFor(Types: {S16});
1186 FMad.scalarize(TypeIdx: 0)
1187 .lower();
1188
1189 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1190 if (ST.has16BitInsts()) {
1191 FRem.customFor(Types: {S16, S32, S64});
1192 } else {
1193 FRem.minScalar(TypeIdx: 0, Ty: S32)
1194 .customFor(Types: {S32, S64});
1195 }
1196 FRem.scalarize(TypeIdx: 0);
1197
1198 // TODO: Do we need to clamp maximum bitwidth?
1199 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1200 .legalIf(Predicate: isScalar(TypeIdx: 0))
1201 .legalFor(Types: {{V2S16, V2S32}})
1202 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1203 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1204 // situations (like an invalid implicit use), we don't want to infinite loop
1205 // in the legalizer.
1206 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1207 .alwaysLegal();
1208
1209 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1210 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1211 {S32, S1}, {S64, S1}, {S16, S1}})
1212 .scalarize(TypeIdx: 0)
1213 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1214 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1215
1216 // TODO: Split s1->s64 during regbankselect for VALU.
1217 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1218 .legalFor(Types: {{S32, S32}, {S64, S32}})
1219 .widenScalarFor(Types: {{S16, S32}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1220 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1221 .customFor(Types: {{S32, S64}, {S64, S64}});
1222 if (ST.has16BitInsts())
1223 IToFP.legalFor(Types: {{S16, S16}});
1224 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1225 .minScalar(TypeIdx: 0, Ty: S32)
1226 .scalarize(TypeIdx: 0)
1227 .widenScalarToNextPow2(TypeIdx: 1);
1228
1229 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1230 .legalFor(Types: {{S32, S32}, {S32, S64}})
1231 .customFor(Types: {{S64, S32}, {S64, S64}})
1232 .widenScalarFor(Types: {{S32, S16}}, Mutation: changeTo(TypeIdx: 1, Ty: S32))
1233 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1234 if (ST.has16BitInsts())
1235 FPToI.legalFor(Types: {{S16, S16}});
1236 else
1237 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1238
1239 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1240 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1241 .scalarize(TypeIdx: 0)
1242 .lower();
1243
1244 // clang-format off
1245 auto &FPToISat = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI_SAT, G_FPTOUI_SAT})
1246 .legalFor(Types: {{S32, S32}, {S32, S64}, {S16, S32}})
1247 .legalFor(Pred: ST.has16BitInsts(), Types: {{S16, S16}})
1248 .legalFor(Pred: ST.hasVCvtPkIU16F32(), Types: {{V2S16, V2S32}})
1249 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1250
1251 // If available, widen width <16 to i16, instead of i32 so v_cvt_i16/u16_f16 can be used.
1252 if (ST.has16BitInsts())
1253 FPToISat.minScalarIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S16), TypeIdx: 0, Ty: S16);
1254
1255 if (ST.hasVCvtPkIU16F32())
1256 FPToISat.clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1257
1258 FPToISat.minScalar(TypeIdx: 1, Ty: S32);
1259 FPToISat.minScalar(TypeIdx: 0, Ty: S32)
1260 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1261 .scalarize(TypeIdx: 0)
1262 .lower();
1263 // clang-format on
1264
1265 getActionDefinitionsBuilder(Opcodes: {G_LROUND, G_LLROUND})
1266 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1267 .scalarize(TypeIdx: 0)
1268 .lower();
1269
1270 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1271 .legalFor(Types: {S16, S32})
1272 .scalarize(TypeIdx: 0)
1273 .lower();
1274
1275 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1276 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1277 .scalarize(TypeIdx: 0)
1278 .lower();
1279
1280 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1281 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1282 .scalarize(TypeIdx: 0)
1283 .lower();
1284
1285 if (ST.has16BitInsts()) {
1286 getActionDefinitionsBuilder(
1287 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1288 .legalFor(Types: {S16, S32, S64})
1289 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1290 .scalarize(TypeIdx: 0);
1291 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1292 getActionDefinitionsBuilder(
1293 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1294 .legalFor(Types: {S32, S64})
1295 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1296 .scalarize(TypeIdx: 0);
1297 } else {
1298 getActionDefinitionsBuilder(
1299 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1300 .legalFor(Types: {S32})
1301 .customFor(Types: {S64})
1302 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1303 .scalarize(TypeIdx: 0);
1304 }
1305
1306 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1307 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1308 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1309 .scalarize(TypeIdx: 0)
1310 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1311
1312 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1313 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1314 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1315 .scalarize(TypeIdx: 0);
1316
1317 auto &CmpBuilder =
1318 getActionDefinitionsBuilder(Opcode: G_ICMP)
1319 // The compare output type differs based on the register bank of the output,
1320 // so make both s1 and s32 legal.
1321 //
1322 // Scalar compares producing output in scc will be promoted to s32, as that
1323 // is the allocatable register type that will be needed for the copy from
1324 // scc. This will be promoted during RegBankSelect, and we assume something
1325 // before that won't try to use s32 result types.
1326 //
1327 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1328 // bank.
1329 .legalForCartesianProduct(
1330 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1331 .legalForCartesianProduct(
1332 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1333 if (ST.has16BitInsts()) {
1334 CmpBuilder.legalFor(Types: {{S1, S16}});
1335 }
1336
1337 CmpBuilder
1338 .widenScalarToNextPow2(TypeIdx: 1)
1339 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1340 .scalarize(TypeIdx: 0)
1341 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1342
1343 auto &FCmpBuilder =
1344 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1345 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1346
1347 if (ST.hasSALUFloatInsts())
1348 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1349
1350 FCmpBuilder
1351 .widenScalarToNextPow2(TypeIdx: 1)
1352 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1353 .scalarize(TypeIdx: 0);
1354
1355 // FIXME: fpow has a selection pattern that should move to custom lowering.
1356 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1357 if (ST.has16BitInsts())
1358 ExpOps.customFor(Types: {{S32}, {S16}});
1359 else
1360 ExpOps.customFor(Types: {S32});
1361 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1362 .scalarize(TypeIdx: 0);
1363
1364 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1365 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1366 .lower();
1367
1368 getActionDefinitionsBuilder(Opcode: G_FLOG2)
1369 .legalFor(Pred: ST.has16BitInsts(), Types: {S16})
1370 .customFor(Types: {S32, S16})
1371 .scalarize(TypeIdx: 0)
1372 .lower();
1373
1374 getActionDefinitionsBuilder(Opcode: G_FEXP2)
1375 .legalFor(Pred: ST.has16BitInsts(), Types: {S16})
1376 .customFor(Types: {S32, S64, S16})
1377 .scalarize(TypeIdx: 0)
1378 .lower();
1379
1380 auto &LogOps =
1381 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1382 LogOps.customFor(Types: {S32, S16, S64});
1383 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1384 .scalarize(TypeIdx: 0);
1385
1386 // The 64-bit versions produce 32-bit results, but only on the SALU.
1387 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1388 .legalFor(Types: {{S32, S32}, {S32, S64}})
1389 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1390 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1391 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1392 .scalarize(TypeIdx: 0)
1393 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1394
1395 // If no 16 bit instr is available, lower into different instructions.
1396 if (ST.has16BitInsts())
1397 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1398 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1399 .widenScalarToNextPow2(TypeIdx: 1)
1400 .scalarize(TypeIdx: 0)
1401 .lower();
1402 else
1403 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1404 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1405 .lowerFor(Types: {S1, S16})
1406 .widenScalarToNextPow2(TypeIdx: 1)
1407 .scalarize(TypeIdx: 0)
1408 .lower();
1409
1410 // The hardware instructions return a different result on 0 than the generic
1411 // instructions expect. The hardware produces -1, but these produce the
1412 // bitwidth.
1413 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1414 .scalarize(TypeIdx: 0)
1415 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1416 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1417 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1418 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1419 .custom();
1420
1421 // The 64-bit versions produce 32-bit results, but only on the SALU.
1422 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_POISON)
1423 .legalFor(Types: {{S32, S32}, {S32, S64}})
1424 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1425 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1426 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1427 .scalarize(TypeIdx: 0)
1428 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1429 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1430
1431 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_POISON)
1432 .legalFor(Types: {{S32, S32}, {S32, S64}})
1433 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1434 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1435 .scalarize(TypeIdx: 0)
1436 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1437 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1438
1439 getActionDefinitionsBuilder(Opcode: G_CTLS)
1440 .customFor(Types: {{S32, S32}})
1441 .scalarize(TypeIdx: 0)
1442 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1443 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1444
1445 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1446 // RegBankSelect.
1447 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1448 .legalFor(Types: {S32, S64})
1449 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1450 .scalarize(TypeIdx: 0)
1451 .widenScalarToNextPow2(TypeIdx: 0);
1452
1453 if (ST.has16BitInsts()) {
1454 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1455 .legalFor(Types: {S16, S32, V2S16})
1456 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1457 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1458 // narrowScalar limitation.
1459 .widenScalarToNextPow2(TypeIdx: 0)
1460 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1461 .scalarize(TypeIdx: 0);
1462
1463 if (ST.hasVOP3PInsts()) {
1464 getActionDefinitionsBuilder(Opcode: G_ABS)
1465 .legalFor(Types: {S32, S16, V2S16})
1466 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1467 .minScalar(TypeIdx: 0, Ty: S16)
1468 .widenScalarToNextPow2(TypeIdx: 0)
1469 .scalarize(TypeIdx: 0)
1470 .lower();
1471 if (ST.hasMinMaxI64Insts()) {
1472 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1473 .legalFor(Types: {S32, S16, S64, V2S16})
1474 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1475 .minScalar(TypeIdx: 0, Ty: S16)
1476 .widenScalarToNextPow2(TypeIdx: 0)
1477 .scalarize(TypeIdx: 0)
1478 .lower();
1479 } else {
1480 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1481 .legalFor(Types: {S32, S16, V2S16})
1482 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1483 .minScalar(TypeIdx: 0, Ty: S16)
1484 .widenScalarToNextPow2(TypeIdx: 0)
1485 .scalarize(TypeIdx: 0)
1486 .lower();
1487 }
1488 } else {
1489 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1490 .legalFor(Types: {S32, S16})
1491 .widenScalarToNextPow2(TypeIdx: 0)
1492 .minScalar(TypeIdx: 0, Ty: S16)
1493 .scalarize(TypeIdx: 0)
1494 .lower();
1495 }
1496 } else {
1497 // TODO: Should have same legality without v_perm_b32
1498 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1499 .legalFor(Types: {S32})
1500 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1501 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1502 // narrowScalar limitation.
1503 .widenScalarToNextPow2(TypeIdx: 0)
1504 .maxScalar(TypeIdx: 0, Ty: S32)
1505 .scalarize(TypeIdx: 0)
1506 .lower();
1507
1508 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1509 .legalFor(Types: {S32})
1510 .minScalar(TypeIdx: 0, Ty: S32)
1511 .widenScalarToNextPow2(TypeIdx: 0)
1512 .scalarize(TypeIdx: 0)
1513 .lower();
1514 }
1515
1516 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1517 // List the common cases
1518 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1519 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1520 .scalarize(TypeIdx: 0)
1521 // Accept any address space as long as the size matches
1522 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1523 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1524 Mutation: [](const LegalityQuery &Query) {
1525 return std::pair(
1526 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1527 })
1528 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1529 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1530 });
1531
1532 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1533 // List the common cases
1534 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1535 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1536 .scalarize(TypeIdx: 0)
1537 // Accept any address space as long as the size matches
1538 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1539 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1540 Mutation: [](const LegalityQuery &Query) {
1541 return std::pair(
1542 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1543 })
1544 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1545 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1546 });
1547
1548 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1549 .scalarize(TypeIdx: 0)
1550 .custom();
1551
// Decides whether the memory access described by a load/store legality query
// has to be split. Returns true when any of the following holds:
//   1. the register type (Types[0]) is a vector wider than the memory type;
//   2. the memory size exceeds what maxSizeForAddrSpace() reports for the
//      pointer's address space;
//   3. the access spans a number of 32-bit units that is neither a power of
//      two nor a 3-unit access the subtarget reports support for.
// Otherwise returns false.
//   Query  - legality query; Types[0] is read as the value type (DstTy),
//            Types[1] as the pointer type, MMODescrs[0] as the memory operand.
//   IsLoad - forwarded unchanged to maxSizeForAddrSpace().
1552 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1553 bool IsLoad) -> bool {
1554 const LLT DstTy = Query.Types[0];
1555
1556 // Split vector extloads, i.e. a vector register type that is wider than the
// memory type being accessed.
1557 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1558
1559 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1560 return true;
1561
// Compare the access size against the limit for this address space; the
// access is treated as atomic whenever its ordering is not NotAtomic.
1562 const LLT PtrTy = Query.Types[1];
1563 unsigned AS = PtrTy.getAddressSpace();
1564 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1565 IsAtomic: Query.MMODescrs[0].Ordering !=
1566 AtomicOrdering::NotAtomic))
1567 return true;
1568
1569 // Catch weird sized loads that don't evenly divide into the access sizes
1570 // TODO: May be able to widen depending on alignment etc.
1571 unsigned NumRegs = (MemSize + 31) / 32; // MemSize rounded up to 32-bit units
1572 if (NumRegs == 3) {
// A 3-unit access is only kept whole when the subtarget has dwordx3
// load/stores.
1573 if (!ST.hasDwordx3LoadStores())
1574 return true;
1575 } else {
1576 // If the alignment allows, these should have been widened.
1577 if (!isPowerOf2_32(Value: NumRegs))
1578 return true;
1579 }
1580
1581 return false;
1582 };
1583
// Alignment values used for the GlobalPtr and ConstantPtr entries of the
// explicit G_LOAD/G_STORE legal list that follows. Each is 0 when the
// subtarget has unaligned buffer access enabled, otherwise 32/16/8
// (presumably in bits, matching the memory sizes they are paired with --
// TODO confirm).
1584 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1585 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1586 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1587
1588 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1589 // LDS
1590 // TODO: Unsupported flat for SI.
1591
1592 for (unsigned Op : {G_LOAD, G_STORE}) {
1593 const bool IsStore = Op == G_STORE;
1594
1595 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1596 // Explicitly list some common cases.
1597 // TODO: Does this help compile time at all?
1598 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1599 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1600 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1601 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1602 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1603 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1604 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1605 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1606
1607 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1608 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1609 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1610 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1611 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1612 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1613
1614 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1615 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1616 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1617 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1618
1619 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1620 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1621 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1622 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1623 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1624
1625 Actions.legalForTypesWithMemDesc(Pred: ST.useRealTrue16Insts(), /* Pred */
1626 TypesAndMemDesc: {{.Type0: S16, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1627 {.Type0: S16, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1628 {.Type0: S16, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1629 {.Type0: S16, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1630 {.Type0: S16, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1631 {.Type0: S16, .Type1: PrivatePtr, .MemTy: S16, .Align: 16}});
1632
1633 Actions.legalIf(
1634 Predicate: [=](const LegalityQuery &Query) -> bool {
1635 return isLoadStoreLegal(ST, Query);
1636 });
1637
1638 // The custom pointers (fat pointers, buffer resources) don't work with load
1639 // and store at this level. Fat pointers should have been lowered to
1640 // intrinsics before the translation to MIR.
1641 Actions.unsupportedIf(
1642 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1643
1644 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1645 // ptrtoint. This is needed to account for the fact that we can't have i128
1646 // as a register class for SelectionDAG reasons.
1647 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1648 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1649 });
1650
1651 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1652 // 64-bits.
1653 //
1654 // TODO: Should generalize bitcast action into coerce, which will also cover
1655 // inserting addrspacecasts.
1656 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1657
1658 // Turn any illegal element vectors into something easier to deal
1659 // with. These will ultimately produce 32-bit scalar shifts to extract the
1660 // parts anyway.
1661 //
1662 // For odd 16-bit element vectors, prefer to split those into pieces with
1663 // 16-bit vector parts.
1664 Actions.bitcastIf(
1665 Predicate: [=](const LegalityQuery &Query) -> bool {
1666 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1667 MemTy: Query.MMODescrs[0].MemoryTy);
1668 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1669
1670 if (!IsStore) {
1671 // Widen suitably aligned loads by loading extra bytes. The standard
1672 // legalization actions can't properly express widening memory operands.
1673 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1674 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1675 });
1676 }
1677
1678 // FIXME: load/store narrowing should be moved to lower action
1679 Actions
1680 .narrowScalarIf(
1681 Predicate: [=](const LegalityQuery &Query) -> bool {
1682 return !Query.Types[0].isVector() &&
1683 needToSplitMemOp(Query, Op == G_LOAD);
1684 },
1685 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1686 const LLT DstTy = Query.Types[0];
1687 const LLT PtrTy = Query.Types[1];
1688
1689 const unsigned DstSize = DstTy.getSizeInBits();
1690 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1691
1692 // Split extloads.
1693 if (DstSize > MemSize)
1694 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1695
1696 unsigned MaxSize = maxSizeForAddrSpace(
1697 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1698 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1699 if (MemSize > MaxSize)
1700 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1701
1702 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1703 return std::pair(0, LLT::scalar(SizeInBits: Align));
1704 })
1705 .fewerElementsIf(
1706 Predicate: [=](const LegalityQuery &Query) -> bool {
1707 return Query.Types[0].isVector() &&
1708 needToSplitMemOp(Query, Op == G_LOAD);
1709 },
1710 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1711 const LLT DstTy = Query.Types[0];
1712 const LLT PtrTy = Query.Types[1];
1713
1714 LLT EltTy = DstTy.getElementType();
1715 unsigned MaxSize = maxSizeForAddrSpace(
1716 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1717 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1718
1719 // FIXME: Handle widened to power of 2 results better. This ends
1720 // up scalarizing.
1721 // FIXME: 3 element stores scalarized on SI
1722
1723 // Split if it's too large for the address space.
1724 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1725 if (MemSize > MaxSize) {
1726 unsigned NumElts = DstTy.getNumElements();
1727 unsigned EltSize = EltTy.getSizeInBits();
1728
1729 if (MaxSize % EltSize == 0) {
1730 return std::pair(
1731 0, LLT::scalarOrVector(
1732 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1733 }
1734
1735 unsigned NumPieces = MemSize / MaxSize;
1736
1737 // FIXME: Refine when odd breakdowns handled
1738 // The scalars will need to be re-legalized.
1739 if (NumPieces == 1 || NumPieces >= NumElts ||
1740 NumElts % NumPieces != 0)
1741 return std::pair(0, EltTy);
1742
1743 return std::pair(0,
1744 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1745 }
1746
1747 // FIXME: We could probably handle weird extending loads better.
1748 if (DstTy.getSizeInBits() > MemSize)
1749 return std::pair(0, EltTy);
1750
1751 unsigned EltSize = EltTy.getSizeInBits();
1752 unsigned DstSize = DstTy.getSizeInBits();
1753 if (!isPowerOf2_32(Value: DstSize)) {
1754 // We're probably decomposing an odd sized store. Try to split
1755 // to the widest type. TODO: Account for alignment. As-is it
1756 // should be OK, since the new parts will be further legalized.
1757 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1758 return std::pair(
1759 0, LLT::scalarOrVector(
1760 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1761 }
1762
1763 // May need relegalization for the scalars.
1764 return std::pair(0, EltTy);
1765 })
1766 .minScalar(TypeIdx: 0, Ty: S32)
1767 .narrowScalarIf(Predicate: isTruncStoreToSizePowerOf2(TypeIdx: 0),
1768 Mutation: getScalarTypeFromMemDesc(TypeIdx: 0))
1769 .widenScalarToNextPow2(TypeIdx: 0)
1770 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1771 .lower();
1772 }
1773
1774 // FIXME: Unaligned accesses not lowered.
1775 auto &ExtLoads =
1776 getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1777 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1778 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1779 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1780 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1781 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1782 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1783 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1784 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1785 .legalForTypesWithMemDesc(Pred: ST.useRealTrue16Insts(),
1786 TypesAndMemDesc: {{.Type0: S16, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1787 {.Type0: S16, .Type1: LocalPtr, .MemTy: S8, .Align: GlobalAlign8},
1788 {.Type0: S16, .Type1: PrivatePtr, .MemTy: S8, .Align: GlobalAlign8},
1789 {.Type0: S16, .Type1: ConstantPtr, .MemTy: S8, .Align: GlobalAlign8}})
1790 .legalIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1791 return isLoadStoreLegal(ST, Query);
1792 });
1793
1794 if (ST.hasFlatAddressSpace()) {
1795 ExtLoads.legalForTypesWithMemDesc(
1796 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1797
1798 ExtLoads.legalForTypesWithMemDesc(Pred: ST.useRealTrue16Insts(),
1799 TypesAndMemDesc: {{.Type0: S16, .Type1: FlatPtr, .MemTy: S8, .Align: GlobalAlign8}});
1800 }
1801
1802 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1803 // 64-bits.
1804 //
1805 // TODO: Should generalize bitcast action into coerce, which will also cover
1806 // inserting addrspacecasts.
1807 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1808
1809 ExtLoads.narrowScalarIf(
1810 Predicate: [](const LegalityQuery &Query) {
1811 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1812 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1813 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1814 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1815 Mutation: getScalarTypeFromMemDesc(TypeIdx: 0));
1816 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1817 .widenScalarToNextPow2(TypeIdx: 0)
1818 .lower();
1819
1820 auto &Atomics = getActionDefinitionsBuilder(
1821 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1822 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1823 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1824 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1825 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1826 {S64, GlobalPtr}, {S64, LocalPtr},
1827 {S32, RegionPtr}, {S64, RegionPtr}});
1828 if (ST.hasFlatAddressSpace()) {
1829 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1830 }
1831
1832 auto &Atomics32 =
1833 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1834 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1835 if (ST.hasFlatAddressSpace()) {
1836 Atomics32.legalFor(Types: {{S32, FlatPtr}});
1837 }
1838
1839 // TODO: v2bf16 operations, and fat buffer pointer support.
1840 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1841 if (ST.hasLDSFPAtomicAddF32()) {
1842 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1843 if (ST.hasLdsAtomicAddF64())
1844 Atomic.legalFor(Types: {{S64, LocalPtr}});
1845 if (ST.hasAtomicDsPkAdd16Insts())
1846 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1847 }
1848 if (ST.hasAtomicFaddInsts())
1849 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1850 if (ST.hasFlatAtomicFaddF32Inst())
1851 Atomic.legalFor(Types: {{S32, FlatPtr}});
1852
1853 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1854 // These are legal with some caveats, and should have undergone expansion in
1855 // the IR in most situations
1856 // TODO: Move atomic expansion into legalizer
1857 Atomic.legalFor(Types: {
1858 {S32, GlobalPtr},
1859 {S64, GlobalPtr},
1860 {S64, FlatPtr}
1861 });
1862 }
1863
1864 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1865 ST.hasAtomicBufferGlobalPkAddF16Insts())
1866 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1867 if (ST.hasAtomicGlobalPkAddBF16Inst())
1868 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1869 if (ST.hasAtomicFlatPkAdd16Insts())
1870 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1871
1872
1873 // Most of the legalization work here is done by AtomicExpand. We could
1874 // probably use a simpler legality rule that just assumes anything is OK.
1875 auto &AtomicFMinFMax =
1876 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1877 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1878
1879 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1880 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1881 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1882 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1883 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1884 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1885 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1886 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1887
1888 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1889 // demarshalling
1890 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1891 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1892 {S32, FlatPtr}, {S64, FlatPtr}})
1893 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1894 {S32, RegionPtr}, {S64, RegionPtr}});
1895 // TODO: Pointer types, any 32-bit or 64-bit vector
1896
1897 // Condition should be s32 for scalar, s1 for vector.
1898 getActionDefinitionsBuilder(Opcode: G_SELECT)
1899 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1900 LocalPtr, FlatPtr, PrivatePtr,
1901 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1902 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1903 Types1: {S1, S32})
1904 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1905 .scalarize(TypeIdx: 1)
1906 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1907 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1908 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1909 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1910 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1911 .scalarize(TypeIdx: 0)
1912 .widenScalarToNextPow2(TypeIdx: 0)
1913 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1914
1915 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1916 // be more flexible with the shift amount type.
1917 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1918 .legalFor(Types: {{S32, S32}, {S64, S32}});
1919 if (ST.has16BitInsts()) {
1920 if (ST.hasVOP3PInsts()) {
1921 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1922 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1923 } else
1924 Shifts.legalFor(Types: {{S16, S16}});
1925
1926 // TODO: Support 16-bit shift amounts for all types
1927 Shifts.widenScalarIf(
1928 Predicate: [=](const LegalityQuery &Query) {
1929 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1930 // 32-bit amount.
1931 const LLT ValTy = Query.Types[0];
1932 const LLT AmountTy = Query.Types[1];
1933 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1934 AmountTy.getSizeInBits() < 16;
1935 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1936 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1937 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1938 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1939 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1940
1941 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1942 .minScalar(TypeIdx: 0, Ty: S16)
1943 .scalarize(TypeIdx: 0)
1944 .lower();
1945 } else {
1946 // Make sure we legalize the shift amount type first, as the general
1947 // expansion for the shifted type will produce much worse code if it hasn't
1948 // been truncated already.
1949 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1950 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1951 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1952
1953 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1954 .minScalar(TypeIdx: 0, Ty: S32)
1955 .scalarize(TypeIdx: 0)
1956 .lower();
1957 }
1958 Shifts.scalarize(TypeIdx: 0);
1959
1960 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1961 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1962 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1963 unsigned IdxTypeIdx = 2;
1964
1965 getActionDefinitionsBuilder(Opcode: Op)
1966 .customIf(Predicate: [=](const LegalityQuery &Query) {
1967 const LLT EltTy = Query.Types[EltTypeIdx];
1968 const LLT VecTy = Query.Types[VecTypeIdx];
1969 const LLT IdxTy = Query.Types[IdxTypeIdx];
1970 const unsigned EltSize = EltTy.getSizeInBits();
1971 const bool isLegalVecType =
1972 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1973 // Address space 8 pointers are 128-bit wide values, but the logic
1974 // below will try to bitcast them to 2N x s64, which will fail.
1975 // Therefore, as an intermediate step, wrap extracts/insertions by
1976 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1977 // extraction result) in order to produce a vector operation that can
1978 // be handled by the logic below.
1979 if (EltTy.isPointer() && EltSize > 64)
1980 return true;
1981 return (EltSize == 32 || EltSize == 64) &&
1982 VecTy.getSizeInBits() % 32 == 0 &&
1983 VecTy.getSizeInBits() <= MaxRegisterSize &&
1984 IdxTy.getSizeInBits() == 32 &&
1985 isLegalVecType;
1986 })
1987 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1988 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1989 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1990 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1991 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1992 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1993 Mutation: [=](const LegalityQuery &Query) {
1994 // For > 64-bit element types, try to turn this into a
1995 // 64-bit element vector since we may be able to do better
1996 // indexing if this is scalar. If not, fall back to 32.
1997 const LLT EltTy = Query.Types[EltTypeIdx];
1998 const LLT VecTy = Query.Types[VecTypeIdx];
1999 const unsigned DstEltSize = EltTy.getSizeInBits();
2000 const unsigned VecSize = VecTy.getSizeInBits();
2001
2002 const unsigned TargetEltSize =
2003 DstEltSize % 64 == 0 ? 64 : 32;
2004 return std::pair(VecTypeIdx,
2005 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
2006 ScalarSizeInBits: TargetEltSize));
2007 })
2008 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
2009 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
2010 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
2011 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
2012 // TODO: Clamp elements for 64-bit vectors?
2013 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
2014 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
2015 // It should only be necessary with variable indexes.
2016 // As a last resort, lower to the stack
2017 .lower();
2018 }
2019
2020 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
2021 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
2022 const LLT &EltTy = Query.Types[1].getElementType();
2023 return Query.Types[0] != EltTy;
2024 });
2025
2026 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
2027 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
2028 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
2029 getActionDefinitionsBuilder(Opcode: Op)
2030 .widenScalarIf(
2031 Predicate: [=](const LegalityQuery &Query) {
2032 const LLT BigTy = Query.Types[BigTyIdx];
2033 return (BigTy.getScalarSizeInBits() < 16);
2034 },
2035 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
2036 .widenScalarIf(
2037 Predicate: [=](const LegalityQuery &Query) {
2038 const LLT LitTy = Query.Types[LitTyIdx];
2039 return (LitTy.getScalarSizeInBits() < 16);
2040 },
2041 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
2042 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
2043 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32)
2044 .customIf(Predicate: [=](const LegalityQuery &Query) {
2045 // Generic lower operates on the full-width value, producing
2046 // shift+trunc/mask sequences. For simple cases where extract/insert
2047 // values are 32-bit aligned, we can instead unmerge/merge and work on
2048 // the 32-bit components. However, we can't check the offset here so
2049 // custom lower function will have to call generic lowering if offset
2050 // is not 32-bit aligned.
2051 const LLT BigTy = Query.Types[BigTyIdx];
2052 const LLT LitTy = Query.Types[LitTyIdx];
2053 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2054 LitTy.getSizeInBits() % 32 == 0;
2055 })
2056 .lower();
2057 }
2058
2059 auto &BuildVector =
2060 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
2061 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
2062 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
2063 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
2064 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
2065 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
2066 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
2067 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
2068
2069 if (ST.hasScalarPackInsts()) {
2070 BuildVector
2071 // FIXME: Should probably widen s1 vectors straight to s32
2072 .minScalarOrElt(TypeIdx: 0, Ty: S16)
2073 .minScalar(TypeIdx: 1, Ty: S16);
2074
2075 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
2076 .legalFor(Types: {V2S16, S32})
2077 .lower();
2078 } else {
2079 BuildVector.customFor(Types: {V2S16, S16});
2080 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
2081
2082 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
2083 .customFor(Types: {V2S16, S32})
2084 .lower();
2085 }
2086
2087 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
2088
2089 // FIXME: Clamp maximum size
2090 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
2091 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2092 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
2093 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
2094 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
2095
2096 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
2097
2098 // Merge/Unmerge
2099 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2100 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2101 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2102
2103 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2104 const LLT Ty = Query.Types[TypeIdx];
2105 if (Ty.isVector()) {
2106 const LLT &EltTy = Ty.getElementType();
2107 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2108 return true;
2109 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
2110 return true;
2111 }
2112 return false;
2113 };
2114
2115 auto &Builder =
2116 getActionDefinitionsBuilder(Opcode: Op)
2117 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
2118 .lowerFor(Types: {{S16, V2S16}})
2119 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
2120 const LLT BigTy = Query.Types[BigTyIdx];
2121 return BigTy.getSizeInBits() == 32;
2122 })
2123 // Try to widen to s16 first for small types.
2124 // TODO: Only do this on targets with legal s16 shifts
2125 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
2126 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
2127 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
2128 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
2129 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
2130 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
2131 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
2132 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2133 // not worth considering the multiples of 64 since 2*192 and 2*384
2134 // are not valid.
2135 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
2136 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
2137 // Break up vectors with weird elements into scalars
2138 .fewerElementsIf(
2139 Predicate: [=](const LegalityQuery &Query) {
2140 return notValidElt(Query, LitTyIdx);
2141 },
2142 Mutation: scalarize(TypeIdx: 0))
2143 .fewerElementsIf(
2144 Predicate: [=](const LegalityQuery &Query) {
2145 return notValidElt(Query, BigTyIdx);
2146 },
2147 Mutation: scalarize(TypeIdx: 1))
2148 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
2149
2150 if (Op == G_MERGE_VALUES) {
2151 Builder.widenScalarIf(
2152 // TODO: Use 16-bit shifts if legal for 8-bit values?
2153 Predicate: [=](const LegalityQuery &Query) {
2154 const LLT Ty = Query.Types[LitTyIdx];
2155 return Ty.getSizeInBits() < 32;
2156 },
2157 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
2158 }
2159
2160 Builder.widenScalarIf(
2161 Predicate: [=](const LegalityQuery &Query) {
2162 const LLT Ty = Query.Types[BigTyIdx];
2163 return Ty.getSizeInBits() % 16 != 0;
2164 },
2165 Mutation: [=](const LegalityQuery &Query) {
2166 // Pick the next power of 2, or a multiple of 64 over 128.
2167 // Whichever is smaller.
2168 const LLT &Ty = Query.Types[BigTyIdx];
2169 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2170 if (NewSizeInBits >= 256) {
2171 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2172 if (RoundedTo < NewSizeInBits)
2173 NewSizeInBits = RoundedTo;
2174 }
2175 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2176 })
2177 // Any vectors left are the wrong size. Scalarize them.
2178 .scalarize(TypeIdx: 0)
2179 .scalarize(TypeIdx: 1);
2180 }
2181
2182 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2183 // RegBankSelect.
2184 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2185 .legalFor(Types: {{S32}, {S64}})
2186 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2187
2188 if (ST.hasVOP3PInsts()) {
2189 SextInReg.lowerFor(Types: {{V2S16}})
2190 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2191 // get more vector shift opportunities, since we'll get those when
2192 // expanded.
2193 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2194 } else if (ST.has16BitInsts()) {
2195 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2196 } else {
2197 // Prefer to promote to s32 before lowering if we don't have 16-bit
2198 // shifts. This avoids a lot of intermediate truncate and extend operations.
2199 SextInReg.lowerFor(Types: {{S32}, {S64}});
2200 }
2201
2202 SextInReg
2203 .scalarize(TypeIdx: 0)
2204 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2205 .lower();
2206
2207 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2208 .scalarize(TypeIdx: 0)
2209 .lower();
2210
2211 auto &FSHRActionDefs = getActionDefinitionsBuilder(Opcode: G_FSHR);
2212 FSHRActionDefs.legalFor(Types: {{S32, S32}})
2213 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2214 if (ST.hasVOP3PInsts())
2215 FSHRActionDefs.lowerFor(Types: {{V2S16, V2S16}});
2216 FSHRActionDefs.scalarize(TypeIdx: 0).lower();
2217
2218 if (ST.hasVOP3PInsts()) {
2219 getActionDefinitionsBuilder(Opcode: G_FSHL)
2220 .lowerFor(Types: {{V2S16, V2S16}})
2221 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2222 .scalarize(TypeIdx: 0)
2223 .lower();
2224 } else {
2225 getActionDefinitionsBuilder(Opcode: G_FSHL)
2226 .scalarize(TypeIdx: 0)
2227 .lower();
2228 }
2229
2230 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2231 .legalFor(Types: {S64});
2232
2233 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2234
2235 getActionDefinitionsBuilder(Opcode: G_FENCE)
2236 .alwaysLegal();
2237
2238 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2239 .scalarize(TypeIdx: 0)
2240 .minScalar(TypeIdx: 0, Ty: S32)
2241 .lower();
2242
2243 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2244 .legalFor(Types: {{S32, S32}, {S64, S32}})
2245 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2246 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2247 .widenScalarToNextPow2(TypeIdx: 0)
2248 .scalarize(TypeIdx: 0);
2249
2250 getActionDefinitionsBuilder(
2251 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2252 G_FCOPYSIGN,
2253
2254 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2255 G_READ_REGISTER, G_WRITE_REGISTER,
2256
2257 G_SADDO, G_SSUBO})
2258 .lower();
2259
2260 if (ST.hasIEEEMinimumMaximumInsts()) {
2261 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2262 .legalFor(Types: FPTypesPK16)
2263 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2264 .scalarize(TypeIdx: 0);
2265 } else if (ST.hasVOP3PInsts()) {
2266 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2267 .lowerFor(Types: {V2S16})
2268 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2269 .scalarize(TypeIdx: 0)
2270 .lower();
2271 } else {
2272 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2273 .scalarize(TypeIdx: 0)
2274 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2275 .lower();
2276 }
2277
2278 getActionDefinitionsBuilder(
2279 Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET, G_MEMSET_INLINE})
2280 .lower();
2281
2282 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2283
2284 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2285 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2286 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2287 .unsupported();
2288
2289 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2290
2291 getActionDefinitionsBuilder(
2292 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2293 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2294 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2295 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2296 .legalFor(Types: AllVectors)
2297 .scalarize(TypeIdx: 1)
2298 .lower();
2299
2300 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2301 G_INTRINSIC_CONVERGENT,
2302 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2303 .alwaysLegal();
2304
2305 getLegacyLegalizerInfo().computeTables();
2306 verify(MII: *ST.getInstrInfo());
2307}
2308
2309bool AMDGPULegalizerInfo::legalizeCustom(
2310 LegalizerHelper &Helper, MachineInstr &MI,
2311 LostDebugLocObserver &LocObserver) const {
2312 MachineIRBuilder &B = Helper.MIRBuilder;
2313 MachineRegisterInfo &MRI = *B.getMRI();
2314
2315 switch (MI.getOpcode()) {
2316 case TargetOpcode::G_ADDRSPACE_CAST:
2317 return legalizeAddrSpaceCast(MI, MRI, B);
2318 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2319 return legalizeFroundeven(MI, MRI, B);
2320 case TargetOpcode::G_FCEIL:
2321 return legalizeFceil(MI, MRI, B);
2322 case TargetOpcode::G_FREM:
2323 return legalizeFrem(MI, MRI, B);
2324 case TargetOpcode::G_INTRINSIC_TRUNC:
2325 return legalizeIntrinsicTrunc(MI, MRI, B);
2326 case TargetOpcode::G_SITOFP:
2327 return legalizeITOFP(MI, MRI, B, Signed: true);
2328 case TargetOpcode::G_UITOFP:
2329 return legalizeITOFP(MI, MRI, B, Signed: false);
2330 case TargetOpcode::G_FPTOSI:
2331 return legalizeFPTOI(MI, MRI, B, Signed: true);
2332 case TargetOpcode::G_FPTOUI:
2333 return legalizeFPTOI(MI, MRI, B, Signed: false);
2334 case TargetOpcode::G_FMINNUM:
2335 case TargetOpcode::G_FMAXNUM:
2336 case TargetOpcode::G_FMINIMUMNUM:
2337 case TargetOpcode::G_FMAXIMUMNUM:
2338 return legalizeMinNumMaxNum(Helper, MI);
2339 case TargetOpcode::G_EXTRACT:
2340 return legalizeExtract(Helper, MI);
2341 case TargetOpcode::G_INSERT:
2342 return legalizeInsert(Helper, MI);
2343 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2344 return legalizeExtractVectorElt(MI, MRI, B);
2345 case TargetOpcode::G_INSERT_VECTOR_ELT:
2346 return legalizeInsertVectorElt(MI, MRI, B);
2347 case TargetOpcode::G_FSIN:
2348 case TargetOpcode::G_FCOS:
2349 return legalizeSinCos(MI, MRI, B);
2350 case TargetOpcode::G_GLOBAL_VALUE:
2351 return legalizeGlobalValue(MI, MRI, B);
2352 case TargetOpcode::G_LOAD:
2353 case TargetOpcode::G_SEXTLOAD:
2354 case TargetOpcode::G_ZEXTLOAD:
2355 return legalizeLoad(Helper, MI);
2356 case TargetOpcode::G_STORE:
2357 return legalizeStore(Helper, MI);
2358 case TargetOpcode::G_FMAD:
2359 return legalizeFMad(MI, MRI, B);
2360 case TargetOpcode::G_FDIV:
2361 return legalizeFDIV(MI, MRI, B);
2362 case TargetOpcode::G_FFREXP:
2363 return legalizeFFREXP(MI, MRI, B);
2364 case TargetOpcode::G_FSQRT:
2365 return legalizeFSQRT(MI, MRI, B);
2366 case TargetOpcode::G_UDIV:
2367 case TargetOpcode::G_UREM:
2368 case TargetOpcode::G_UDIVREM:
2369 return legalizeUnsignedDIV_REM(MI, MRI, B);
2370 case TargetOpcode::G_SDIV:
2371 case TargetOpcode::G_SREM:
2372 case TargetOpcode::G_SDIVREM:
2373 return legalizeSignedDIV_REM(MI, MRI, B);
2374 case TargetOpcode::G_ATOMIC_CMPXCHG:
2375 return legalizeAtomicCmpXChg(MI, MRI, B);
2376 case TargetOpcode::G_FLOG2:
2377 return legalizeFlog2(MI, B);
2378 case TargetOpcode::G_FLOG:
2379 case TargetOpcode::G_FLOG10:
2380 return legalizeFlogCommon(MI, B);
2381 case TargetOpcode::G_FEXP2:
2382 return legalizeFExp2(MI, B);
2383 case TargetOpcode::G_FEXP:
2384 case TargetOpcode::G_FEXP10:
2385 return legalizeFExp(MI, B);
2386 case TargetOpcode::G_FPOW:
2387 return legalizeFPow(MI, B);
2388 case TargetOpcode::G_FFLOOR:
2389 return legalizeFFloor(MI, MRI, B);
2390 case TargetOpcode::G_BUILD_VECTOR:
2391 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2392 return legalizeBuildVector(MI, MRI, B);
2393 case TargetOpcode::G_MUL:
2394 return legalizeMul(Helper, MI);
2395 case TargetOpcode::G_CTLZ:
2396 case TargetOpcode::G_CTTZ:
2397 return legalizeCTLZ_CTTZ(MI, MRI, B);
2398 case TargetOpcode::G_CTLS:
2399 return legalizeCTLS(MI, MRI, B);
2400 case TargetOpcode::G_CTLZ_ZERO_POISON:
2401 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2402 case TargetOpcode::G_STACKSAVE:
2403 return legalizeStackSave(MI, B);
2404 case TargetOpcode::G_GET_FPENV:
2405 return legalizeGetFPEnv(MI, MRI, B);
2406 case TargetOpcode::G_SET_FPENV:
2407 return legalizeSetFPEnv(MI, MRI, B);
2408 case TargetOpcode::G_TRAP:
2409 return legalizeTrap(MI, MRI, B);
2410 case TargetOpcode::G_DEBUGTRAP:
2411 return legalizeDebugTrap(MI, MRI, B);
2412 default:
2413 return false;
2414 }
2415
2416 llvm_unreachable("expected switch to return");
2417}
2418
2419Register AMDGPULegalizerInfo::getSegmentAperture(
2420 unsigned AS,
2421 MachineRegisterInfo &MRI,
2422 MachineIRBuilder &B) const {
2423 MachineFunction &MF = B.getMF();
2424 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2425 const LLT S32 = LLT::scalar(SizeInBits: 32);
2426 const LLT S64 = LLT::scalar(SizeInBits: 64);
2427
2428 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2429
2430 if (ST.hasApertureRegs()) {
2431 // Note: this register is somewhat broken. When used as a 32-bit operand,
2432 // it only returns zeroes. The real value is in the upper 32 bits.
2433 // Thus, we must emit extract the high 32 bits.
2434 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2435 ? AMDGPU::SRC_SHARED_BASE
2436 : AMDGPU::SRC_PRIVATE_BASE;
2437 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2438 !ST.hasGloballyAddressableScratch()) &&
2439 "Cannot use src_private_base with globally addressable scratch!");
2440 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2441 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2442 B.buildCopy(Res: {Dst}, Op: {Register(ApertureRegNo)});
2443 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2444 }
2445
2446 Register LoadAddr = MRI.createGenericVirtualRegister(
2447 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2448 // For code object version 5, private_base and shared_base are passed through
2449 // implicit kernargs.
2450 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2451 AMDGPU::AMDHSA_COV5) {
2452 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
2453
2454 AMDGPUTargetLowering::ImplicitParameter Param =
2455 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2456 : AMDGPUTargetLowering::PRIVATE_BASE;
2457 uint64_t Offset =
2458 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2459
2460 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2461 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2462
2463 if (!loadInputValue(DstReg: KernargPtrReg, B,
2464 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2465 return Register();
2466
2467 MachineMemOperand *MMO = MF.getMachineMemOperand(
2468 PtrInfo: PtrInfo.getWithOffset(O: Offset),
2469 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2470 MachineMemOperand::MOInvariant,
2471 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2472
2473 // Pointer address
2474 B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
2475 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2476 // Load address
2477 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2478 }
2479
2480 Register QueuePtr = MRI.createGenericVirtualRegister(
2481 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2482
2483 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2484 return Register();
2485
2486 // TODO: Use custom PseudoSourceValue
2487 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2488
2489 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2490 // private_segment_aperture_base_hi.
2491 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2492
2493 MachineMemOperand *MMO = MF.getMachineMemOperand(
2494 PtrInfo,
2495 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2496 MachineMemOperand::MOInvariant,
2497 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2498
2499 B.buildObjectPtrOffset(
2500 Res: LoadAddr, Op0: QueuePtr,
2501 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2502 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2503}
2504
2505/// Return true if the value is a known valid address, such that a null check is
2506/// not necessary.
2507static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2508 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2509 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2510 switch (Def->getOpcode()) {
2511 case AMDGPU::G_FRAME_INDEX:
2512 case AMDGPU::G_GLOBAL_VALUE:
2513 case AMDGPU::G_BLOCK_ADDR:
2514 return true;
2515 case AMDGPU::G_CONSTANT: {
2516 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2517 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AS: AddrSpace);
2518 }
2519 default:
2520 return false;
2521 }
2522
2523 return false;
2524}
2525
2526bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2527 MachineInstr &MI, MachineRegisterInfo &MRI,
2528 MachineIRBuilder &B) const {
2529 MachineFunction &MF = B.getMF();
2530
2531 // MI can either be a G_ADDRSPACE_CAST or a
2532 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2533 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2534 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2535 Intrinsic::amdgcn_addrspacecast_nonnull));
2536
2537 const LLT S32 = LLT::scalar(SizeInBits: 32);
2538 Register Dst = MI.getOperand(i: 0).getReg();
2539 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2540 : MI.getOperand(i: 1).getReg();
2541 LLT DstTy = MRI.getType(Reg: Dst);
2542 LLT SrcTy = MRI.getType(Reg: Src);
2543 unsigned DestAS = DstTy.getAddressSpace();
2544 unsigned SrcAS = SrcTy.getAddressSpace();
2545
2546 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2547 // vector element.
2548 assert(!DstTy.isVector());
2549
2550 const AMDGPUTargetMachine &TM
2551 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2552
2553 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2554 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2555 return true;
2556 }
2557
2558 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2559 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2560 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2561 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2562 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2563 ST.hasGloballyAddressableScratch()) {
2564 // flat -> private with globally addressable scratch: subtract
2565 // src_flat_scratch_base_lo.
2566 const LLT S32 = LLT::scalar(SizeInBits: 32);
2567 Register SrcLo = B.buildExtract(Res: S32, Src, Index: 0).getReg(Idx: 0);
2568 Register FlatScratchBaseLo =
2569 B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
2570 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2571 .getReg(Idx: 0);
2572 MRI.setRegClass(Reg: FlatScratchBaseLo, RC: &AMDGPU::SReg_32RegClass);
2573 Register Sub = B.buildSub(Dst: S32, Src0: SrcLo, Src1: FlatScratchBaseLo).getReg(Idx: 0);
2574 return B.buildIntToPtr(Dst, Src: Sub).getReg(Idx: 0);
2575 }
2576
2577 // Extract low 32-bits of the pointer.
2578 return B.buildExtract(Res: Dst, Src, Index: 0).getReg(Idx: 0);
2579 };
2580
2581 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2582 // G_ADDRSPACE_CAST we need to guess.
2583 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2584 castFlatToLocalOrPrivate(Dst);
2585 MI.eraseFromParent();
2586 return true;
2587 }
2588
2589 unsigned NullVal = AMDGPU::getNullPointerValue(AS: DestAS);
2590
2591 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2592 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2593
2594 // Extract low 32-bits of the pointer.
2595 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2596
2597 auto CmpRes =
2598 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2599 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2600
2601 MI.eraseFromParent();
2602 return true;
2603 }
2604
2605 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2606 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2607 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2608 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2609 // Coerce the type of the low half of the result so we can use
2610 // merge_values.
2611 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2612
2613 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2614 ST.hasGloballyAddressableScratch()) {
2615 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2616 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2617 Register AllOnes = B.buildConstant(Res: S32, Val: -1).getReg(Idx: 0);
2618 Register ThreadID = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
2619 ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Res: {S32})
2620 .addUse(RegNo: AllOnes)
2621 .addUse(RegNo: ThreadID)
2622 .getReg(Idx: 0);
2623 if (ST.isWave64()) {
2624 ThreadID = B.buildIntrinsic(ID: Intrinsic::amdgcn_mbcnt_hi, Res: {S32})
2625 .addUse(RegNo: AllOnes)
2626 .addUse(RegNo: ThreadID)
2627 .getReg(Idx: 0);
2628 }
2629 Register ShAmt =
2630 B.buildConstant(Res: S32, Val: 57 - 32 - ST.getWavefrontSizeLog2()).getReg(Idx: 0);
2631 Register SrcHi = B.buildShl(Dst: S32, Src0: ThreadID, Src1: ShAmt).getReg(Idx: 0);
2632 Register CvtPtr =
2633 B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, SrcHi}).getReg(Idx: 0);
2634 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2635 // 64-bit hi:lo value.
2636 Register FlatScratchBase =
2637 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {S64},
2638 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2639 .getReg(Idx: 0);
2640 MRI.setRegClass(Reg: FlatScratchBase, RC: &AMDGPU::SReg_64RegClass);
2641 return B.buildPtrAdd(Res: Dst, Op0: CvtPtr, Op1: FlatScratchBase).getReg(Idx: 0);
2642 }
2643
2644 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2645 if (!ApertureReg.isValid())
2646 return false;
2647
2648 // TODO: Should we allow mismatched types but matching sizes in merges to
2649 // avoid the ptrtoint?
2650 return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
2651 };
2652
2653 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2654 // G_ADDRSPACE_CAST we need to guess.
2655 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2656 castLocalOrPrivateToFlat(Dst);
2657 MI.eraseFromParent();
2658 return true;
2659 }
2660
2661 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2662
2663 auto SegmentNull =
2664 B.buildConstant(Res: SrcTy, Val: AMDGPU::getNullPointerValue(AS: SrcAS));
2665 auto FlatNull = B.buildConstant(Res: DstTy, Val: AMDGPU::getNullPointerValue(AS: DestAS));
2666
2667 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2668 Op1: SegmentNull.getReg(Idx: 0));
2669
2670 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2671
2672 MI.eraseFromParent();
2673 return true;
2674 }
2675
2676 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2677 SrcTy.getSizeInBits() == 64) {
2678 // Truncate.
2679 B.buildExtract(Res: Dst, Src, Index: 0);
2680 MI.eraseFromParent();
2681 return true;
2682 }
2683
2684 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2685 DstTy.getSizeInBits() == 64) {
2686 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2687 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2688 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2689 if (AddrHiVal == 0) {
2690 auto Zext = B.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: PtrLo);
2691 B.buildIntToPtr(Dst, Src: Zext);
2692 } else {
2693 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2694 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2695 }
2696
2697 MI.eraseFromParent();
2698 return true;
2699 }
2700
2701 // Invalid casts are poison.
2702 // TODO: Should return poison
2703 B.buildUndef(Res: Dst);
2704 MI.eraseFromParent();
2705 return true;
2706}
2707
2708bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2709 MachineRegisterInfo &MRI,
2710 MachineIRBuilder &B) const {
2711 Register Src = MI.getOperand(i: 1).getReg();
2712 LLT Ty = MRI.getType(Reg: Src);
2713 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2714
2715 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2716 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2717
2718 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2719 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2720
2721 // TODO: Should this propagate fast-math-flags?
2722 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2723 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2724
2725 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2726 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2727
2728 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2729 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2730 MI.eraseFromParent();
2731 return true;
2732}
2733
2734bool AMDGPULegalizerInfo::legalizeFceil(
2735 MachineInstr &MI, MachineRegisterInfo &MRI,
2736 MachineIRBuilder &B) const {
2737
2738 const LLT S1 = LLT::scalar(SizeInBits: 1);
2739 const LLT S64 = LLT::scalar(SizeInBits: 64);
2740
2741 Register Src = MI.getOperand(i: 1).getReg();
2742 assert(MRI.getType(Src) == S64);
2743
2744 // result = trunc(src)
2745 // if (src > 0.0 && src != result)
2746 // result += 1.0
2747
2748 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2749
2750 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2751 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2752 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2753 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2754 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2755 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2756
2757 // TODO: Should this propagate fast-math-flags?
2758 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2759 MI.eraseFromParent();
2760 return true;
2761}
2762
2763bool AMDGPULegalizerInfo::legalizeFrem(
2764 MachineInstr &MI, MachineRegisterInfo &MRI,
2765 MachineIRBuilder &B) const {
2766 Register DstReg = MI.getOperand(i: 0).getReg();
2767 Register Src0Reg = MI.getOperand(i: 1).getReg();
2768 Register Src1Reg = MI.getOperand(i: 2).getReg();
2769 auto Flags = MI.getFlags();
2770 LLT Ty = MRI.getType(Reg: DstReg);
2771
2772 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2773 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2774 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2775 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2776 MI.eraseFromParent();
2777 return true;
2778}
2779
2780static MachineInstrBuilder extractF64Exponent(Register Hi,
2781 MachineIRBuilder &B) {
2782 const unsigned FractBits = 52;
2783 const unsigned ExpBits = 11;
2784 LLT S32 = LLT::scalar(SizeInBits: 32);
2785
2786 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2787 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2788
2789 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2790 .addUse(RegNo: Hi)
2791 .addUse(RegNo: Const0.getReg(Idx: 0))
2792 .addUse(RegNo: Const1.getReg(Idx: 0));
2793
2794 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2795}
2796
2797bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2798 MachineInstr &MI, MachineRegisterInfo &MRI,
2799 MachineIRBuilder &B) const {
2800 const LLT S1 = LLT::scalar(SizeInBits: 1);
2801 const LLT S32 = LLT::scalar(SizeInBits: 32);
2802 const LLT S64 = LLT::scalar(SizeInBits: 64);
2803
2804 Register Src = MI.getOperand(i: 1).getReg();
2805 assert(MRI.getType(Src) == S64);
2806
2807 // TODO: Should this use extract since the low half is unused?
2808 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2809 Register Hi = Unmerge.getReg(Idx: 1);
2810
2811 // Extract the upper half, since this is where we will find the sign and
2812 // exponent.
2813 auto Exp = extractF64Exponent(Hi, B);
2814
2815 const unsigned FractBits = 52;
2816
2817 // Extract the sign bit.
2818 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2819 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2820
2821 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2822
2823 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2824
2825 // Extend back to 64-bits.
2826 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2827
2828 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2829 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2830 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2831 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2832
2833 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2834 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2835
2836 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2837 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2838 MI.eraseFromParent();
2839 return true;
2840}
2841
2842bool AMDGPULegalizerInfo::legalizeITOFP(
2843 MachineInstr &MI, MachineRegisterInfo &MRI,
2844 MachineIRBuilder &B, bool Signed) const {
2845
2846 Register Dst = MI.getOperand(i: 0).getReg();
2847 Register Src = MI.getOperand(i: 1).getReg();
2848
2849 const LLT S64 = LLT::scalar(SizeInBits: 64);
2850 const LLT S32 = LLT::scalar(SizeInBits: 32);
2851
2852 assert(MRI.getType(Src) == S64);
2853
2854 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2855 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2856
2857 if (MRI.getType(Reg: Dst) == S64) {
2858 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2859 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2860
2861 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2862 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2863
2864 // TODO: Should this propagate fast-math-flags?
2865 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2866 MI.eraseFromParent();
2867 return true;
2868 }
2869
2870 assert(MRI.getType(Dst) == S32);
2871
2872 auto One = B.buildConstant(Res: S32, Val: 1);
2873
2874 MachineInstrBuilder ShAmt;
2875 if (Signed) {
2876 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2877 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2878 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2879 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2880 auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
2881 .addUse(RegNo: Unmerge.getReg(Idx: 1));
2882 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2883 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2884 } else
2885 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2886 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2887 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2888 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2889 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2890 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2891 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2892 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2893 MI.eraseFromParent();
2894 return true;
2895}
2896
2897// TODO: Copied from DAG implementation. Verify logic and document how this
2898// actually works.
2899bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2900 MachineRegisterInfo &MRI,
2901 MachineIRBuilder &B,
2902 bool Signed) const {
2903
2904 Register Dst = MI.getOperand(i: 0).getReg();
2905 Register Src = MI.getOperand(i: 1).getReg();
2906
2907 const LLT S64 = LLT::scalar(SizeInBits: 64);
2908 const LLT S32 = LLT::scalar(SizeInBits: 32);
2909
2910 const LLT SrcLT = MRI.getType(Reg: Src);
2911 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2912
2913 unsigned Flags = MI.getFlags();
2914
2915 // The basic idea of converting a floating point number into a pair of 32-bit
2916 // integers is illustrated as follows:
2917 //
2918 // tf := trunc(val);
2919 // hif := floor(tf * 2^-32);
2920 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2921 // hi := fptoi(hif);
2922 // lo := fptoi(lof);
2923 //
2924 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2925 MachineInstrBuilder Sign;
2926 if (Signed && SrcLT == S32) {
2927 // However, a 32-bit floating point number has only 23 bits mantissa and
2928 // it's not enough to hold all the significant bits of `lof` if val is
2929 // negative. To avoid the loss of precision, We need to take the absolute
2930 // value after truncating and flip the result back based on the original
2931 // signedness.
2932 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2933 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2934 }
2935 MachineInstrBuilder K0, K1;
2936 if (SrcLT == S64) {
2937 K0 = B.buildFConstant(
2938 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2939 K1 = B.buildFConstant(
2940 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2941 } else {
2942 K0 = B.buildFConstant(
2943 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2944 K1 = B.buildFConstant(
2945 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2946 }
2947
2948 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2949 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2950 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2951
2952 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2953 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2954 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2955
2956 if (Signed && SrcLT == S32) {
2957 // Flip the result based on the signedness, which is either all 0s or 1s.
2958 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2959 // r := xor({lo, hi}, sign) - sign;
2960 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2961 Src1: Sign);
2962 } else
2963 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2964 MI.eraseFromParent();
2965
2966 return true;
2967}
2968
2969bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2970 MachineInstr &MI) const {
2971 MachineFunction &MF = Helper.MIRBuilder.getMF();
2972 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2973
2974 // With ieee_mode disabled, the instructions have the correct behavior.
2975 if (!MFI->getMode().IEEE)
2976 return true;
2977
2978 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2979}
2980
2981bool AMDGPULegalizerInfo::legalizeExtract(LegalizerHelper &Helper,
2982 MachineInstr &MI) const {
2983 MachineIRBuilder &B = Helper.MIRBuilder;
2984 MachineRegisterInfo &MRI = *B.getMRI();
2985 Register DstReg = MI.getOperand(i: 0).getReg();
2986 Register SrcReg = MI.getOperand(i: 1).getReg();
2987 uint64_t Offset = MI.getOperand(i: 2).getImm();
2988
2989 // Fall back to generic lowering for offset 0 (trivial trunc) and
2990 // non-32-bit-aligned cases which require shift+trunc sequences
2991 // that generic code handles correctly.
2992 if (Offset == 0 || Offset % 32 != 0)
2993 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2994
2995 const LLT DstTy = MRI.getType(Reg: DstReg);
2996 unsigned StartIdx = Offset / 32;
2997 unsigned DstCount = DstTy.getSizeInBits() / 32;
2998 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: SrcReg);
2999
3000 if (DstCount == 1) {
3001 if (DstTy.isPointer())
3002 B.buildIntToPtr(Dst: DstReg, Src: Unmerge.getReg(Idx: StartIdx));
3003 else
3004 MRI.replaceRegWith(FromReg: DstReg, ToReg: Unmerge.getReg(Idx: StartIdx));
3005 } else {
3006 SmallVector<Register, 8> MergeVec;
3007 for (unsigned I = 0; I < DstCount; ++I)
3008 MergeVec.push_back(Elt: Unmerge.getReg(Idx: StartIdx + I));
3009 B.buildMergeLikeInstr(Res: DstReg, Ops: MergeVec);
3010 }
3011
3012 MI.eraseFromParent();
3013 return true;
3014}
3015
3016bool AMDGPULegalizerInfo::legalizeInsert(LegalizerHelper &Helper,
3017 MachineInstr &MI) const {
3018 MachineIRBuilder &B = Helper.MIRBuilder;
3019 MachineRegisterInfo &MRI = *B.getMRI();
3020 Register DstReg = MI.getOperand(i: 0).getReg();
3021 Register SrcReg = MI.getOperand(i: 1).getReg();
3022 Register InsertSrc = MI.getOperand(i: 2).getReg();
3023 uint64_t Offset = MI.getOperand(i: 3).getImm();
3024
3025 unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
3026 const LLT InsertTy = MRI.getType(Reg: InsertSrc);
3027 unsigned InsertSize = InsertTy.getSizeInBits();
3028
3029 // Fall back to generic lowering for non-32-bit-aligned cases which
3030 // require shift+mask sequences that generic code handles correctly.
3031 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3032 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
3033
3034 const LLT S32 = LLT::scalar(SizeInBits: 32);
3035 unsigned DstCount = DstSize / 32;
3036 unsigned InsertCount = InsertSize / 32;
3037 unsigned StartIdx = Offset / 32;
3038
3039 auto SrcUnmerge = B.buildUnmerge(Res: S32, Op: SrcReg);
3040
3041 SmallVector<Register, 8> MergeVec;
3042 for (unsigned I = 0; I < StartIdx; ++I)
3043 MergeVec.push_back(Elt: SrcUnmerge.getReg(Idx: I));
3044
3045 if (InsertCount == 1) {
3046 // Merge-like instructions require same source types. Convert pointer
3047 // to scalar when inserting a pointer value into a scalar.
3048 if (InsertTy.isPointer())
3049 InsertSrc = B.buildPtrToInt(Dst: S32, Src: InsertSrc).getReg(Idx: 0);
3050 MergeVec.push_back(Elt: InsertSrc);
3051 } else {
3052 auto InsertUnmerge = B.buildUnmerge(Res: S32, Op: InsertSrc);
3053 for (unsigned I = 0; I < InsertCount; ++I)
3054 MergeVec.push_back(Elt: InsertUnmerge.getReg(Idx: I));
3055 }
3056
3057 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3058 MergeVec.push_back(Elt: SrcUnmerge.getReg(Idx: I));
3059
3060 B.buildMergeLikeInstr(Res: DstReg, Ops: MergeVec);
3061
3062 MI.eraseFromParent();
3063 return true;
3064}
3065
3066bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
3067 MachineInstr &MI, MachineRegisterInfo &MRI,
3068 MachineIRBuilder &B) const {
3069 // TODO: Should move some of this into LegalizerHelper.
3070
3071 // TODO: Promote dynamic indexing of s16 to s32
3072
3073 Register Dst = MI.getOperand(i: 0).getReg();
3074 Register Vec = MI.getOperand(i: 1).getReg();
3075
3076 LLT VecTy = MRI.getType(Reg: Vec);
3077 LLT EltTy = VecTy.getElementType();
3078 assert(EltTy == MRI.getType(Dst));
3079
3080 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3081 // but we can't go directly to that logic becasue you can't bitcast a vector
3082 // of pointers to a vector of integers. Therefore, introduce an intermediate
3083 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3084 // drive the legalization forward.
3085 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3086 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
3087 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
3088
3089 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
3090 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
3091 B.buildIntToPtr(Dst, Src: IntElt);
3092
3093 MI.eraseFromParent();
3094 return true;
3095 }
3096
3097 // FIXME: Artifact combiner probably should have replaced the truncated
3098 // constant before this, so we shouldn't need
3099 // getIConstantVRegValWithLookThrough.
3100 std::optional<ValueAndVReg> MaybeIdxVal =
3101 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
3102 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3103 return true;
3104 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3105
3106 if (IdxVal < VecTy.getNumElements()) {
3107 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
3108 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
3109 } else {
3110 B.buildUndef(Res: Dst);
3111 }
3112
3113 MI.eraseFromParent();
3114 return true;
3115}
3116
3117bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
3118 MachineInstr &MI, MachineRegisterInfo &MRI,
3119 MachineIRBuilder &B) const {
3120 // TODO: Should move some of this into LegalizerHelper.
3121
3122 // TODO: Promote dynamic indexing of s16 to s32
3123
3124 Register Dst = MI.getOperand(i: 0).getReg();
3125 Register Vec = MI.getOperand(i: 1).getReg();
3126 Register Ins = MI.getOperand(i: 2).getReg();
3127
3128 LLT VecTy = MRI.getType(Reg: Vec);
3129 LLT EltTy = VecTy.getElementType();
3130 assert(EltTy == MRI.getType(Ins));
3131
3132 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3133 // but we can't go directly to that logic becasue you can't bitcast a vector
3134 // of pointers to a vector of integers. Therefore, make the pointer vector
3135 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3136 // new value, and then inttoptr the result vector back. This will then allow
3137 // the rest of legalization to take over.
3138 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3139 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
3140 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
3141
3142 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
3143 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
3144 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
3145 Idx: MI.getOperand(i: 3));
3146 B.buildIntToPtr(Dst, Src: IntVecDest);
3147 MI.eraseFromParent();
3148 return true;
3149 }
3150
3151 // FIXME: Artifact combiner probably should have replaced the truncated
3152 // constant before this, so we shouldn't need
3153 // getIConstantVRegValWithLookThrough.
3154 std::optional<ValueAndVReg> MaybeIdxVal =
3155 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
3156 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3157 return true;
3158
3159 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3160
3161 unsigned NumElts = VecTy.getNumElements();
3162 if (IdxVal < NumElts) {
3163 SmallVector<Register, 8> SrcRegs;
3164 for (unsigned i = 0; i < NumElts; ++i)
3165 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
3166 B.buildUnmerge(Res: SrcRegs, Op: Vec);
3167
3168 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
3169 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3170 } else {
3171 B.buildUndef(Res: Dst);
3172 }
3173
3174 MI.eraseFromParent();
3175 return true;
3176}
3177
3178bool AMDGPULegalizerInfo::legalizeSinCos(
3179 MachineInstr &MI, MachineRegisterInfo &MRI,
3180 MachineIRBuilder &B) const {
3181
3182 Register DstReg = MI.getOperand(i: 0).getReg();
3183 Register SrcReg = MI.getOperand(i: 1).getReg();
3184 LLT Ty = MRI.getType(Reg: DstReg);
3185 unsigned Flags = MI.getFlags();
3186
3187 Register TrigVal;
3188 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
3189 if (ST.hasTrigReducedRange()) {
3190 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
3191 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
3192 .addUse(RegNo: MulVal.getReg(Idx: 0))
3193 .setMIFlags(Flags)
3194 .getReg(Idx: 0);
3195 } else
3196 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
3197
3198 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3199 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3200 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
3201 .addUse(RegNo: TrigVal)
3202 .setMIFlags(Flags);
3203 MI.eraseFromParent();
3204 return true;
3205}
3206
3207bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
3208 MachineIRBuilder &B,
3209 const GlobalValue *GV,
3210 int64_t Offset,
3211 unsigned GAFlags) const {
3212 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3213 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3214 // to the following code sequence:
3215 //
3216 // For constant address space:
3217 // s_getpc_b64 s[0:1]
3218 // s_add_u32 s0, s0, $symbol
3219 // s_addc_u32 s1, s1, 0
3220 //
3221 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3222 // a fixup or relocation is emitted to replace $symbol with a literal
3223 // constant, which is a pc-relative offset from the encoding of the $symbol
3224 // operand to the global variable.
3225 //
3226 // For global address space:
3227 // s_getpc_b64 s[0:1]
3228 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3229 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3230 //
3231 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3232 // fixups or relocations are emitted to replace $symbol@*@lo and
3233 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3234 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3235 // operand to the global variable.
3236
3237 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3238
3239 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3240 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
3241
3242 if (ST.has64BitLiterals()) {
3243 assert(GAFlags != SIInstrInfo::MO_NONE);
3244
3245 MachineInstrBuilder MIB =
3246 B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(RegNo: PCReg);
3247 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 2);
3248 } else {
3249 MachineInstrBuilder MIB =
3250 B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(RegNo: PCReg);
3251
3252 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
3253 if (GAFlags == SIInstrInfo::MO_NONE)
3254 MIB.addImm(Val: 0);
3255 else
3256 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
3257 }
3258
3259 if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
3260 B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);
3261
3262 if (PtrTy.getSizeInBits() == 32)
3263 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
3264 return true;
3265}
3266
3267// Emit a ABS32_LO / ABS32_HI relocation stub.
3268void AMDGPULegalizerInfo::buildAbsGlobalAddress(
3269 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3270 MachineRegisterInfo &MRI) const {
3271 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3272
3273 if (RequiresHighHalf && ST.has64BitLiterals()) {
3274 if (!MRI.getRegClassOrNull(Reg: DstReg))
3275 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_64RegClass);
3276 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
3277 .addDef(RegNo: DstReg)
3278 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS64);
3279 return;
3280 }
3281
3282 LLT S32 = LLT::scalar(SizeInBits: 32);
3283
3284 // Use the destination directly, if and only if we store the lower address
3285 // part only and we don't have a register class being set.
3286 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
3287 ? DstReg
3288 : MRI.createGenericVirtualRegister(Ty: S32);
3289
3290 if (!MRI.getRegClassOrNull(Reg: AddrLo))
3291 MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);
3292
3293 // Write the lower half.
3294 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
3295 .addDef(RegNo: AddrLo)
3296 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
3297
3298 // If required, write the upper half as well.
3299 if (RequiresHighHalf) {
3300 assert(PtrTy.getSizeInBits() == 64 &&
3301 "Must provide a 64-bit pointer type!");
3302
3303 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
3304 MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);
3305
3306 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
3307 .addDef(RegNo: AddrHi)
3308 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);
3309
3310 // Use the destination directly, if and only if we don't have a register
3311 // class being set.
3312 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
3313 ? DstReg
3314 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
3315
3316 if (!MRI.getRegClassOrNull(Reg: AddrDst))
3317 MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);
3318
3319 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
3320
3321 // If we created a new register for the destination, cast the result into
3322 // the final output.
3323 if (AddrDst != DstReg)
3324 B.buildCast(Dst: DstReg, Src: AddrDst);
3325 } else if (AddrLo != DstReg) {
3326 // If we created a new register for the destination, cast the result into
3327 // the final output.
3328 B.buildCast(Dst: DstReg, Src: AddrLo);
3329 }
3330}
3331
3332bool AMDGPULegalizerInfo::legalizeGlobalValue(
3333 MachineInstr &MI, MachineRegisterInfo &MRI,
3334 MachineIRBuilder &B) const {
3335 Register DstReg = MI.getOperand(i: 0).getReg();
3336 LLT Ty = MRI.getType(Reg: DstReg);
3337 unsigned AS = Ty.getAddressSpace();
3338
3339 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
3340 MachineFunction &MF = B.getMF();
3341 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3342
3343 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
3344 if (!MFI->isModuleEntryFunction() &&
3345 GV->getName() != "llvm.amdgcn.module.lds" &&
3346 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
3347 const Function &Fn = MF.getFunction();
3348 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
3349 Fn, "local memory global used by non-kernel function",
3350 MI.getDebugLoc(), DS_Warning));
3351
3352 // We currently don't have a way to correctly allocate LDS objects that
3353 // aren't directly associated with a kernel. We do force inlining of
3354 // functions that use local objects. However, if these dead functions are
3355 // not eliminated, we don't want a compile time error. Just emit a warning
3356 // and a trap, since there should be no callable path here.
3357 B.buildTrap();
3358 B.buildUndef(Res: DstReg);
3359 MI.eraseFromParent();
3360 return true;
3361 }
3362
3363 // TODO: We could emit code to handle the initialization somewhere.
3364 // We ignore the initializer for now and legalize it to allow selection.
3365 // The initializer will anyway get errored out during assembly emission.
3366 const SITargetLowering *TLI = ST.getTargetLowering();
3367 if (!TLI->shouldUseLDSConstAddress(GV)) {
3368 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3369 return true; // Leave in place;
3370 }
3371
3372 const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
3373 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3374 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3375 // zero-sized type in other languages to declare the dynamic shared
3376 // memory which size is not known at the compile time. They will be
3377 // allocated by the runtime and placed directly after the static
3378 // allocated ones. They all share the same offset.
3379 if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
3380 // Adjust alignment for that dynamic shared memory array.
3381 MFI->setDynLDSAlign(F: MF.getFunction(), GV: GVar);
3382 LLT S32 = LLT::scalar(SizeInBits: 32);
3383 auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
3384 B.buildIntToPtr(Dst: DstReg, Src: Sz);
3385 MI.eraseFromParent();
3386 return true;
3387 }
3388 }
3389
3390 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(), GV: GVar));
3391 MI.eraseFromParent();
3392 return true;
3393 }
3394
3395 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3396 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
3397 MI.eraseFromParent();
3398 return true;
3399 }
3400
3401 const SITargetLowering *TLI = ST.getTargetLowering();
3402
3403 if (TLI->shouldEmitFixup(GV)) {
3404 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
3405 MI.eraseFromParent();
3406 return true;
3407 }
3408
3409 if (TLI->shouldEmitPCReloc(GV)) {
3410 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
3411 MI.eraseFromParent();
3412 return true;
3413 }
3414
3415 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3416 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
3417
3418 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3419 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3420 PtrInfo: MachinePointerInfo::getGOT(MF),
3421 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3422 MachineMemOperand::MOInvariant,
3423 MemTy: LoadTy, base_alignment: Align(8));
3424
3425 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3426
3427 if (Ty.getSizeInBits() == 32) {
3428 // Truncate if this is a 32-bit constant address.
3429 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3430 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3431 } else
3432 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3433
3434 MI.eraseFromParent();
3435 return true;
3436}
3437
3438static LLT widenToNextPowerOf2(LLT Ty) {
3439 if (Ty.isVector())
3440 return Ty.changeElementCount(
3441 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3442 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3443}
3444
3445bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3446 MachineInstr &MI) const {
3447 MachineIRBuilder &B = Helper.MIRBuilder;
3448 MachineRegisterInfo &MRI = *B.getMRI();
3449 GISelChangeObserver &Observer = Helper.Observer;
3450
3451 Register PtrReg = MI.getOperand(i: 1).getReg();
3452 LLT PtrTy = MRI.getType(Reg: PtrReg);
3453 unsigned AddrSpace = PtrTy.getAddressSpace();
3454
3455 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3456 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3457 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3458 Observer.changingInstr(MI);
3459 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3460 Observer.changedInstr(MI);
3461 return true;
3462 }
3463
3464 if (MI.getOpcode() != AMDGPU::G_LOAD)
3465 return false;
3466
3467 Register ValReg = MI.getOperand(i: 0).getReg();
3468 LLT ValTy = MRI.getType(Reg: ValReg);
3469
3470 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3471 Observer.changingInstr(MI);
3472 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3473 Observer.changedInstr(MI);
3474 return true;
3475 }
3476
3477 MachineMemOperand *MMO = *MI.memoperands_begin();
3478 const unsigned ValSize = ValTy.getSizeInBits();
3479 const LLT MemTy = MMO->getMemoryType();
3480 const Align MemAlign = MMO->getAlign();
3481 const unsigned MemSize = MemTy.getSizeInBits();
3482 const uint64_t AlignInBits = 8 * MemAlign.value();
3483
3484 // Widen non-power-of-2 loads to the alignment if needed
3485 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3486 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3487
3488 // This was already the correct extending load result type, so just adjust
3489 // the memory type.
3490 if (WideMemSize == ValSize) {
3491 MachineFunction &MF = B.getMF();
3492
3493 MachineMemOperand *WideMMO =
3494 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3495 Observer.changingInstr(MI);
3496 MI.setMemRefs(MF, MemRefs: {WideMMO});
3497 Observer.changedInstr(MI);
3498 return true;
3499 }
3500
3501 // Don't bother handling edge case that should probably never be produced.
3502 if (ValSize > WideMemSize)
3503 return false;
3504
3505 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3506
3507 Register WideLoad;
3508 if (!WideTy.isVector()) {
3509 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3510 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3511 } else {
3512 // Extract the subvector.
3513
3514 if (isRegisterType(ST, Ty: ValTy)) {
3515 // If this a case where G_EXTRACT is legal, use it.
3516 // (e.g. <3 x s32> -> <4 x s32>)
3517 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3518 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3519 } else {
3520 // For cases where the widened type isn't a nice register value, unmerge
3521 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3522 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3523 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3524 }
3525 }
3526
3527 MI.eraseFromParent();
3528 return true;
3529 }
3530
3531 return false;
3532}
3533
3534bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3535 MachineInstr &MI) const {
3536 MachineIRBuilder &B = Helper.MIRBuilder;
3537 MachineRegisterInfo &MRI = *B.getMRI();
3538 GISelChangeObserver &Observer = Helper.Observer;
3539
3540 Register DataReg = MI.getOperand(i: 0).getReg();
3541 LLT DataTy = MRI.getType(Reg: DataReg);
3542
3543 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3544 Observer.changingInstr(MI);
3545 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3546 Observer.changedInstr(MI);
3547 return true;
3548 }
3549 return false;
3550}
3551
3552bool AMDGPULegalizerInfo::legalizeFMad(
3553 MachineInstr &MI, MachineRegisterInfo &MRI,
3554 MachineIRBuilder &B) const {
3555 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3556 assert(Ty.isScalar());
3557
3558 MachineFunction &MF = B.getMF();
3559 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3560
3561 // TODO: Always legal with future ftz flag.
3562 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3563 // FIXME: Do we need just output?
3564 if (Ty == LLT::scalar(SizeInBits: 32) &&
3565 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3566 return true;
3567 if (Ty == LLT::scalar(SizeInBits: 16) &&
3568 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3569 return true;
3570
3571 MachineIRBuilder HelperBuilder(MI);
3572 GISelObserverWrapper DummyObserver;
3573 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3574 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3575}
3576
3577bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3578 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3579 Register DstReg = MI.getOperand(i: 0).getReg();
3580 Register PtrReg = MI.getOperand(i: 1).getReg();
3581 Register CmpVal = MI.getOperand(i: 2).getReg();
3582 Register NewVal = MI.getOperand(i: 3).getReg();
3583
3584 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3585 "this should not have been custom lowered");
3586
3587 LLT ValTy = MRI.getType(Reg: CmpVal);
3588 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3589
3590 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3591
3592 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3593 .addDef(RegNo: DstReg)
3594 .addUse(RegNo: PtrReg)
3595 .addUse(RegNo: PackedVal)
3596 .setMemRefs(MI.memoperands());
3597
3598 MI.eraseFromParent();
3599 return true;
3600}
3601
3602/// Return true if it's known that \p Src can never be an f32 denormal value.
3603static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3604 Register Src) {
3605 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3606 switch (DefMI->getOpcode()) {
3607 case TargetOpcode::G_INTRINSIC: {
3608 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3609 case Intrinsic::amdgcn_frexp_mant:
3610 case Intrinsic::amdgcn_log:
3611 case Intrinsic::amdgcn_log_clamp:
3612 case Intrinsic::amdgcn_exp2:
3613 case Intrinsic::amdgcn_sqrt:
3614 return true;
3615 default:
3616 break;
3617 }
3618
3619 break;
3620 }
3621 case TargetOpcode::G_FSQRT:
3622 return true;
3623 case TargetOpcode::G_FFREXP: {
3624 if (DefMI->getOperand(i: 0).getReg() == Src)
3625 return true;
3626 break;
3627 }
3628 case TargetOpcode::G_FPEXT: {
3629 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3630 }
3631 default:
3632 return false;
3633 }
3634
3635 return false;
3636}
3637
3638static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3639 return Flags & MachineInstr::FmAfn;
3640}
3641
3642static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3643 unsigned Flags) {
3644 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3645 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3646 DenormalMode::PreserveSign;
3647}
3648
3649std::pair<Register, Register>
3650AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3651 unsigned Flags) const {
3652 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3653 return {};
3654
3655 const LLT F32 = LLT::scalar(SizeInBits: 32);
3656 auto SmallestNormal = B.buildFConstant(
3657 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3658 auto IsLtSmallestNormal =
3659 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3660
3661 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3662 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3663 auto ScaleFactor =
3664 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3665 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3666
3667 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3668}
3669
3670bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3671 MachineIRBuilder &B) const {
3672 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3673 // If we have to handle denormals, scale up the input and adjust the result.
3674
3675 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3676 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3677
3678 Register Dst = MI.getOperand(i: 0).getReg();
3679 Register Src = MI.getOperand(i: 1).getReg();
3680 LLT Ty = B.getMRI()->getType(Reg: Dst);
3681 unsigned Flags = MI.getFlags();
3682
3683 if (Ty == LLT::scalar(SizeInBits: 16)) {
3684 const LLT F32 = LLT::scalar(SizeInBits: 32);
3685 // Nothing in half is a denormal when promoted to f32.
3686 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3687 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
3688 .addUse(RegNo: Ext.getReg(Idx: 0))
3689 .setMIFlags(Flags);
3690 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3691 MI.eraseFromParent();
3692 return true;
3693 }
3694
3695 assert(Ty == LLT::scalar(32));
3696
3697 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3698 if (!ScaledInput) {
3699 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
3700 .addUse(RegNo: Src)
3701 .setMIFlags(Flags);
3702 MI.eraseFromParent();
3703 return true;
3704 }
3705
3706 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3707 .addUse(RegNo: ScaledInput)
3708 .setMIFlags(Flags);
3709
3710 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3711 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3712 auto ResultOffset =
3713 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3714 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3715
3716 MI.eraseFromParent();
3717 return true;
3718}
3719
3720static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3721 Register Z, unsigned Flags) {
3722 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3723 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3724}
3725
3726bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3727 MachineIRBuilder &B) const {
3728 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3729 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3730
3731 MachineRegisterInfo &MRI = *B.getMRI();
3732 Register Dst = MI.getOperand(i: 0).getReg();
3733 Register X = MI.getOperand(i: 1).getReg();
3734 unsigned Flags = MI.getFlags();
3735 const LLT Ty = MRI.getType(Reg: X);
3736
3737 const LLT F32 = LLT::scalar(SizeInBits: 32);
3738 const LLT F16 = LLT::scalar(SizeInBits: 16);
3739
3740 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn)) {
3741 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3742 // depending on !fpmath metadata.
3743 bool PromoteToF32 =
3744 Ty == F16 && (!MI.getFlag(Flag: MachineInstr::FmAfn) || !ST.has16BitInsts());
3745 if (PromoteToF32) {
3746 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3747 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3748 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3749 B.buildFPTrunc(Res: Dst, Op: LogVal);
3750 } else {
3751 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3752 }
3753
3754 MI.eraseFromParent();
3755 return true;
3756 }
3757
3758 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3759 if (ScaledInput)
3760 X = ScaledInput;
3761
3762 auto Y =
3763 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);
3764
3765 Register R;
3766 if (ST.hasFastFMAF32()) {
3767 // c+cc are ln(2)/ln(10) to more than 49 bits
3768 const float c_log10 = 0x1.344134p-2f;
3769 const float cc_log10 = 0x1.09f79ep-26f;
3770
3771 // c + cc is ln(2) to more than 49 bits
3772 const float c_log = 0x1.62e42ep-1f;
3773 const float cc_log = 0x1.efa39ep-25f;
3774
3775 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3776 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3777 // This adds correction terms for which contraction may lead to an increase
3778 // in the error of the approximation, so disable it.
3779 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3780 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags: NewFlags).getReg(Idx: 0);
3781 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags: NewFlags);
3782 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags: NewFlags);
3783 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags: NewFlags);
3784 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags: NewFlags).getReg(Idx: 0);
3785 } else {
3786 // ch+ct is ln(2)/ln(10) to more than 36 bits
3787 const float ch_log10 = 0x1.344000p-2f;
3788 const float ct_log10 = 0x1.3509f6p-18f;
3789
3790 // ch + ct is ln(2) to more than 36 bits
3791 const float ch_log = 0x1.62e000p-1f;
3792 const float ct_log = 0x1.0bfbe8p-15f;
3793
3794 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3795 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3796
3797 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3798 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3799 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3800 // This adds correction terms for which contraction may lead to an increase
3801 // in the error of the approximation, so disable it.
3802 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3803 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags: NewFlags);
3804
3805 Register Mad0 =
3806 getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags: NewFlags);
3807 Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags: NewFlags);
3808 R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags: NewFlags);
3809 }
3810
3811 const bool IsFiniteOnly =
3812 MI.getFlag(Flag: MachineInstr::FmNoNans) && MI.getFlag(Flag: MachineInstr::FmNoInfs);
3813
3814 if (!IsFiniteOnly) {
3815 // Expand isfinite(x) => fabs(x) < inf
3816 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3817 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3818 auto IsFinite =
3819 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3820 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
3821 }
3822
3823 if (ScaledInput) {
3824 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3825 auto ShiftK =
3826 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3827 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3828 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3829 } else {
3830 B.buildCopy(Res: Dst, Op: R);
3831 }
3832
3833 MI.eraseFromParent();
3834 return true;
3835}
3836
3837bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3838 Register Src, bool IsLog10,
3839 unsigned Flags) const {
3840 const double Log2BaseInverted =
3841 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3842
3843 LLT Ty = B.getMRI()->getType(Reg: Dst);
3844
3845 if (Ty == LLT::scalar(SizeInBits: 32)) {
3846 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3847 if (ScaledInput) {
3848 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3849 .addUse(RegNo: Src)
3850 .setMIFlags(Flags);
3851 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3852 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3853 auto ResultOffset =
3854 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3855 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3856
3857 if (ST.hasFastFMAF32())
3858 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3859 else {
3860 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3861 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3862 }
3863
3864 return true;
3865 }
3866 }
3867
3868 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3869 ? B.buildFLog2(Dst: Ty, Src, Flags)
3870 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3871 .addUse(RegNo: Src)
3872 .setMIFlags(Flags);
3873 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3874 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3875 return true;
3876}
3877
3878bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3879 MachineIRBuilder &B) const {
3880 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3881 // If we have to handle denormals, scale up the input and adjust the result.
3882
3883 Register Dst = MI.getOperand(i: 0).getReg();
3884 Register Src = MI.getOperand(i: 1).getReg();
3885 unsigned Flags = MI.getFlags();
3886 LLT Ty = B.getMRI()->getType(Reg: Dst);
3887 const LLT F16 = LLT::scalar(SizeInBits: 16);
3888 const LLT F32 = LLT::scalar(SizeInBits: 32);
3889 const LLT F64 = LLT::scalar(SizeInBits: 64);
3890
3891 if (Ty == F64)
3892 return legalizeFEXPF64(MI, B);
3893
3894 if (Ty == F16) {
3895 // Nothing in half is a denormal when promoted to f32.
3896 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3897 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
3898 .addUse(RegNo: Ext.getReg(Idx: 0))
3899 .setMIFlags(Flags);
3900 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3901 MI.eraseFromParent();
3902 return true;
3903 }
3904
3905 assert(Ty == F32);
3906
3907 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3908 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3909 .addUse(RegNo: Src)
3910 .setMIFlags(Flags);
3911 MI.eraseFromParent();
3912 return true;
3913 }
3914
3915 // bool needs_scaling = x < -0x1.f80000p+6f;
3916 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3917
3918 // -nextafter(128.0, -1)
3919 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3920 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3921 Op1: RangeCheckConst, Flags);
3922
3923 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3924 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3925 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3926 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3927
3928 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3929 .addUse(RegNo: AddInput.getReg(Idx: 0))
3930 .setMIFlags(Flags);
3931
3932 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3933 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3934 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3935 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3936 MI.eraseFromParent();
3937 return true;
3938}
3939
3940static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3941 const SrcOp &Src, unsigned Flags) {
3942 LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
3943
3944 if (Ty == LLT::scalar(SizeInBits: 32)) {
3945 return B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Dst})
3946 .addUse(RegNo: Src.getReg())
3947 .setMIFlags(Flags);
3948 }
3949 return B.buildFExp2(Dst, Src, Flags);
3950}
3951
3952bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3953 Register Dst, Register X,
3954 unsigned Flags,
3955 bool IsExp10) const {
3956 LLT Ty = B.getMRI()->getType(Reg: X);
3957
3958 // exp(x) -> exp2(M_LOG2E_F * x);
3959 // exp10(x) -> exp2(log2(10) * x);
3960 auto Const = B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3961 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Const, Flags);
3962 buildExp(B, Dst, Src: Mul, Flags);
3963 return true;
3964}
3965
3966bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3967 Register X, unsigned Flags) const {
3968 LLT Ty = B.getMRI()->getType(Reg: Dst);
3969 LLT F32 = LLT::scalar(SizeInBits: 32);
3970
3971 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3972 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3973 }
3974
3975 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3976 auto NeedsScaling =
3977 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3978 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3979 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3980 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3981
3982 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3983 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3984
3985 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3986 .addUse(RegNo: ExpInput.getReg(Idx: 0))
3987 .setMIFlags(Flags);
3988
3989 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3990 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3991 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3992 return true;
3993}
3994
3995bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
3996 Register Dst, Register X,
3997 unsigned Flags) const {
3998 LLT Ty = B.getMRI()->getType(Reg: Dst);
3999 LLT F32 = LLT::scalar(SizeInBits: 32);
4000
4001 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
4002 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
4003 auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
4004 auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);
4005
4006 auto Mul1 = B.buildFMul(Dst: Ty, Src0: X, Src1: K1, Flags);
4007 auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
4008 auto Mul0 = B.buildFMul(Dst: Ty, Src0: X, Src1: K0, Flags);
4009 auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
4010 B.buildFMul(Dst, Src0: Exp2_0, Src1: Exp2_1, Flags);
4011 return true;
4012 }
4013
4014 // bool s = x < -0x1.2f7030p+5f;
4015 // x += s ? 0x1.0p+5f : 0.0f;
4016 // exp10 = exp2(x * 0x1.a92000p+1f) *
4017 // exp2(x * 0x1.4f0978p-11f) *
4018 // (s ? 0x1.9f623ep-107f : 1.0f);
4019
4020 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.2f7030p+5f);
4021 auto NeedsScaling =
4022 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold);
4023
4024 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+5f);
4025 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
4026 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X);
4027
4028 auto K0 = B.buildFConstant(Res: Ty, Val: 0x1.a92000p+1f);
4029 auto K1 = B.buildFConstant(Res: Ty, Val: 0x1.4f0978p-11f);
4030
4031 auto Mul1 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K1, Flags);
4032 auto Exp2_1 = buildExp(B, Dst: Ty, Src: Mul1, Flags);
4033 auto Mul0 = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: K0, Flags);
4034 auto Exp2_0 = buildExp(B, Dst: Ty, Src: Mul0, Flags);
4035
4036 auto MulExps = B.buildFMul(Dst: Ty, Src0: Exp2_0, Src1: Exp2_1, Flags);
4037 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.9f623ep-107f);
4038 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: MulExps, Src1: ResultScaleFactor, Flags);
4039
4040 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: MulExps);
4041 return true;
4042}
4043
4044// This expansion gives a result slightly better than 1ulp.
4045bool AMDGPULegalizerInfo::legalizeFEXPF64(MachineInstr &MI,
4046 MachineIRBuilder &B) const {
4047
4048 Register X = MI.getOperand(i: 1).getReg();
4049 LLT S64 = LLT::scalar(SizeInBits: 64);
4050 LLT S32 = LLT::scalar(SizeInBits: 32);
4051 LLT S1 = LLT::scalar(SizeInBits: 1);
4052
4053 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4054 // exp10, which slightly increases ulp.
4055 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4056
4057 Register Dn, F, T;
4058
4059 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4060 // Dn = rint(X)
4061 Dn = B.buildFRint(Dst: S64, Src0: X, Flags).getReg(Idx: 0);
4062 // F = X - Dn
4063 F = B.buildFSub(Dst: S64, Src0: X, Src1: Dn, Flags).getReg(Idx: 0);
4064 // T = F*C1 + F*C2
4065 auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.62e42fefa39efp-1));
4066 auto C2 = B.buildFConstant(Res: S64, Val: APFloat(0x1.abc9e3b39803fp-56));
4067 auto Mul2 = B.buildFMul(Dst: S64, Src0: F, Src1: C2, Flags).getReg(Idx: 0);
4068 T = B.buildFMA(Dst: S64, Src0: F, Src1: C1, Src2: Mul2, Flags).getReg(Idx: 0);
4069
4070 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
4071 auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.a934f0979a371p+1));
4072 auto Mul = B.buildFMul(Dst: S64, Src0: X, Src1: C1, Flags).getReg(Idx: 0);
4073 Dn = B.buildFRint(Dst: S64, Src0: Mul, Flags).getReg(Idx: 0);
4074
4075 auto NegDn = B.buildFNeg(Dst: S64, Src0: Dn, Flags).getReg(Idx: 0);
4076 auto C2 = B.buildFConstant(Res: S64, Val: APFloat(-0x1.9dc1da994fd21p-59));
4077 auto C3 = B.buildFConstant(Res: S64, Val: APFloat(0x1.34413509f79ffp-2));
4078 auto Inner = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C3, Src2: X, Flags).getReg(Idx: 0);
4079 F = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C2, Src2: Inner, Flags).getReg(Idx: 0);
4080
4081 auto C4 = B.buildFConstant(Res: S64, Val: APFloat(0x1.26bb1bbb55516p+1));
4082 auto C5 = B.buildFConstant(Res: S64, Val: APFloat(-0x1.f48ad494ea3e9p-53));
4083 auto MulF = B.buildFMul(Dst: S64, Src0: F, Src1: C5, Flags).getReg(Idx: 0);
4084 T = B.buildFMA(Dst: S64, Src0: F, Src1: C4, Src2: MulF, Flags).getReg(Idx: 0);
4085
4086 } else { // G_FEXP
4087 auto C1 = B.buildFConstant(Res: S64, Val: APFloat(0x1.71547652b82fep+0));
4088 auto Mul = B.buildFMul(Dst: S64, Src0: X, Src1: C1, Flags).getReg(Idx: 0);
4089 Dn = B.buildFRint(Dst: S64, Src0: Mul, Flags).getReg(Idx: 0);
4090
4091 auto NegDn = B.buildFNeg(Dst: S64, Src0: Dn, Flags).getReg(Idx: 0);
4092 auto C2 = B.buildFConstant(Res: S64, Val: APFloat(0x1.abc9e3b39803fp-56));
4093 auto C3 = B.buildFConstant(Res: S64, Val: APFloat(0x1.62e42fefa39efp-1));
4094 auto Inner = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C3, Src2: X, Flags).getReg(Idx: 0);
4095 T = B.buildFMA(Dst: S64, Src0: NegDn, Src1: C2, Src2: Inner, Flags).getReg(Idx: 0);
4096 }
4097
4098 // Polynomial chain for P
4099 auto P = B.buildFConstant(Res: S64, Val: 0x1.ade156a5dcb37p-26);
4100 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.28af3fca7ab0cp-22),
4101 Flags);
4102 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.71dee623fde64p-19),
4103 Flags);
4104 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.a01997c89e6b0p-16),
4105 Flags);
4106 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.a01a014761f6ep-13),
4107 Flags);
4108 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.6c16c1852b7b0p-10),
4109 Flags);
4110 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.1111111122322p-7), Flags);
4111 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.55555555502a1p-5), Flags);
4112 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.5555555555511p-3), Flags);
4113 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: B.buildFConstant(Res: S64, Val: 0x1.000000000000bp-1), Flags);
4114
4115 auto One = B.buildFConstant(Res: S64, Val: 1.0);
4116 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: One, Flags);
4117 P = B.buildFMA(Dst: S64, Src0: T, Src1: P, Src2: One, Flags);
4118
4119 // Z = FLDEXP(P, (int)Dn)
4120 auto DnInt = B.buildFPTOSI(Dst: S32, Src0: Dn);
4121 auto Z = B.buildFLdexp(Dst: S64, Src0: P, Src1: DnInt, Flags);
4122
4123 if (!(Flags & MachineInstr::FmNoInfs)) {
4124 // Overflow guard: if X <= 1024.0 then Z else +inf
4125 auto CondHi = B.buildFCmp(Pred: CmpInst::FCMP_ULE, Res: S1, Op0: X,
4126 Op1: B.buildFConstant(Res: S64, Val: APFloat(1024.0)));
4127 auto PInf = B.buildFConstant(Res: S64, Val: APFloat::getInf(Sem: APFloat::IEEEdouble()));
4128 Z = B.buildSelect(Res: S64, Tst: CondHi, Op0: Z, Op1: PInf, Flags);
4129 }
4130
4131 // Underflow guard: if X >= -1075.0 then Z else 0.0
4132 auto CondLo = B.buildFCmp(Pred: CmpInst::FCMP_UGE, Res: S1, Op0: X,
4133 Op1: B.buildFConstant(Res: S64, Val: APFloat(-1075.0)));
4134 auto Zero = B.buildFConstant(Res: S64, Val: APFloat(0.0));
4135 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: CondLo, Op0: Z, Op1: Zero, Flags);
4136
4137 MI.eraseFromParent();
4138 return true;
4139}
4140
4141bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
4142 MachineIRBuilder &B) const {
4143 Register Dst = MI.getOperand(i: 0).getReg();
4144 Register X = MI.getOperand(i: 1).getReg();
4145 const unsigned Flags = MI.getFlags();
4146 MachineFunction &MF = B.getMF();
4147 MachineRegisterInfo &MRI = *B.getMRI();
4148 LLT Ty = MRI.getType(Reg: Dst);
4149
4150 const LLT F64 = LLT::scalar(SizeInBits: 64);
4151
4152 if (Ty == F64)
4153 return legalizeFEXPF64(MI, B);
4154
4155 const LLT F16 = LLT::scalar(SizeInBits: 16);
4156 const LLT F32 = LLT::scalar(SizeInBits: 32);
4157 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4158
4159 if (Ty == F16) {
4160 // v_exp_f16 (fmul x, log2e)
4161 if (allowApproxFunc(MF, Flags)) {
4162 // TODO: Does this really require fast?
4163 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4164 : legalizeFExpUnsafe(B, Dst, X, Flags);
4165 MI.eraseFromParent();
4166 return true;
4167 }
4168
4169 // Nothing in half is a denormal when promoted to f32.
4170 //
4171 // exp(f16 x) ->
4172 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4173 //
4174 // exp10(f16 x) ->
4175 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4176 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
4177 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
4178 legalizeFExpUnsafeImpl(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags, IsExp10);
4179 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
4180 MI.eraseFromParent();
4181 return true;
4182 }
4183
4184 assert(Ty == F32);
4185
4186 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4187 // library behavior. Also, is known-not-daz source sufficient?
4188 if (allowApproxFunc(MF, Flags)) {
4189 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4190 : legalizeFExpUnsafe(B, Dst, X, Flags);
4191 MI.eraseFromParent();
4192 return true;
4193 }
4194
4195 // Algorithm:
4196 //
4197 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4198 //
4199 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4200 // n = 64*m + j, 0 <= j < 64
4201 //
4202 // e^x = 2^((64*m + j + f)/64)
4203 // = (2^m) * (2^(j/64)) * 2^(f/64)
4204 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4205 //
4206 // f = x*(64/ln(2)) - n
4207 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4208 //
4209 // e^x = (2^m) * (2^(j/64)) * e^r
4210 //
4211 // (2^(j/64)) is precomputed
4212 //
4213 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4214 // e^r = 1 + q
4215 //
4216 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4217 //
4218 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4219 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4220 Register PH, PL;
4221
4222 if (ST.hasFastFMAF32()) {
4223 const float c_exp = numbers::log2ef;
4224 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4225 const float c_exp10 = 0x1.a934f0p+1f;
4226 const float cc_exp10 = 0x1.2f346ep-24f;
4227
4228 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
4229 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
4230 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
4231 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
4232
4233 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
4234 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
4235 } else {
4236 const float ch_exp = 0x1.714000p+0f;
4237 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4238
4239 const float ch_exp10 = 0x1.a92000p+1f;
4240 const float cl_exp10 = 0x1.4f0978p-11f;
4241
4242 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
4243 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
4244 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
4245
4246 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
4247 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
4248
4249 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
4250 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
4251
4252 Register Mad0 =
4253 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
4254 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
4255 }
4256
4257 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
4258
4259 // It is unsafe to contract this fsub into the PH multiply.
4260 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
4261 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
4262 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
4263
4264 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
4265 .addUse(RegNo: A.getReg(Idx: 0))
4266 .setMIFlags(Flags);
4267 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
4268
4269 auto UnderflowCheckConst =
4270 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4271 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
4272 auto Underflow =
4273 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
4274
4275 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
4276
4277 if (!(Flags & MachineInstr::FmNoInfs)) {
4278 auto OverflowCheckConst =
4279 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4280
4281 auto Overflow =
4282 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
4283 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
4284 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
4285 }
4286
4287 B.buildCopy(Res: Dst, Op: R);
4288 MI.eraseFromParent();
4289 return true;
4290}
4291
4292bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
4293 MachineIRBuilder &B) const {
4294 Register Dst = MI.getOperand(i: 0).getReg();
4295 Register Src0 = MI.getOperand(i: 1).getReg();
4296 Register Src1 = MI.getOperand(i: 2).getReg();
4297 unsigned Flags = MI.getFlags();
4298 LLT Ty = B.getMRI()->getType(Reg: Dst);
4299 const LLT F16 = LLT::scalar(SizeInBits: 16); // TODO: Expected LLT::float16()
4300 const LLT F32 = LLT::scalar(SizeInBits: 32); // TODO: Expected LLT::float32()
4301
4302 if (Ty == F32) {
4303 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
4304 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4305 .addUse(RegNo: Log.getReg(Idx: 0))
4306 .addUse(RegNo: Src1)
4307 .setMIFlags(Flags);
4308 B.buildFExp2(Dst, Src: Mul, Flags);
4309 } else if (Ty == F16) {
4310 // There's no f16 fmul_legacy, so we need to convert for it.
4311 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
4312 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
4313 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
4314 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
4315 .addUse(RegNo: Ext0.getReg(Idx: 0))
4316 .addUse(RegNo: Ext1.getReg(Idx: 0))
4317 .setMIFlags(Flags);
4318 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
4319 } else
4320 return false;
4321
4322 MI.eraseFromParent();
4323 return true;
4324}
4325
4326// Find a source register, ignoring any possible source modifiers.
4327static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4328 Register ModSrc = OrigSrc;
4329 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
4330 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
4331 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4332 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4333 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
4334 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
4335 return ModSrc;
4336}
4337
4338bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4339 MachineRegisterInfo &MRI,
4340 MachineIRBuilder &B) const {
4341
4342 const LLT S1 = LLT::scalar(SizeInBits: 1);
4343 const LLT F64 = LLT::scalar(SizeInBits: 64); // TODO: Expected float64
4344 Register Dst = MI.getOperand(i: 0).getReg();
4345 Register OrigSrc = MI.getOperand(i: 1).getReg();
4346 unsigned Flags = MI.getFlags();
4347 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4348 "this should not have been custom lowered");
4349
4350 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4351 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4352 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4353 // V_FRACT bug is:
4354 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4355 //
4356 // Convert floor(x) to (x - fract(x))
4357
4358 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
4359 .addUse(RegNo: OrigSrc)
4360 .setMIFlags(Flags);
4361
4362 // Give source modifier matching some assistance before obscuring a foldable
4363 // pattern.
4364
4365 // TODO: We can avoid the neg on the fract? The input sign to fract
4366 // shouldn't matter?
4367 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4368
4369 auto Const =
4370 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
4371
4372 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
4373
4374 // We don't need to concern ourselves with the snan handling difference, so
4375 // use the one which will directly select.
4376 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4377 if (MFI->getMode().IEEE)
4378 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
4379 else
4380 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
4381
4382 Register CorrectedFract = Min;
4383 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
4384 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
4385 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
4386 }
4387
4388 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
4389 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
4390
4391 MI.eraseFromParent();
4392 return true;
4393}
4394
4395// Turn an illegal packed v2s16 build vector into bit operations.
4396// TODO: This should probably be a bitcast action in LegalizerHelper.
4397bool AMDGPULegalizerInfo::legalizeBuildVector(
4398 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4399 Register Dst = MI.getOperand(i: 0).getReg();
4400 const LLT S32 = LLT::scalar(SizeInBits: 32);
4401 const LLT S16 = LLT::scalar(SizeInBits: 16);
4402 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4403
4404 Register Src0 = MI.getOperand(i: 1).getReg();
4405 Register Src1 = MI.getOperand(i: 2).getReg();
4406
4407 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4408 assert(MRI.getType(Src0) == S32);
4409 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
4410 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
4411 }
4412
4413 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
4414 B.buildBitcast(Dst, Src: Merge);
4415
4416 MI.eraseFromParent();
4417 return true;
4418}
4419
4420// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4421//
4422// Source and accumulation registers must all be 32-bits.
4423//
4424// TODO: When the multiply is uniform, we should produce a code sequence
4425// that is better suited to instruction selection on the SALU. Instead of
4426// the outer loop going over parts of the result, the outer loop should go
4427// over parts of one of the factors. This should result in instruction
4428// selection that makes full use of S_ADDC_U32 instructions.
4429void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4430 MutableArrayRef<Register> Accum,
4431 ArrayRef<Register> Src0,
4432 ArrayRef<Register> Src1,
4433 bool UsePartialMad64_32,
4434 bool SeparateOddAlignedProducts) const {
4435 // Use (possibly empty) vectors of S1 registers to represent the set of
4436 // carries from one pair of positions to the next.
4437 using Carry = SmallVector<Register, 2>;
4438
4439 MachineIRBuilder &B = Helper.MIRBuilder;
4440 GISelValueTracking &VT = *Helper.getValueTracking();
4441
4442 const LLT S1 = LLT::scalar(SizeInBits: 1);
4443 const LLT S32 = LLT::scalar(SizeInBits: 32);
4444 const LLT S64 = LLT::scalar(SizeInBits: 64);
4445
4446 Register Zero32;
4447 Register Zero64;
4448
4449 auto getZero32 = [&]() -> Register {
4450 if (!Zero32)
4451 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
4452 return Zero32;
4453 };
4454 auto getZero64 = [&]() -> Register {
4455 if (!Zero64)
4456 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
4457 return Zero64;
4458 };
4459
4460 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4461 for (unsigned i = 0; i < Src0.size(); ++i) {
4462 Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
4463 Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
4464 }
4465
4466 // Merge the given carries into the 32-bit LocalAccum, which is modified
4467 // in-place.
4468 //
4469 // Returns the carry-out, which is a single S1 register or null.
4470 auto mergeCarry =
4471 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4472 if (CarryIn.empty())
4473 return Register();
4474
4475 bool HaveCarryOut = true;
4476 Register CarryAccum;
4477 if (CarryIn.size() == 1) {
4478 if (!LocalAccum) {
4479 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
4480 return Register();
4481 }
4482
4483 CarryAccum = getZero32();
4484 } else {
4485 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
4486 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4487 CarryAccum =
4488 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
4489 .getReg(Idx: 0);
4490 }
4491
4492 if (!LocalAccum) {
4493 LocalAccum = getZero32();
4494 HaveCarryOut = false;
4495 }
4496 }
4497
4498 auto Add =
4499 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
4500 LocalAccum = Add.getReg(Idx: 0);
4501 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
4502 };
4503
4504 // Build a multiply-add chain to compute
4505 //
4506 // LocalAccum + (partial products at DstIndex)
4507 // + (opportunistic subset of CarryIn)
4508 //
4509 // LocalAccum is an array of one or two 32-bit registers that are updated
4510 // in-place. The incoming registers may be null.
4511 //
4512 // In some edge cases, carry-ins can be consumed "for free". In that case,
4513 // the consumed carry bits are removed from CarryIn in-place.
4514 auto buildMadChain =
4515 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4516 -> Carry {
4517 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4518 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4519
4520 Carry CarryOut;
4521 unsigned j0 = 0;
4522
4523 // Use plain 32-bit multiplication for the most significant part of the
4524 // result by default.
4525 if (LocalAccum.size() == 1 &&
4526 (!UsePartialMad64_32 || !CarryIn.empty())) {
4527 do {
4528 // Skip multiplication if one of the operands is 0
4529 unsigned j1 = DstIndex - j0;
4530 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4531 ++j0;
4532 continue;
4533 }
4534 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
4535 if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
4536 LocalAccum[0] = Mul.getReg(Idx: 0);
4537 } else {
4538 if (CarryIn.empty()) {
4539 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
4540 } else {
4541 LocalAccum[0] =
4542 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
4543 .getReg(Idx: 0);
4544 CarryIn.pop_back();
4545 }
4546 }
4547 ++j0;
4548 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4549 }
4550
4551 // Build full 64-bit multiplies.
4552 if (j0 <= DstIndex) {
4553 bool HaveSmallAccum = false;
4554 Register Tmp;
4555
4556 if (LocalAccum[0]) {
4557 if (LocalAccum.size() == 1) {
4558 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4559 HaveSmallAccum = true;
4560 } else if (LocalAccum[1]) {
4561 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
4562 HaveSmallAccum = false;
4563 } else {
4564 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4565 HaveSmallAccum = true;
4566 }
4567 } else {
4568 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4569 Tmp = getZero64();
4570 HaveSmallAccum = true;
4571 }
4572
4573 do {
4574 unsigned j1 = DstIndex - j0;
4575 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4576 ++j0;
4577 continue;
4578 }
4579 auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
4580 SrcOps: {Src0[j0], Src1[j1], Tmp});
4581 Tmp = Mad.getReg(Idx: 0);
4582 if (!HaveSmallAccum)
4583 CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
4584 HaveSmallAccum = false;
4585
4586 ++j0;
4587 } while (j0 <= DstIndex);
4588
4589 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
4590 LocalAccum[0] = Unmerge.getReg(Idx: 0);
4591 if (LocalAccum.size() > 1)
4592 LocalAccum[1] = Unmerge.getReg(Idx: 1);
4593 }
4594
4595 return CarryOut;
4596 };
4597
4598 // Outer multiply loop, iterating over destination parts from least
4599 // significant to most significant parts.
4600 //
4601 // The columns of the following diagram correspond to the destination parts
4602 // affected by one iteration of the outer loop (ignoring boundary
4603 // conditions).
4604 //
4605 // Dest index relative to 2 * i: 1 0 -1
4606 // ------
4607 // Carries from previous iteration: e o
4608 // Even-aligned partial product sum: E E .
4609 // Odd-aligned partial product sum: O O
4610 //
4611 // 'o' is OddCarry, 'e' is EvenCarry.
4612 // EE and OO are computed from partial products via buildMadChain and use
4613 // accumulation where possible and appropriate.
4614 //
4615 Register SeparateOddCarry;
4616 Carry EvenCarry;
4617 Carry OddCarry;
4618
4619 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4620 Carry OddCarryIn = std::move(OddCarry);
4621 Carry EvenCarryIn = std::move(EvenCarry);
4622 OddCarry.clear();
4623 EvenCarry.clear();
4624
4625 // Partial products at offset 2 * i.
4626 if (2 * i < Accum.size()) {
4627 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4628 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4629 }
4630
4631 // Partial products at offset 2 * i - 1.
4632 if (i > 0) {
4633 if (!SeparateOddAlignedProducts) {
4634 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4635 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4636 } else {
4637 bool IsHighest = 2 * i >= Accum.size();
4638 Register SeparateOddOut[2];
4639 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4640 .take_front(N: IsHighest ? 1 : 2);
4641 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4642
4643 MachineInstr *Lo;
4644
4645 if (i == 1) {
4646 if (!IsHighest)
4647 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4648 else
4649 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4650 } else {
4651 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4652 CarryIn: SeparateOddCarry);
4653 }
4654 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4655
4656 if (!IsHighest) {
4657 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4658 CarryIn: Lo->getOperand(i: 1).getReg());
4659 Accum[2 * i] = Hi.getReg(Idx: 0);
4660 SeparateOddCarry = Hi.getReg(Idx: 1);
4661 }
4662 }
4663 }
4664
4665 // Add in the carries from the previous iteration
4666 if (i > 0) {
4667 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4668 EvenCarryIn.push_back(Elt: CarryOut);
4669
4670 if (2 * i < Accum.size()) {
4671 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4672 OddCarry.push_back(Elt: CarryOut);
4673 }
4674 }
4675 }
4676}
4677
4678// Custom narrowing of wide multiplies using wide multiply-add instructions.
4679//
4680// TODO: If the multiply is followed by an addition, we should attempt to
4681// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4682bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4683 MachineInstr &MI) const {
4684 assert(ST.hasMad64_32());
4685 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4686
4687 MachineIRBuilder &B = Helper.MIRBuilder;
4688 MachineRegisterInfo &MRI = *B.getMRI();
4689
4690 Register DstReg = MI.getOperand(i: 0).getReg();
4691 Register Src0 = MI.getOperand(i: 1).getReg();
4692 Register Src1 = MI.getOperand(i: 2).getReg();
4693
4694 LLT Ty = MRI.getType(Reg: DstReg);
4695 assert(Ty.isScalar());
4696
4697 unsigned Size = Ty.getSizeInBits();
4698 if (ST.hasVMulU64Inst() && Size == 64)
4699 return true;
4700
4701 unsigned NumParts = Size / 32;
4702 assert((Size % 32) == 0);
4703 assert(NumParts >= 2);
4704
4705 // Whether to use MAD_64_32 for partial products whose high half is
4706 // discarded. This avoids some ADD instructions but risks false dependency
4707 // stalls on some subtargets in some cases.
4708 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4709
4710 // Whether to compute odd-aligned partial products separately. This is
4711 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4712 // in an even-aligned VGPR.
4713 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4714
4715 LLT S32 = LLT::scalar(SizeInBits: 32);
4716 SmallVector<Register, 2> Src0Parts, Src1Parts;
4717 for (unsigned i = 0; i < NumParts; ++i) {
4718 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4719 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4720 }
4721 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4722 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4723
4724 SmallVector<Register, 2> AccumRegs(NumParts);
4725 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4726 SeparateOddAlignedProducts);
4727
4728 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4729 MI.eraseFromParent();
4730 return true;
4731}
4732
4733// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4734// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4735// case with a single min instruction instead of a compare+select.
4736bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4737 MachineRegisterInfo &MRI,
4738 MachineIRBuilder &B) const {
4739 Register Dst = MI.getOperand(i: 0).getReg();
4740 Register Src = MI.getOperand(i: 1).getReg();
4741 LLT DstTy = MRI.getType(Reg: Dst);
4742 LLT SrcTy = MRI.getType(Reg: Src);
4743
4744 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4745 ? AMDGPU::G_AMDGPU_FFBH_U32
4746 : AMDGPU::G_AMDGPU_FFBL_B32;
4747 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4748 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4749
4750 MI.eraseFromParent();
4751 return true;
4752}
4753
4754bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_POISON(MachineInstr &MI,
4755 MachineRegisterInfo &MRI,
4756 MachineIRBuilder &B) const {
4757 Register Dst = MI.getOperand(i: 0).getReg();
4758 Register Src = MI.getOperand(i: 1).getReg();
4759 LLT SrcTy = MRI.getType(Reg: Src);
4760 TypeSize NumBits = SrcTy.getSizeInBits();
4761
4762 assert(NumBits < 32u);
4763
4764 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4765 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4766 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4767 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4768 B.buildTrunc(Res: Dst, Op: Ctlz);
4769 MI.eraseFromParent();
4770 return true;
4771}
4772
4773bool AMDGPULegalizerInfo::legalizeCTLS(MachineInstr &MI,
4774 MachineRegisterInfo &MRI,
4775 MachineIRBuilder &B) const {
4776 Register Dst = MI.getOperand(i: 0).getReg();
4777 Register Src = MI.getOperand(i: 1).getReg();
4778 LLT SrcTy = MRI.getType(Reg: Src);
4779 const LLT S32 = LLT::scalar(SizeInBits: 32);
4780 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4781 unsigned BitWidth = SrcTy.getSizeInBits();
4782
4783 auto Sffbh = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32}).addUse(RegNo: Src);
4784 auto Clamped = B.buildUMin(Dst: S32, Src0: Sffbh, Src1: B.buildConstant(Res: S32, Val: BitWidth));
4785 B.buildSub(Dst, Src0: Clamped, Src1: B.buildConstant(Res: S32, Val: 1));
4786 MI.eraseFromParent();
4787 return true;
4788}
4789
4790// Check that this is a G_XOR x, -1
4791static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4792 if (MI.getOpcode() != TargetOpcode::G_XOR)
4793 return false;
4794 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4795 return ConstVal == -1;
4796}
4797
4798// Return the use branch instruction, otherwise null if the usage is invalid.
4799static MachineInstr *
4800verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4801 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4802 Register CondDef = MI.getOperand(i: 0).getReg();
4803 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4804 return nullptr;
4805
4806 MachineBasicBlock *Parent = MI.getParent();
4807 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4808
4809 if (isNot(MRI, MI: *UseMI)) {
4810 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4811 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4812 return nullptr;
4813
4814 // We're deleting the def of this value, so we need to remove it.
4815 eraseInstr(MI&: *UseMI, MRI);
4816
4817 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4818 Negated = true;
4819 }
4820
4821 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4822 return nullptr;
4823
4824 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4825 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4826 if (Next == Parent->end()) {
4827 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4828 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4829 return nullptr;
4830 UncondBrTarget = &*NextMBB;
4831 } else {
4832 if (Next->getOpcode() != AMDGPU::G_BR)
4833 return nullptr;
4834 Br = &*Next;
4835 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4836 }
4837
4838 return UseMI;
4839}
4840
4841void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4842 MachineIRBuilder &B,
4843 const ArgDescriptor *Arg,
4844 const TargetRegisterClass *ArgRC,
4845 LLT ArgTy) const {
4846 MCRegister SrcReg = Arg->getRegister();
4847 assert(SrcReg.isPhysical() && "Physical register expected");
4848 assert(DstReg.isVirtual() && "Virtual register expected");
4849
4850 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4851 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4852 if (Arg->isMasked()) {
4853 // TODO: Should we try to emit this once in the entry block?
4854 const LLT S32 = LLT::scalar(SizeInBits: 32);
4855 const unsigned Mask = Arg->getMask();
4856 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4857
4858 Register AndMaskSrc = LiveIn;
4859
4860 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4861 // 0.
4862 if (Shift != 0) {
4863 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4864 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4865 }
4866
4867 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4868 } else {
4869 B.buildCopy(Res: DstReg, Op: LiveIn);
4870 }
4871}
4872
4873bool AMDGPULegalizerInfo::legalizeWorkGroupId(
4874 MachineInstr &MI, MachineIRBuilder &B,
4875 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
4876 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
4877 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4878 Register DstReg = MI.getOperand(i: 0).getReg();
4879 if (!ST.hasClusters()) {
4880 if (!loadInputValue(DstReg, B, ArgType: WorkGroupIdPV))
4881 return false;
4882 MI.eraseFromParent();
4883 return true;
4884 }
4885
4886 // Clusters are supported. Return the global position in the grid. If clusters
4887 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4888
4889 // WorkGroupIdXYZ = ClusterId == 0 ?
4890 // ClusterIdXYZ :
4891 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4892 MachineRegisterInfo &MRI = *B.getMRI();
4893 const LLT S32 = LLT::scalar(SizeInBits: 32);
4894 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
4895 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
4896 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(Ty: S32);
4897 if (!loadInputValue(DstReg: ClusterIdXYZ, B, ArgType: WorkGroupIdPV) ||
4898 !loadInputValue(DstReg: ClusterWorkGroupIdXYZ, B, ArgType: ClusterWorkGroupIdPV) ||
4899 !loadInputValue(DstReg: ClusterMaxIdXYZ, B, ArgType: ClusterMaxIdPV))
4900 return false;
4901
4902 auto One = B.buildConstant(Res: S32, Val: 1);
4903 auto ClusterSizeXYZ = B.buildAdd(Dst: S32, Src0: ClusterMaxIdXYZ, Src1: One);
4904 auto GlobalIdXYZ = B.buildAdd(Dst: S32, Src0: ClusterWorkGroupIdXYZ,
4905 Src1: B.buildMul(Dst: S32, Src0: ClusterIdXYZ, Src1: ClusterSizeXYZ));
4906
4907 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4908
4909 switch (MFI->getClusterDims().getKind()) {
4910 case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
4911 case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
4912 B.buildCopy(Res: DstReg, Op: GlobalIdXYZ);
4913 MI.eraseFromParent();
4914 return true;
4915 }
4916 case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
4917 B.buildCopy(Res: DstReg, Op: ClusterIdXYZ);
4918 MI.eraseFromParent();
4919 return true;
4920 }
4921 case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
4922 using namespace AMDGPU::Hwreg;
4923 unsigned ClusterIdField = HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4);
4924 Register ClusterId = MRI.createGenericVirtualRegister(Ty: S32);
4925 MRI.setRegClass(Reg: ClusterId, RC: &AMDGPU::SReg_32RegClass);
4926 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
4927 .addDef(RegNo: ClusterId)
4928 .addImm(Val: ClusterIdField);
4929 auto Zero = B.buildConstant(Res: S32, Val: 0);
4930 auto NoClusters =
4931 B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: ClusterId, Op1: Zero);
4932 B.buildSelect(Res: DstReg, Tst: NoClusters, Op0: ClusterIdXYZ, Op1: GlobalIdXYZ);
4933 MI.eraseFromParent();
4934 return true;
4935 }
4936 }
4937
4938 llvm_unreachable("nothing should reach here");
4939}
4940
4941bool AMDGPULegalizerInfo::loadInputValue(
4942 Register DstReg, MachineIRBuilder &B,
4943 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4944 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4945 const ArgDescriptor *Arg = nullptr;
4946 const TargetRegisterClass *ArgRC;
4947 LLT ArgTy;
4948
4949 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4950 const ArgDescriptor WorkGroupIDX =
4951 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
4952 // If GridZ is not programmed in an entry function then the hardware will set
4953 // it to all zeros, so there is no need to mask the GridY value in the low
4954 // order bits.
4955 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4956 Reg: AMDGPU::TTMP7,
4957 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4958 const ArgDescriptor WorkGroupIDZ =
4959 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
4960 const ArgDescriptor ClusterWorkGroupIDX =
4961 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
4962 const ArgDescriptor ClusterWorkGroupIDY =
4963 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
4964 const ArgDescriptor ClusterWorkGroupIDZ =
4965 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
4966 const ArgDescriptor ClusterWorkGroupMaxIDX =
4967 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
4968 const ArgDescriptor ClusterWorkGroupMaxIDY =
4969 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
4970 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4971 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
4972 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4973 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);
4974
4975 auto LoadConstant = [&](unsigned N) {
4976 B.buildConstant(Res: DstReg, Val: N);
4977 return true;
4978 };
4979
4980 if (ST.hasArchitectedSGPRs() &&
4981 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4982 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4983 bool HasFixedDims = ClusterDims.isFixedDims();
4984
4985 switch (ArgType) {
4986 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4987 Arg = &WorkGroupIDX;
4988 ArgRC = &AMDGPU::SReg_32RegClass;
4989 ArgTy = LLT::scalar(SizeInBits: 32);
4990 break;
4991 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4992 Arg = &WorkGroupIDY;
4993 ArgRC = &AMDGPU::SReg_32RegClass;
4994 ArgTy = LLT::scalar(SizeInBits: 32);
4995 break;
4996 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4997 Arg = &WorkGroupIDZ;
4998 ArgRC = &AMDGPU::SReg_32RegClass;
4999 ArgTy = LLT::scalar(SizeInBits: 32);
5000 break;
5001 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
5002 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
5003 return LoadConstant(0);
5004 Arg = &ClusterWorkGroupIDX;
5005 ArgRC = &AMDGPU::SReg_32RegClass;
5006 ArgTy = LLT::scalar(SizeInBits: 32);
5007 break;
5008 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
5009 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
5010 return LoadConstant(0);
5011 Arg = &ClusterWorkGroupIDY;
5012 ArgRC = &AMDGPU::SReg_32RegClass;
5013 ArgTy = LLT::scalar(SizeInBits: 32);
5014 break;
5015 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
5016 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
5017 return LoadConstant(0);
5018 Arg = &ClusterWorkGroupIDZ;
5019 ArgRC = &AMDGPU::SReg_32RegClass;
5020 ArgTy = LLT::scalar(SizeInBits: 32);
5021 break;
5022 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
5023 if (HasFixedDims)
5024 return LoadConstant(ClusterDims.getDims()[0] - 1);
5025 Arg = &ClusterWorkGroupMaxIDX;
5026 ArgRC = &AMDGPU::SReg_32RegClass;
5027 ArgTy = LLT::scalar(SizeInBits: 32);
5028 break;
5029 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
5030 if (HasFixedDims)
5031 return LoadConstant(ClusterDims.getDims()[1] - 1);
5032 Arg = &ClusterWorkGroupMaxIDY;
5033 ArgRC = &AMDGPU::SReg_32RegClass;
5034 ArgTy = LLT::scalar(SizeInBits: 32);
5035 break;
5036 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
5037 if (HasFixedDims)
5038 return LoadConstant(ClusterDims.getDims()[2] - 1);
5039 Arg = &ClusterWorkGroupMaxIDZ;
5040 ArgRC = &AMDGPU::SReg_32RegClass;
5041 ArgTy = LLT::scalar(SizeInBits: 32);
5042 break;
5043 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
5044 Arg = &ClusterWorkGroupMaxFlatID;
5045 ArgRC = &AMDGPU::SReg_32RegClass;
5046 ArgTy = LLT::scalar(SizeInBits: 32);
5047 break;
5048 default:
5049 break;
5050 }
5051 }
5052
5053 if (!Arg)
5054 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
5055
5056 if (!Arg) {
5057 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
5058 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5059 // which case the pointer argument may be missing and we use null.
5060 return LoadConstant(0);
5061 }
5062
5063 // It's undefined behavior if a function marked with the amdgpu-no-*
5064 // attributes uses the corresponding intrinsic.
5065 B.buildUndef(Res: DstReg);
5066 return true;
5067 }
5068
5069 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5070 return false; // TODO: Handle these
5071 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5072 return true;
5073}
5074
5075bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
5076 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
5077 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5078 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
5079 return false;
5080
5081 MI.eraseFromParent();
5082 return true;
5083}
5084
5085static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
5086 int64_t C) {
5087 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
5088 MI.eraseFromParent();
5089 return true;
5090}
5091
5092bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
5093 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
5094 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5095 unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
5096 if (MaxID == 0)
5097 return replaceWithConstant(B, MI, C: 0);
5098
5099 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5100 const ArgDescriptor *Arg;
5101 const TargetRegisterClass *ArgRC;
5102 LLT ArgTy;
5103 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
5104
5105 Register DstReg = MI.getOperand(i: 0).getReg();
5106 if (!Arg) {
5107 // It's undefined behavior if a function marked with the amdgpu-no-*
5108 // attributes uses the corresponding intrinsic.
5109 B.buildUndef(Res: DstReg);
5110 MI.eraseFromParent();
5111 return true;
5112 }
5113
5114 if (Arg->isMasked()) {
5115 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5116 // masking operations anyway.
5117 //
5118 // TODO: We could assert the top bit is 0 for the source copy.
5119 if (!loadInputValue(DstReg, B, ArgType))
5120 return false;
5121 } else {
5122 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
5123 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
5124 return false;
5125 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
5126 }
5127
5128 MI.eraseFromParent();
5129 return true;
5130}
5131
5132MachinePointerInfo
5133AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
5134 // This isn't really a constant pool but close enough.
5135 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
5136 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
5137 return PtrInfo;
5138}
5139
5140Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
5141 int64_t Offset) const {
5142 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
5143 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
5144
5145 // TODO: If we passed in the base kernel offset we could have a better
5146 // alignment than 4, but we don't really need it.
5147 if (!loadInputValue(DstReg: KernArgReg, B,
5148 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5149 llvm_unreachable("failed to find kernarg segment ptr");
5150
5151 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
5152 return B.buildObjectPtrOffset(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
5153}
5154
5155/// Legalize a value that's loaded from kernel arguments. This is only used by
5156/// legacy intrinsics.
5157bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
5158 MachineIRBuilder &B,
5159 uint64_t Offset,
5160 Align Alignment) const {
5161 Register DstReg = MI.getOperand(i: 0).getReg();
5162
5163 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5164 "unexpected kernarg parameter type");
5165
5166 Register Ptr = getKernargParameterPtr(B, Offset);
5167 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF&: B.getMF());
5168 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment: Align(4),
5169 MMOFlags: MachineMemOperand::MODereferenceable |
5170 MachineMemOperand::MOInvariant);
5171 MI.eraseFromParent();
5172 return true;
5173}
5174
5175bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
5176 MachineRegisterInfo &MRI,
5177 MachineIRBuilder &B) const {
5178 Register Dst = MI.getOperand(i: 0).getReg();
5179 LLT DstTy = MRI.getType(Reg: Dst);
5180 LLT S16 = LLT::scalar(SizeInBits: 16);
5181 LLT S32 = LLT::scalar(SizeInBits: 32);
5182 LLT S64 = LLT::scalar(SizeInBits: 64);
5183
5184 if (DstTy == S16)
5185 return legalizeFDIV16(MI, MRI, B);
5186 if (DstTy == S32)
5187 return legalizeFDIV32(MI, MRI, B);
5188 if (DstTy == S64)
5189 return legalizeFDIV64(MI, MRI, B);
5190
5191 return false;
5192}
5193
5194void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
5195 Register DstDivReg,
5196 Register DstRemReg,
5197 Register X,
5198 Register Y) const {
5199 const LLT S1 = LLT::scalar(SizeInBits: 1);
5200 const LLT S32 = LLT::scalar(SizeInBits: 32);
5201
5202 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5203 // algorithm used here.
5204
5205 // Initial estimate of inv(y).
5206 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
5207 auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
5208 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
5209 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
5210 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
5211
5212 // One round of UNR.
5213 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
5214 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
5215 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
5216
5217 // Quotient/remainder estimate.
5218 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
5219 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
5220
5221 // First quotient/remainder refinement.
5222 auto One = B.buildConstant(Res: S32, Val: 1);
5223 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
5224 if (DstDivReg)
5225 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
5226 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
5227
5228 // Second quotient/remainder refinement.
5229 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
5230 if (DstDivReg)
5231 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
5232
5233 if (DstRemReg)
5234 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
5235}
5236
5237// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5238//
5239// Return lo, hi of result
5240//
5241// %cvt.lo = G_UITOFP Val.lo
5242// %cvt.hi = G_UITOFP Val.hi
5243// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5244// %rcp = G_AMDGPU_RCP_IFLAG %mad
5245// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5246// %mul2 = G_FMUL %mul1, 2**(-32)
5247// %trunc = G_INTRINSIC_TRUNC %mul2
5248// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5249// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5250static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5251 Register Val) {
5252 const LLT S32 = LLT::scalar(SizeInBits: 32);
5253 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
5254
5255 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
5256 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
5257
5258 auto Mad = B.buildFMAD(
5259 Dst: S32, Src0: CvtHi, // 2**32
5260 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
5261
5262 auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
5263 auto Mul1 = B.buildFMul(
5264 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
5265
5266 // 2**(-32)
5267 auto Mul2 = B.buildFMul(
5268 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
5269 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
5270
5271 // -(2**32)
5272 auto Mad2 = B.buildFMAD(
5273 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
5274 Src2: Mul1);
5275
5276 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
5277 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
5278
5279 return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
5280}
5281
5282void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
5283 Register DstDivReg,
5284 Register DstRemReg,
5285 Register Numer,
5286 Register Denom) const {
5287 const LLT S32 = LLT::scalar(SizeInBits: 32);
5288 const LLT S64 = LLT::scalar(SizeInBits: 64);
5289 const LLT S1 = LLT::scalar(SizeInBits: 1);
5290 Register RcpLo, RcpHi;
5291
5292 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
5293
5294 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
5295
5296 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
5297 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
5298
5299 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
5300 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
5301
5302 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
5303 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
5304 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
5305
5306 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
5307 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
5308 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
5309
5310 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
5311 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
5312 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
5313 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
5314 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
5315
5316 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
5317 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
5318 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
5319 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
5320
5321 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
5322 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
5323 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
5324
5325 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
5326 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
5327 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
5328 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
5329 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
5330 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
5331 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
5332 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
5333 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
5334
5335 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
5336 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
5337 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
5338
5339 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
5340 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
5341
5342 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
5343 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
5344
5345 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
5346 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
5347
5348 // TODO: Here and below portions of the code can be enclosed into if/endif.
5349 // Currently control flow is unconditional and we have 4 selects after
5350 // potential endif to substitute PHIs.
5351
5352 // if C3 != 0 ...
5353 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
5354 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
5355 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
5356 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
5357
5358 auto One64 = B.buildConstant(Res: S64, Val: 1);
5359 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
5360
5361 auto C4 =
5362 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
5363 auto C5 =
5364 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
5365 auto C6 = B.buildSelect(
5366 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
5367
5368 // if (C6 != 0)
5369 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
5370 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
5371
5372 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
5373 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
5374 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
5375
5376 // endif C6
5377 // endif C3
5378
5379 if (DstDivReg) {
5380 auto Sel1 = B.buildSelect(
5381 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
5382 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
5383 Op0: Sel1, Op1: MulHi3);
5384 }
5385
5386 if (DstRemReg) {
5387 auto Sel2 = B.buildSelect(
5388 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
5389 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
5390 Op0: Sel2, Op1: Sub1);
5391 }
5392}
5393
5394bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5395 MachineRegisterInfo &MRI,
5396 MachineIRBuilder &B) const {
5397 Register DstDivReg, DstRemReg;
5398 switch (MI.getOpcode()) {
5399 default:
5400 llvm_unreachable("Unexpected opcode!");
5401 case AMDGPU::G_UDIV: {
5402 DstDivReg = MI.getOperand(i: 0).getReg();
5403 break;
5404 }
5405 case AMDGPU::G_UREM: {
5406 DstRemReg = MI.getOperand(i: 0).getReg();
5407 break;
5408 }
5409 case AMDGPU::G_UDIVREM: {
5410 DstDivReg = MI.getOperand(i: 0).getReg();
5411 DstRemReg = MI.getOperand(i: 1).getReg();
5412 break;
5413 }
5414 }
5415
5416 const LLT S64 = LLT::scalar(SizeInBits: 64);
5417 const LLT S32 = LLT::scalar(SizeInBits: 32);
5418 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5419 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
5420 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5421 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5422
5423 if (Ty == S32)
5424 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
5425 else if (Ty == S64)
5426 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
5427 else
5428 return false;
5429
5430 MI.eraseFromParent();
5431 return true;
5432}
5433
5434bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5435 MachineRegisterInfo &MRI,
5436 MachineIRBuilder &B) const {
5437 const LLT S64 = LLT::scalar(SizeInBits: 64);
5438 const LLT S32 = LLT::scalar(SizeInBits: 32);
5439
5440 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5441 if (Ty != S32 && Ty != S64)
5442 return false;
5443
5444 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5445 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
5446 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
5447
5448 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
5449 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
5450 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
5451
5452 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
5453 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
5454
5455 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
5456 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
5457
5458 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5459 switch (MI.getOpcode()) {
5460 default:
5461 llvm_unreachable("Unexpected opcode!");
5462 case AMDGPU::G_SDIV: {
5463 DstDivReg = MI.getOperand(i: 0).getReg();
5464 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5465 break;
5466 }
5467 case AMDGPU::G_SREM: {
5468 DstRemReg = MI.getOperand(i: 0).getReg();
5469 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5470 break;
5471 }
5472 case AMDGPU::G_SDIVREM: {
5473 DstDivReg = MI.getOperand(i: 0).getReg();
5474 DstRemReg = MI.getOperand(i: 1).getReg();
5475 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5476 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5477 break;
5478 }
5479 }
5480
5481 if (Ty == S32)
5482 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
5483 else
5484 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
5485
5486 if (DstDivReg) {
5487 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
5488 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
5489 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
5490 }
5491
5492 if (DstRemReg) {
5493 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
5494 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
5495 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
5496 }
5497
5498 MI.eraseFromParent();
5499 return true;
5500}
5501
5502bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5503 MachineRegisterInfo &MRI,
5504 MachineIRBuilder &B) const {
5505 Register Res = MI.getOperand(i: 0).getReg();
5506 Register LHS = MI.getOperand(i: 1).getReg();
5507 Register RHS = MI.getOperand(i: 2).getReg();
5508 uint16_t Flags = MI.getFlags();
5509 LLT ResTy = MRI.getType(Reg: Res);
5510
5511 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);
5512
5513 if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
5514 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
5515 return false;
5516
5517 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5518 // the CI documentation has a worst case error of 1 ulp.
5519 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5520 // use it as long as we aren't trying to use denormals.
5521 //
5522 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5523
5524 // 1 / x -> RCP(x)
5525 if (CLHS->isOne()) {
5526 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
5527 .addUse(RegNo: RHS)
5528 .setMIFlags(Flags);
5529
5530 MI.eraseFromParent();
5531 return true;
5532 }
5533
5534 // -1 / x -> RCP( FNEG(x) )
5535 if (CLHS->isMinusOne()) {
5536 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
5537 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
5538 .addUse(RegNo: FNeg.getReg(Idx: 0))
5539 .setMIFlags(Flags);
5540
5541 MI.eraseFromParent();
5542 return true;
5543 }
5544 }
5545
5546 // For f16 require afn or arcp.
5547 // For f32 require afn.
5548 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
5549 !MI.getFlag(Flag: MachineInstr::FmArcp)))
5550 return false;
5551
5552 // x / y -> x * (1.0 / y)
5553 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
5554 .addUse(RegNo: RHS)
5555 .setMIFlags(Flags);
5556 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
5557
5558 MI.eraseFromParent();
5559 return true;
5560}
5561
5562bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5563 MachineRegisterInfo &MRI,
5564 MachineIRBuilder &B) const {
5565 Register Res = MI.getOperand(i: 0).getReg();
5566 Register X = MI.getOperand(i: 1).getReg();
5567 Register Y = MI.getOperand(i: 2).getReg();
5568 uint16_t Flags = MI.getFlags();
5569 LLT ResTy = MRI.getType(Reg: Res);
5570
5571 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn);
5572
5573 if (!AllowInaccurateRcp)
5574 return false;
5575
5576 const ConstantFP *CLHS = getConstantFPVRegVal(VReg: X, MRI);
5577 bool IsNegRcp = CLHS && CLHS->isMinusOne();
5578
5579 // Pull out the negation so it folds for free into the source modifiers.
5580 if (IsNegRcp)
5581 X = B.buildFConstant(Res: ResTy, Val: 1.0).getReg(Idx: 0);
5582
5583 Register NegY = IsNegRcp ? Y : B.buildFNeg(Dst: ResTy, Src0: Y).getReg(Idx: 0);
5584 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
5585
5586 auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
5587 .addUse(RegNo: Y)
5588 .setMIFlags(Flags);
5589 if (IsNegRcp)
5590 R = B.buildFNeg(Dst: ResTy, Src0: R);
5591
5592 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
5593 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
5594
5595 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
5596 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
5597
5598 // Skip the last 2 correction terms for reciprocal.
5599 if (IsNegRcp || (CLHS && CLHS->isOne())) {
5600 B.buildCopy(Res, Op: R);
5601 MI.eraseFromParent();
5602 return true;
5603 }
5604
5605 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
5606 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
5607
5608 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
5609 MI.eraseFromParent();
5610 return true;
5611}
5612
5613bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5614 MachineRegisterInfo &MRI,
5615 MachineIRBuilder &B) const {
5616 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5617 return true;
5618
5619 Register Res = MI.getOperand(i: 0).getReg();
5620 Register LHS = MI.getOperand(i: 1).getReg();
5621 Register RHS = MI.getOperand(i: 2).getReg();
5622
5623 uint16_t Flags = MI.getFlags();
5624
5625 LLT S16 = LLT::scalar(SizeInBits: 16);
5626 LLT S32 = LLT::scalar(SizeInBits: 32);
5627
5628 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5629 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5630 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5631 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5632 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5633 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5634 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5635 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5636 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5637 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5638 // q16.u = opx(V_CVT_F16_F32, q32.u);
5639 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5640
5641 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
5642 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
5643 auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
5644 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5645 .addUse(RegNo: RHSExt.getReg(Idx: 0))
5646 .setMIFlags(Flags);
5647 auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
5648 MachineInstrBuilder Err;
5649 if (ST.hasMadMacF32Insts()) {
5650 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5651 Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
5652 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5653 } else {
5654 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5655 Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
5656 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
5657 }
5658 auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
5659 Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
5660 Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
5661 auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
5662 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5663 .addUse(RegNo: RDst.getReg(Idx: 0))
5664 .addUse(RegNo: RHS)
5665 .addUse(RegNo: LHS)
5666 .setMIFlags(Flags);
5667
5668 MI.eraseFromParent();
5669 return true;
5670}
5671
5672static constexpr unsigned SPDenormModeBitField =
5673 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
5674
5675// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5676// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5677static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5678 const GCNSubtarget &ST,
5679 SIModeRegisterDefaults Mode) {
5680 // Set SP denorm mode to this value.
5681 unsigned SPDenormMode =
5682 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5683
5684 if (ST.hasDenormModeInst()) {
5685 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5686 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5687
5688 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5689 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
5690 .addImm(Val: NewDenormModeValue);
5691
5692 } else {
5693 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
5694 .addImm(Val: SPDenormMode)
5695 .addImm(Val: SPDenormModeBitField);
5696 }
5697}
5698
5699bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5700 MachineRegisterInfo &MRI,
5701 MachineIRBuilder &B) const {
5702 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5703 return true;
5704
5705 Register Res = MI.getOperand(i: 0).getReg();
5706 Register LHS = MI.getOperand(i: 1).getReg();
5707 Register RHS = MI.getOperand(i: 2).getReg();
5708 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5709 SIModeRegisterDefaults Mode = MFI->getMode();
5710
5711 uint16_t Flags = MI.getFlags();
5712
5713 LLT S32 = LLT::scalar(SizeInBits: 32);
5714 LLT S1 = LLT::scalar(SizeInBits: 1);
5715
5716 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
5717
5718 auto DenominatorScaled =
5719 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5720 .addUse(RegNo: LHS)
5721 .addUse(RegNo: RHS)
5722 .addImm(Val: 0)
5723 .setMIFlags(Flags);
5724 auto NumeratorScaled =
5725 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5726 .addUse(RegNo: LHS)
5727 .addUse(RegNo: RHS)
5728 .addImm(Val: 1)
5729 .setMIFlags(Flags);
5730
5731 auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5732 .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
5733 .setMIFlags(Flags);
5734 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
5735
5736 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5737 const bool HasDynamicDenormals =
5738 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5739 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5740
5741 Register SavedSPDenormMode;
5742 if (!PreservesDenormals) {
5743 if (HasDynamicDenormals) {
5744 SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5745 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
5746 .addDef(RegNo: SavedSPDenormMode)
5747 .addImm(Val: SPDenormModeBitField);
5748 }
5749 toggleSPDenormMode(Enable: true, B, ST, Mode);
5750 }
5751
5752 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
5753 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
5754 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
5755 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
5756 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
5757 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
5758
5759 if (!PreservesDenormals) {
5760 if (HasDynamicDenormals) {
5761 assert(SavedSPDenormMode);
5762 B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
5763 .addReg(RegNo: SavedSPDenormMode)
5764 .addImm(Val: SPDenormModeBitField);
5765 } else
5766 toggleSPDenormMode(Enable: false, B, ST, Mode);
5767 }
5768
5769 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
5770 .addUse(RegNo: Fma4.getReg(Idx: 0))
5771 .addUse(RegNo: Fma1.getReg(Idx: 0))
5772 .addUse(RegNo: Fma3.getReg(Idx: 0))
5773 .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
5774 .setMIFlags(Flags);
5775
5776 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5777 .addUse(RegNo: Fmas.getReg(Idx: 0))
5778 .addUse(RegNo: RHS)
5779 .addUse(RegNo: LHS)
5780 .setMIFlags(Flags);
5781
5782 MI.eraseFromParent();
5783 return true;
5784}
5785
5786bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5787 MachineRegisterInfo &MRI,
5788 MachineIRBuilder &B) const {
5789 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5790 return true;
5791
5792 Register Res = MI.getOperand(i: 0).getReg();
5793 Register LHS = MI.getOperand(i: 1).getReg();
5794 Register RHS = MI.getOperand(i: 2).getReg();
5795
5796 uint16_t Flags = MI.getFlags();
5797
5798 LLT S64 = LLT::scalar(SizeInBits: 64);
5799 LLT S1 = LLT::scalar(SizeInBits: 1);
5800
5801 auto One = B.buildFConstant(Res: S64, Val: 1.0);
5802
5803 auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5804 .addUse(RegNo: LHS)
5805 .addUse(RegNo: RHS)
5806 .addImm(Val: 0)
5807 .setMIFlags(Flags);
5808
5809 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);
5810
5811 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
5812 .addUse(RegNo: DivScale0.getReg(Idx: 0))
5813 .setMIFlags(Flags);
5814
5815 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
5816 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
5817 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
5818
5819 auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5820 .addUse(RegNo: LHS)
5821 .addUse(RegNo: RHS)
5822 .addImm(Val: 1)
5823 .setMIFlags(Flags);
5824
5825 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5826 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
5827 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);
5828
5829 Register Scale;
5830 if (!ST.hasUsableDivScaleConditionOutput()) {
5831 // Workaround a hardware bug on SI where the condition output from div_scale
5832 // is not usable.
5833
5834 LLT S32 = LLT::scalar(SizeInBits: 32);
5835
5836 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5837 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5838 auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
5839 auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);
5840
5841 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5842 Op1: Scale1Unmerge.getReg(Idx: 1));
5843 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5844 Op1: Scale0Unmerge.getReg(Idx: 1));
5845 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
5846 } else {
5847 Scale = DivScale1.getReg(Idx: 1);
5848 }
5849
5850 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
5851 .addUse(RegNo: Fma4.getReg(Idx: 0))
5852 .addUse(RegNo: Fma3.getReg(Idx: 0))
5853 .addUse(RegNo: Mul.getReg(Idx: 0))
5854 .addUse(RegNo: Scale)
5855 .setMIFlags(Flags);
5856
5857 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
5858 .addUse(RegNo: Fmas.getReg(Idx: 0))
5859 .addUse(RegNo: RHS)
5860 .addUse(RegNo: LHS)
5861 .setMIFlags(Flags);
5862
5863 MI.eraseFromParent();
5864 return true;
5865}
5866
5867bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5868 MachineRegisterInfo &MRI,
5869 MachineIRBuilder &B) const {
5870 Register Res0 = MI.getOperand(i: 0).getReg();
5871 Register Res1 = MI.getOperand(i: 1).getReg();
5872 Register Val = MI.getOperand(i: 2).getReg();
5873 uint16_t Flags = MI.getFlags();
5874
5875 LLT Ty = MRI.getType(Reg: Res0);
5876 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5877
5878 auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
5879 .addUse(RegNo: Val)
5880 .setMIFlags(Flags);
5881 auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
5882 .addUse(RegNo: Val)
5883 .setMIFlags(Flags);
5884
5885 if (ST.hasFractBug()) {
5886 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5887 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5888 auto IsFinite =
5889 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5890 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5891 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5892 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5893 }
5894
5895 B.buildCopy(Res: Res0, Op: Mant);
5896 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5897
5898 MI.eraseFromParent();
5899 return true;
5900}
5901
5902bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5903 MachineRegisterInfo &MRI,
5904 MachineIRBuilder &B) const {
5905 Register Res = MI.getOperand(i: 0).getReg();
5906 Register LHS = MI.getOperand(i: 2).getReg();
5907 Register RHS = MI.getOperand(i: 3).getReg();
5908 uint16_t Flags = MI.getFlags();
5909
5910 LLT S32 = LLT::scalar(SizeInBits: 32);
5911 LLT S1 = LLT::scalar(SizeInBits: 1);
5912
5913 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5914 const APFloat C0Val(1.0f);
5915
5916 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5917 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5918 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5919
5920 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5921 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5922
5923 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5924
5925 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5926 .addUse(RegNo: Mul0.getReg(Idx: 0))
5927 .setMIFlags(Flags);
5928
5929 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5930
5931 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5932
5933 MI.eraseFromParent();
5934 return true;
5935}
5936
5937bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5938 MachineRegisterInfo &MRI,
5939 MachineIRBuilder &B) const {
5940 // Bypass the correct expansion a standard promotion through G_FSQRT would
5941 // get. The f32 op is accurate enough for the f16 cas.
5942 unsigned Flags = MI.getFlags();
5943 assert(!ST.has16BitInsts());
5944 const LLT F32 = LLT::scalar(SizeInBits: 32);
5945 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5946 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5947 .addUse(RegNo: Ext.getReg(Idx: 0))
5948 .setMIFlags(Flags);
5949 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Log2, Flags);
5950 MI.eraseFromParent();
5951 return true;
5952}
5953
5954bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5955 MachineRegisterInfo &MRI,
5956 MachineIRBuilder &B) const {
5957 MachineFunction &MF = B.getMF();
5958 Register Dst = MI.getOperand(i: 0).getReg();
5959 Register X = MI.getOperand(i: 1).getReg();
5960 const unsigned Flags = MI.getFlags();
5961 const LLT S1 = LLT::scalar(SizeInBits: 1);
5962 const LLT F32 = LLT::scalar(SizeInBits: 32);
5963 const LLT I32 = LLT::scalar(SizeInBits: 32);
5964
5965 if (allowApproxFunc(MF, Flags)) {
5966 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
5967 .addUse(RegNo: X)
5968 .setMIFlags(Flags);
5969 MI.eraseFromParent();
5970 return true;
5971 }
5972
5973 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5974 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5975 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5976 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5977 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5978
5979 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
5980 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5981 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
5982 .addUse(RegNo: SqrtX.getReg(Idx: 0))
5983 .setMIFlags(Flags);
5984
5985 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5986 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5987
5988 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5989 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5990
5991 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5992 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5993
5994 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5995 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5996
5997 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5998 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5999
6000 SqrtS =
6001 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
6002
6003 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
6004 SqrtS =
6005 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
6006 } else {
6007 auto SqrtR =
6008 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
6009 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
6010
6011 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
6012 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
6013 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
6014 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
6015 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
6016 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
6017 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
6018 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
6019 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
6020 }
6021
6022 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
6023
6024 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
6025
6026 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
6027
6028 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
6029 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
6030
6031 MI.eraseFromParent();
6032 return true;
6033}
6034
6035bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
6036 MachineRegisterInfo &MRI,
6037 MachineIRBuilder &B) const {
6038 // For double type, the SQRT and RSQ instructions don't have required
6039 // precision, we apply Goldschmidt's algorithm to improve the result:
6040 //
6041 // y0 = rsq(x)
6042 // g0 = x * y0
6043 // h0 = 0.5 * y0
6044 //
6045 // r0 = 0.5 - h0 * g0
6046 // g1 = g0 * r0 + g0
6047 // h1 = h0 * r0 + h0
6048 //
6049 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
6050 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
6051 // h2 = h1 * r1 + h1
6052 //
6053 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6054 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6055 //
6056 // sqrt(x) = g3
6057
6058 const LLT S1 = LLT::scalar(SizeInBits: 1);
6059 const LLT S32 = LLT::scalar(SizeInBits: 32);
6060 const LLT F64 = LLT::scalar(SizeInBits: 64);
6061
6062 Register Dst = MI.getOperand(i: 0).getReg();
6063 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6064
6065 Register X = MI.getOperand(i: 1).getReg();
6066 unsigned Flags = MI.getFlags();
6067
6068 Register SqrtX = X;
6069 Register Scaling, ZeroInt;
6070 if (!MI.getFlag(Flag: MachineInstr::FmAfn)) {
6071 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
6072
6073 ZeroInt = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6074 Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant).getReg(Idx: 0);
6075
6076 // Scale up input if it is too small.
6077 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
6078 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
6079 SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags).getReg(Idx: 0);
6080 }
6081
6082 auto SqrtY = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX);
6083
6084 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
6085 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
6086 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
6087
6088 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
6089 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
6090
6091 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
6092 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
6093
6094 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
6095 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
6096
6097 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
6098
6099 Register SqrtRet = SqrtS2.getReg(Idx: 0);
6100 if (!MI.getFlag(Flag: MachineInstr::FmAfn)) {
6101 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
6102 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
6103 auto SqrtD2 = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
6104
6105 // Scale down the result.
6106 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
6107 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
6108 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtD2, Src1: ScaleDown, Flags).getReg(Idx: 0);
6109 }
6110
6111 Register IsZeroOrInf;
6112 if (MI.getFlag(Flag: MachineInstr::FmNoInfs)) {
6113 auto ZeroFP = B.buildFConstant(Res: F64, Val: 0.0);
6114 IsZeroOrInf = B.buildFCmp(Pred: FCmpInst::FCMP_OEQ, Res: S1, Op0: SqrtX, Op1: ZeroFP).getReg(Idx: 0);
6115 } else {
6116 IsZeroOrInf = B.buildIsFPClass(Res: S1, Src: SqrtX, Mask: fcZero | fcPosInf).getReg(Idx: 0);
6117 }
6118
6119 // TODO: Check for DAZ and expand to subnormals
6120
6121 // If x is +INF, +0, or -0, use its original value
6122 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
6123
6124 MI.eraseFromParent();
6125 return true;
6126}
6127
6128bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
6129 MachineRegisterInfo &MRI,
6130 MachineIRBuilder &B) const {
6131 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
6132 if (Ty == LLT::scalar(SizeInBits: 32))
6133 return legalizeFSQRTF32(MI, MRI, B);
6134 if (Ty == LLT::scalar(SizeInBits: 64))
6135 return legalizeFSQRTF64(MI, MRI, B);
6136 if (Ty == LLT::scalar(SizeInBits: 16))
6137 return legalizeFSQRTF16(MI, MRI, B);
6138 return false;
6139}
6140
6141// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6142// FIXME: Why do we handle this one but not other removed instructions?
6143//
6144// Reciprocal square root. The clamp prevents infinite results, clamping
6145// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6146// +-max_float.
6147bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
6148 MachineRegisterInfo &MRI,
6149 MachineIRBuilder &B) const {
6150 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6151 return true;
6152
6153 Register Dst = MI.getOperand(i: 0).getReg();
6154 Register Src = MI.getOperand(i: 2).getReg();
6155 auto Flags = MI.getFlags();
6156
6157 LLT Ty = MRI.getType(Reg: Dst);
6158
6159 const fltSemantics *FltSemantics;
6160 if (Ty == LLT::scalar(SizeInBits: 32))
6161 FltSemantics = &APFloat::IEEEsingle();
6162 else if (Ty == LLT::scalar(SizeInBits: 64))
6163 FltSemantics = &APFloat::IEEEdouble();
6164 else
6165 return false;
6166
6167 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
6168 .addUse(RegNo: Src)
6169 .setMIFlags(Flags);
6170
6171 // We don't need to concern ourselves with the snan handling difference, since
6172 // the rsq quieted (or not) so use the one which will directly select.
6173 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6174 const bool UseIEEE = MFI->getMode().IEEE;
6175
6176 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
6177 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
6178 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
6179
6180 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
6181
6182 if (UseIEEE)
6183 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
6184 else
6185 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
6186 MI.eraseFromParent();
6187 return true;
6188}
6189
6190// TODO: Fix pointer type handling
6191bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
6192 MachineInstr &MI,
6193 Intrinsic::ID IID) const {
6194
6195 MachineIRBuilder &B = Helper.MIRBuilder;
6196 MachineRegisterInfo &MRI = *B.getMRI();
6197
6198 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6199 IID == Intrinsic::amdgcn_permlanex16;
6200 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6201 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6202 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6203 IID == Intrinsic::amdgcn_permlane_up ||
6204 IID == Intrinsic::amdgcn_permlane_down ||
6205 IID == Intrinsic::amdgcn_permlane_xor;
6206
6207 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6208 Register Src2, LLT VT) -> Register {
6209 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
6210 switch (IID) {
6211 case Intrinsic::amdgcn_readfirstlane:
6212 case Intrinsic::amdgcn_permlane64:
6213 return LaneOp.getReg(Idx: 0);
6214 case Intrinsic::amdgcn_readlane:
6215 case Intrinsic::amdgcn_set_inactive:
6216 case Intrinsic::amdgcn_set_inactive_chain_arg:
6217 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
6218 case Intrinsic::amdgcn_writelane:
6219 case Intrinsic::amdgcn_permlane_bcast:
6220 case Intrinsic::amdgcn_permlane_up:
6221 case Intrinsic::amdgcn_permlane_down:
6222 case Intrinsic::amdgcn_permlane_xor:
6223 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
6224 case Intrinsic::amdgcn_permlane16:
6225 case Intrinsic::amdgcn_permlanex16: {
6226 Register Src3 = MI.getOperand(i: 5).getReg();
6227 int64_t Src4 = MI.getOperand(i: 6).getImm();
6228 int64_t Src5 = MI.getOperand(i: 7).getImm();
6229 return LaneOp.addUse(RegNo: Src1)
6230 .addUse(RegNo: Src2)
6231 .addUse(RegNo: Src3)
6232 .addImm(Val: Src4)
6233 .addImm(Val: Src5)
6234 .getReg(Idx: 0);
6235 }
6236 case Intrinsic::amdgcn_mov_dpp8:
6237 return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
6238 case Intrinsic::amdgcn_update_dpp:
6239 return LaneOp.addUse(RegNo: Src1)
6240 .addImm(Val: MI.getOperand(i: 4).getImm())
6241 .addImm(Val: MI.getOperand(i: 5).getImm())
6242 .addImm(Val: MI.getOperand(i: 6).getImm())
6243 .addImm(Val: MI.getOperand(i: 7).getImm())
6244 .getReg(Idx: 0);
6245 default:
6246 llvm_unreachable("unhandled lane op");
6247 }
6248 };
6249
6250 Register DstReg = MI.getOperand(i: 0).getReg();
6251 Register Src0 = MI.getOperand(i: 2).getReg();
6252 Register Src1, Src2;
6253 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6254 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6255 IsPermlaneShuffle) {
6256 Src1 = MI.getOperand(i: 3).getReg();
6257 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6258 IsPermlaneShuffle) {
6259 Src2 = MI.getOperand(i: 4).getReg();
6260 }
6261 }
6262
6263 LLT Ty = MRI.getType(Reg: DstReg);
6264 unsigned Size = Ty.getSizeInBits();
6265
6266 unsigned SplitSize = 32;
6267 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6268 ST.hasDPALU_DPP() &&
6269 AMDGPU::isLegalDPALU_DPPControl(ST, DC: MI.getOperand(i: 4).getImm()))
6270 SplitSize = 64;
6271
6272 if (Size == SplitSize) {
6273 // Already legal
6274 return true;
6275 }
6276
6277 if (Size < 32) {
6278 Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);
6279
6280 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6281 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
6282
6283 if (IID == Intrinsic::amdgcn_writelane)
6284 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
6285
6286 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6287 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
6288 MI.eraseFromParent();
6289 return true;
6290 }
6291
6292 if (Size % SplitSize != 0)
6293 return false;
6294
6295 LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
6296 bool NeedsBitcast = false;
6297 if (Ty.isVector()) {
6298 LLT EltTy = Ty.getElementType();
6299 unsigned EltSize = EltTy.getSizeInBits();
6300 if (EltSize == SplitSize) {
6301 PartialResTy = EltTy;
6302 } else if (EltSize == 16 || EltSize == 32) {
6303 unsigned NElem = SplitSize / EltSize;
6304 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
6305 } else {
6306 // Handle all other cases via S32/S64 pieces
6307 NeedsBitcast = true;
6308 }
6309 }
6310
6311 SmallVector<Register, 4> PartialRes;
6312 unsigned NumParts = Size / SplitSize;
6313 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
6314 MachineInstrBuilder Src1Parts, Src2Parts;
6315
6316 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6317 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
6318
6319 if (IID == Intrinsic::amdgcn_writelane)
6320 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
6321
6322 for (unsigned i = 0; i < NumParts; ++i) {
6323 Src0 = Src0Parts.getReg(Idx: i);
6324
6325 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6326 Src1 = Src1Parts.getReg(Idx: i);
6327
6328 if (IID == Intrinsic::amdgcn_writelane)
6329 Src2 = Src2Parts.getReg(Idx: i);
6330
6331 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
6332 }
6333
6334 if (NeedsBitcast)
6335 B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
6336 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
6337 else
6338 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
6339
6340 MI.eraseFromParent();
6341 return true;
6342}
6343
6344bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
6345 MachineRegisterInfo &MRI,
6346 MachineIRBuilder &B) const {
6347 uint64_t Offset =
6348 ST.getTargetLowering()->getImplicitParameterOffset(
6349 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
6350 LLT DstTy = MRI.getType(Reg: DstReg);
6351 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
6352
6353 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
6354 if (!loadInputValue(DstReg: KernargPtrReg, B,
6355 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6356 return false;
6357
6358 B.buildObjectPtrOffset(Res: DstReg, Op0: KernargPtrReg,
6359 Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
6360 return true;
6361}
6362
/// Lower a pointer-to-buffer-resource intrinsic into explicit bit
/// manipulation that assembles the resource value in operand 0.
///
/// Operands 2..5 of \p MI are read as the pointer, stride, num_records and
/// flags registers. Two layouts are produced:
///  - If ST.has45BitNumRecordsBufferResource(), the result is merged from two
///    64-bit halves built with shifts and ORs (see the comments below).
///  - Otherwise the pointer is split into two 32-bit halves, the upper 16 bits
///    of the high half are masked off and replaced with the stride shifted
///    left by 16, and the result is merged from
///    {low half, new high half, truncated num_records, flags}.
///
/// \returns true always; \p MI is erased.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(i: 0).getReg();
  Register Pointer = MI.getOperand(i: 2).getReg();
  Register Stride = MI.getOperand(i: 3).getReg();
  Register NumRecords = MI.getOperand(i: 4).getReg();
  Register Flags = MI.getOperand(i: 5).getReg();

  LLT S32 = LLT::scalar(SizeInBits: 32);
  LLT S64 = LLT::scalar(SizeInBits: 64);

  // Advance the builder's insertion point by one position before emitting
  // anything. NOTE(review): presumably this places the new code after MI;
  // confirm against the caller's insert point.
  B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());

  // Both layouts work on a 32-bit copy of the stride.
  auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    Register Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    LLT PtrIntTy = LLT::scalar(SizeInBits: MRI.getType(Reg: Pointer).getSizeInBits());
    auto PointerInt = B.buildPtrToInt(Dst: PtrIntTy, Src: Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(Res: S64, Op: PointerInt);
    auto NumRecordsLHS = B.buildShl(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 57));
    Register LowHalf = B.buildOr(Dst: S64, Src0: ExtPointer, Src1: NumRecordsLHS).getReg(Idx: 0);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    auto NumRecordsRHS = B.buildLShr(Dst: S64, Src0: NumRecords, Src1: B.buildConstant(Res: S32, Val: 7));
    // Stride and flags are shifted within a 32-bit value and then placed in
    // the upper dword of a 64-bit value by merging with a zero low dword.
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: B.buildConstant(Res: S32, Val: 12));
    auto ExtShiftedStride =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedStride.getReg(Idx: 0)});
    auto ShiftedFlags = B.buildShl(Dst: S32, Src0: Flags, Src1: B.buildConstant(Res: S32, Val: 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(Res: S64, Ops: {Zero, ShiftedFlags.getReg(Idx: 0)});
    auto CombinedFields = B.buildOr(Dst: S64, Src0: NumRecordsRHS, Src1: ExtShiftedStride);
    Register HighHalf =
        B.buildOr(Dst: S64, Src0: CombinedFields, Src1: ExtShiftedFlags).getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, HighHalf});
  } else {
    // This layout only has a 32-bit slot for num_records.
    NumRecords = B.buildTrunc(Res: S32, Op: NumRecords).getReg(Idx: 0);
    auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
    auto LowHalf = Unmerge.getReg(Idx: 0);
    auto HighHalf = Unmerge.getReg(Idx: 1);

    // Keep the low 16 bits of the pointer's high dword and put the stride in
    // the upper 16 bits.
    auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
    auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
    auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
    auto ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
    auto NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
    B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  return true;
}
6423
6424bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6425 MachineRegisterInfo &MRI,
6426 MachineIRBuilder &B) const {
6427 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6428 if (!MFI->isEntryFunction()) {
6429 return legalizePreloadedArgIntrin(MI, MRI, B,
6430 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6431 }
6432
6433 Register DstReg = MI.getOperand(i: 0).getReg();
6434 if (!getImplicitArgPtr(DstReg, MRI, B))
6435 return false;
6436
6437 MI.eraseFromParent();
6438 return true;
6439}
6440
6441bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6442 MachineRegisterInfo &MRI,
6443 MachineIRBuilder &B) const {
6444 Function &F = B.getMF().getFunction();
6445 std::optional<uint32_t> KnownSize =
6446 AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(F);
6447 if (KnownSize.has_value())
6448 B.buildConstant(Res: DstReg, Val: *KnownSize);
6449 return false;
6450}
6451
6452bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6453 MachineRegisterInfo &MRI,
6454 MachineIRBuilder &B) const {
6455
6456 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6457 if (!MFI->isEntryFunction()) {
6458 return legalizePreloadedArgIntrin(MI, MRI, B,
6459 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6460 }
6461
6462 Register DstReg = MI.getOperand(i: 0).getReg();
6463 if (!getLDSKernelId(DstReg, MRI, B))
6464 return false;
6465
6466 MI.eraseFromParent();
6467 return true;
6468}
6469
6470bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6471 MachineRegisterInfo &MRI,
6472 MachineIRBuilder &B,
6473 unsigned AddrSpace) const {
6474 const LLT S32 = LLT::scalar(SizeInBits: 32);
6475 auto Unmerge = B.buildUnmerge(Res: S32, Op: MI.getOperand(i: 2).getReg());
6476 Register Hi32 = Unmerge.getReg(Idx: 1);
6477
6478 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6479 ST.hasGloballyAddressableScratch()) {
6480 Register FlatScratchBaseHi =
6481 B.buildInstr(Opc: AMDGPU::S_MOV_B32, DstOps: {S32},
6482 SrcOps: {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6483 .getReg(Idx: 0);
6484 MRI.setRegClass(Reg: FlatScratchBaseHi, RC: &AMDGPU::SReg_32RegClass);
6485 // Test bits 63..58 against the aperture address.
6486 Register XOR = B.buildXor(Dst: S32, Src0: Hi32, Src1: FlatScratchBaseHi).getReg(Idx: 0);
6487 B.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: MI.getOperand(i: 0), Op0: XOR,
6488 Op1: B.buildConstant(Res: S32, Val: 1u << 26));
6489 } else {
6490 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
6491 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
6492 }
6493 MI.eraseFromParent();
6494 return true;
6495}
6496
6497// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6498// offset (the offset that is included in bounds checking and swizzling, to be
6499// split between the instruction's voffset and immoffset fields) and soffset
6500// (the offset that is excluded from bounds checking and swizzling, to go in
6501// the instruction's soffset field). This function takes the first kind of
6502// offset and figures out how to split it between voffset and immoffset.
6503std::pair<Register, unsigned>
6504AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6505 Register OrigOffset) const {
6506 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6507 Register BaseReg;
6508 unsigned ImmOffset;
6509 const LLT S32 = LLT::scalar(SizeInBits: 32);
6510 MachineRegisterInfo &MRI = *B.getMRI();
6511
6512 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6513 // being added, so we can only safely match a 32-bit addition with no unsigned
6514 // overflow.
6515 bool CheckNUW = ST.hasGFX1250Insts();
6516 std::tie(args&: BaseReg, args&: ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6517 MRI, Reg: OrigOffset, /*KnownBits=*/ValueTracking: nullptr, CheckNUW);
6518
6519 // If BaseReg is a pointer, convert it to int.
6520 if (MRI.getType(Reg: BaseReg).isPointer())
6521 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
6522
6523 // If the immediate value is too big for the immoffset field, put only bits
6524 // that would normally fit in the immoffset field. The remaining value that
6525 // is copied/added for the voffset field is a large power of 2, and it
6526 // stands more chance of being CSEd with the copy/add for another similar
6527 // load/store.
6528 // However, do not do that rounding down if that is a negative
6529 // number, as it appears to be illegal to have a negative offset in the
6530 // vgpr, even if adding the immediate offset makes it positive.
6531 unsigned Overflow = ImmOffset & ~MaxImm;
6532 ImmOffset -= Overflow;
6533 if ((int32_t)Overflow < 0) {
6534 Overflow += ImmOffset;
6535 ImmOffset = 0;
6536 }
6537
6538 if (Overflow != 0) {
6539 if (!BaseReg) {
6540 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
6541 } else {
6542 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
6543 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
6544 }
6545 }
6546
6547 if (!BaseReg)
6548 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6549
6550 return std::pair(BaseReg, ImmOffset);
6551}
6552
6553/// Handle register layout difference for f16 images for some subtargets.
6554Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6555 MachineRegisterInfo &MRI,
6556 Register Reg,
6557 bool ImageStore) const {
6558 const LLT S16 = LLT::scalar(SizeInBits: 16);
6559 const LLT S32 = LLT::scalar(SizeInBits: 32);
6560 LLT StoreVT = MRI.getType(Reg);
6561 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6562
6563 if (ST.hasUnpackedD16VMem()) {
6564 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
6565
6566 SmallVector<Register, 4> WideRegs;
6567 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6568 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6569
6570 int NumElts = StoreVT.getNumElements();
6571
6572 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
6573 .getReg(Idx: 0);
6574 }
6575
6576 if (ImageStore && ST.hasImageStoreD16Bug()) {
6577 if (StoreVT.getNumElements() == 2) {
6578 SmallVector<Register, 4> PackedRegs;
6579 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
6580 PackedRegs.push_back(Elt: Reg);
6581 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
6582 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
6583 .getReg(Idx: 0);
6584 }
6585
6586 if (StoreVT.getNumElements() == 3) {
6587 SmallVector<Register, 4> PackedRegs;
6588 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
6589 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6590 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
6591 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
6592 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
6593 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
6594 }
6595
6596 if (StoreVT.getNumElements() == 4) {
6597 SmallVector<Register, 4> PackedRegs;
6598 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
6599 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
6600 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6601 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
6602 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
6603 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
6604 .getReg(Idx: 0);
6605 }
6606
6607 llvm_unreachable("invalid data type");
6608 }
6609
6610 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
6611 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
6612 .getReg(Idx: 0);
6613 }
6614 return Reg;
6615}
6616
6617Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6618 Register VData, LLT MemTy,
6619 bool IsFormat) const {
6620 MachineRegisterInfo *MRI = B.getMRI();
6621 LLT Ty = MRI->getType(Reg: VData);
6622
6623 const LLT S16 = LLT::scalar(SizeInBits: 16);
6624
6625 // Fixup buffer resources themselves needing to be v4i128.
6626 if (hasBufferRsrcWorkaround(Ty))
6627 return castBufferRsrcToV4I32(Pointer: VData, B);
6628
6629 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6630 Ty = getBitcastRegisterType(Ty);
6631 VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
6632 }
6633 // Fixup illegal register types for i8 stores.
6634 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
6635 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
6636 return AnyExt;
6637 }
6638
6639 if (Ty.isVector()) {
6640 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6641 if (IsFormat)
6642 return handleD16VData(B, MRI&: *MRI, Reg: VData);
6643 }
6644 }
6645
6646 return VData;
6647}
6648
6649bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6650 LegalizerHelper &Helper,
6651 bool IsTyped,
6652 bool IsFormat) const {
6653 MachineIRBuilder &B = Helper.MIRBuilder;
6654 MachineRegisterInfo &MRI = *B.getMRI();
6655
6656 Register VData = MI.getOperand(i: 1).getReg();
6657 LLT Ty = MRI.getType(Reg: VData);
6658 LLT EltTy = Ty.getScalarType();
6659 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6660 const LLT S32 = LLT::scalar(SizeInBits: 32);
6661
6662 MachineMemOperand *MMO = *MI.memoperands_begin();
6663 const int MemSize = MMO->getSize().getValue();
6664 LLT MemTy = MMO->getMemoryType();
6665
6666 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6667
6668 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
6669 Register RSrc = MI.getOperand(i: 2).getReg();
6670
6671 unsigned ImmOffset;
6672
6673 // The typed intrinsics add an immediate after the registers.
6674 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6675
6676 // The struct intrinsic variants add one additional operand over raw.
6677 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6678 Register VIndex;
6679 int OpOffset = 0;
6680 if (HasVIndex) {
6681 VIndex = MI.getOperand(i: 3).getReg();
6682 OpOffset = 1;
6683 } else {
6684 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6685 }
6686
6687 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
6688 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6689
6690 unsigned Format = 0;
6691 if (IsTyped) {
6692 Format = MI.getOperand(i: 5 + OpOffset).getImm();
6693 ++OpOffset;
6694 }
6695
6696 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
6697
6698 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6699
6700 unsigned Opc;
6701 if (IsTyped) {
6702 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6703 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6704 } else if (IsFormat) {
6705 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6706 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6707 } else {
6708 switch (MemSize) {
6709 case 1:
6710 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6711 break;
6712 case 2:
6713 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6714 break;
6715 default:
6716 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6717 break;
6718 }
6719 }
6720
6721 auto MIB = B.buildInstr(Opcode: Opc)
6722 .addUse(RegNo: VData) // vdata
6723 .addUse(RegNo: RSrc) // rsrc
6724 .addUse(RegNo: VIndex) // vindex
6725 .addUse(RegNo: VOffset) // voffset
6726 .addUse(RegNo: SOffset) // soffset
6727 .addImm(Val: ImmOffset); // offset(imm)
6728
6729 if (IsTyped)
6730 MIB.addImm(Val: Format);
6731
6732 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6733 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6734 .addMemOperand(MMO);
6735
6736 MI.eraseFromParent();
6737 return true;
6738}
6739
6740static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6741 Register VIndex, Register VOffset, Register SOffset,
6742 unsigned ImmOffset, unsigned Format,
6743 unsigned AuxiliaryData, MachineMemOperand *MMO,
6744 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6745 auto MIB = B.buildInstr(Opcode: Opc)
6746 .addDef(RegNo: LoadDstReg) // vdata
6747 .addUse(RegNo: RSrc) // rsrc
6748 .addUse(RegNo: VIndex) // vindex
6749 .addUse(RegNo: VOffset) // voffset
6750 .addUse(RegNo: SOffset) // soffset
6751 .addImm(Val: ImmOffset); // offset(imm)
6752
6753 if (IsTyped)
6754 MIB.addImm(Val: Format);
6755
6756 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6757 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6758 .addMemOperand(MMO);
6759}
6760
/// Replace a buffer load intrinsic \p MI with the matching
/// G_AMDGPU_*BUFFER_LOAD* pseudo plus any instructions needed to convert the
/// loaded value back to the original result type.
///
/// \p IsFormat selects the format-load opcodes and \p IsTyped the tbuffer
/// opcodes (which carry an extra format immediate). An instruction with two
/// explicit defs is treated as a TFE load whose second def receives the status
/// dword.
///
/// \returns false for the TFE combinations that are not supported (typed
/// loads and D16 format loads); otherwise true, with \p MI erased.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  Register Dst = MI.getOperand(i: 0).getReg();

  // OpOffset accumulates how far the remaining operand indices are shifted by
  // the optional operands seen so far (status def, vindex, format).
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(i: 1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
  Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; use a zero constant instead.
    VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  }

  Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(i: 5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Reg: Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Observer.changingInstr(MI);
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
    Observer.changedInstr(MI);
    // The def of MI was replaced; re-read it and reset the insert point to MI.
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Observer.changingInstr(MI);
    Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
    Observer.changedInstr(MI);
    // As above: pick up the new def and reset the insert point to MI.
    Dst = MI.getOperand(i: 0).getReg();
    B.setInsertPt(MBB&: B.getMBB(), II: MI);
  }

  // D16 and unpacked handling is decided on the (possibly bitcast) type.
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
      AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    // Plain loads pick the opcode from the memory type's size in bits.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // A TFE load is built as a vector of s32 with one element more than the
    // value needs; the extra, last element is unmerged into StatusDst.
    unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      // Sub-dword value: split off the s32 value and truncate it to Dst.
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
      B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
      B.buildTrunc(Res: Dst, Op: ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
    } else {
      // Multi-dword value: unmerge everything, then merge the value dwords
      // (without the status) back into Dst.
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
      LoadElts.push_back(Elt: StatusDst);
      B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
      LoadElts.truncate(N: NumValueDWords);
      B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Narrow or scalar D16 result: load into an s32 and truncate to Dst.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    // Advance the insertion point by one position before emitting the trunc.
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    B.buildTrunc(Res: Dst, Op: LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked D16 vector: load with 32-bit elements, then truncate each
    // element and merge the pieces into Dst.
    LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
    B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
  } else {
    // The result type can be loaded directly into Dst.
    buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
6921
/// Map a buffer atomic intrinsic ID to its G_AMDGPU_BUFFER_ATOMIC_* pseudo
/// opcode.
///
/// Each operation has four intrinsic spellings (raw, raw_ptr, struct and
/// struct_ptr) that all map to the same pseudo. Any other intrinsic ID hits
/// llvm_unreachable.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  // Exchange.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  // Integer add / sub.
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  // Signed / unsigned min and max.
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  // Bitwise and / or / xor.
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  // Increment / decrement.
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  // Compare-and-swap.
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  // Floating-point add / min / max.
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  // Clamped / conditional subtract on u32.
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
7018
7019bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
7020 MachineIRBuilder &B,
7021 Intrinsic::ID IID) const {
7022 const bool IsCmpSwap =
7023 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
7024 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
7025 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
7026 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7027
7028 Register Dst = MI.getOperand(i: 0).getReg();
7029 // Since we don't have 128-bit atomics, we don't need to handle the case of
7030 // p8 argmunents to the atomic itself
7031 Register VData = MI.getOperand(i: 2).getReg();
7032
7033 Register CmpVal;
7034 int OpOffset = 0;
7035
7036 if (IsCmpSwap) {
7037 CmpVal = MI.getOperand(i: 3).getReg();
7038 ++OpOffset;
7039 }
7040
7041 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
7042 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
7043 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7044
7045 // The struct intrinsic variants add one additional operand over raw.
7046 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
7047 Register VIndex;
7048 if (HasVIndex) {
7049 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
7050 ++OpOffset;
7051 } else {
7052 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
7053 }
7054
7055 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
7056 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
7057 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
7058
7059 MachineMemOperand *MMO = *MI.memoperands_begin();
7060
7061 unsigned ImmOffset;
7062 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
7063
7064 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
7065 .addDef(RegNo: Dst)
7066 .addUse(RegNo: VData); // vdata
7067
7068 if (IsCmpSwap)
7069 MIB.addReg(RegNo: CmpVal);
7070
7071 MIB.addUse(RegNo: RSrc) // rsrc
7072 .addUse(RegNo: VIndex) // vindex
7073 .addUse(RegNo: VOffset) // voffset
7074 .addUse(RegNo: SOffset) // soffset
7075 .addImm(Val: ImmOffset) // offset(imm)
7076 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7077 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
7078 .addMemOperand(MMO);
7079
7080 MI.eraseFromParent();
7081 return true;
7082}
7083
7084/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
7085/// vector with s16 typed elements.
7086static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
7087 SmallVectorImpl<Register> &PackedAddrs,
7088 unsigned ArgOffset,
7089 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7090 bool IsA16, bool IsG16) {
7091 const LLT S16 = LLT::scalar(SizeInBits: 16);
7092 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
7093 auto EndIdx = Intr->VAddrEnd;
7094
7095 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7096 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
7097 if (!SrcOp.isReg())
7098 continue; // _L to _LZ may have eliminated this.
7099
7100 Register AddrReg = SrcOp.getReg();
7101
7102 if ((I < Intr->GradientStart) ||
7103 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7104 (I >= Intr->CoordStart && !IsA16)) {
7105 if ((I < Intr->GradientStart) && IsA16 &&
7106 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
7107 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7108 // Special handling of bias when A16 is on. Bias is of type half but
7109 // occupies full 32-bit.
7110 PackedAddrs.push_back(
7111 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
7112 .getReg(Idx: 0));
7113 } else {
7114 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7115 "Bias needs to be converted to 16 bit in A16 mode");
7116 // Handle any gradient or coordinate operands that should not be packed
7117 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
7118 PackedAddrs.push_back(Elt: AddrReg);
7119 }
7120 } else {
7121 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7122 // derivatives dx/dh and dx/dv are packed with undef.
7123 if (((I + 1) >= EndIdx) ||
7124 ((Intr->NumGradients / 2) % 2 == 1 &&
7125 (I == static_cast<unsigned>(Intr->GradientStart +
7126 (Intr->NumGradients / 2) - 1) ||
7127 I == static_cast<unsigned>(Intr->GradientStart +
7128 Intr->NumGradients - 1))) ||
7129 // Check for _L to _LZ optimization
7130 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
7131 PackedAddrs.push_back(
7132 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
7133 .getReg(Idx: 0));
7134 } else {
7135 PackedAddrs.push_back(
7136 Elt: B.buildBuildVector(
7137 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
7138 .getReg(Idx: 0));
7139 ++I;
7140 }
7141 }
7142 }
7143}
7144
7145/// Convert from separate vaddr components to a single vector address register,
7146/// and replace the remaining operands with $noreg.
7147static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
7148 int DimIdx, int NumVAddrs) {
7149 const LLT S32 = LLT::scalar(SizeInBits: 32);
7150 (void)S32;
7151 SmallVector<Register, 8> AddrRegs;
7152 for (int I = 0; I != NumVAddrs; ++I) {
7153 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
7154 if (SrcOp.isReg()) {
7155 AddrRegs.push_back(Elt: SrcOp.getReg());
7156 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7157 }
7158 }
7159
7160 int NumAddrRegs = AddrRegs.size();
7161 if (NumAddrRegs != 1) {
7162 auto VAddr =
7163 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
7164 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
7165 }
7166
7167 for (int I = 1; I != NumVAddrs; ++I) {
7168 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
7169 if (SrcOp.isReg())
7170 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
7171 }
7172}
7173
7174/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7175///
7176/// Depending on the subtarget, load/store with 16-bit element data need to be
7177/// rewritten to use the low half of 32-bit registers, or directly use a packed
7178/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7179/// registers.
7180///
7181/// We don't want to directly select image instructions just yet, but also want
7182/// to exposes all register repacking to the legalizer/combiners. We also don't
7183/// want a selected instruction entering RegBankSelect. In order to avoid
7184/// defining a multitude of intermediate image instructions, directly hack on
7185/// the intrinsic's arguments. In cases like a16 addresses, this requires
7186/// padding now unnecessary arguments with $noreg.
///
/// The function works in three stages: (1) pick the replacement
/// G_AMDGPU_INTRIN_IMAGE_* opcode and install it on \p MI, (2) repack the
/// address operands (16-bit packing and/or NSA vs. contiguous layout) and
/// append an A16/G16 flags immediate, (3) for loads, retype the result
/// register and emit the instructions that convert it back to the original
/// destination type.
///
/// \param Observer notified via changingInstr/changedInstr around the in-place
///        rewrite of \p MI.
/// \param Intr table entry supplying the operand indices of \p MI.
/// \returns true if \p MI was rewritten (or erased as a no-op load); false if
///          the operand/type combination is rejected. Note that some of the
///          false returns happen after \p MI has already been modified.
7187bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
7188 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
7189 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7190
7191 const MachineFunction &MF = *MI.getMF();
7192 const unsigned NumDefs = MI.getNumExplicitDefs();
     // Intr-> operand indices are relative to this offset, which skips the
     // defs plus one more operand (presumably the intrinsic ID).
7193 const unsigned ArgOffset = NumDefs + 1;
     // Two explicit defs means a data result plus a separate TFE status result.
7194 bool IsTFE = NumDefs == 2;
7195 // We are only processing the operands of d16 image operations on subtargets
7196 // that use the unpacked register layout, or need to repack the TFE result.
7197
7198 // TODO: Do we need to guard against already legalized intrinsics?
7199 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7200 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
7201
7202 MachineRegisterInfo *MRI = B.getMRI();
7203 const LLT S32 = LLT::scalar(SizeInBits: 32);
7204 const LLT S16 = LLT::scalar(SizeInBits: 16);
7205 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
7206
7207 unsigned DMask = 0;
7208 Register VData;
7209 LLT Ty;
7210
     // VData/Ty describe the data register: operand 0 when there is a def,
     // operand 1 otherwise. They stay default-constructed for no-return
     // non-store opcodes.
7211 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7212 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
7213 Ty = MRI->getType(Reg: VData);
7214 }
7215
7216 const bool IsAtomicPacked16Bit =
7217 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7218 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7219
7220 // Check for 16 bit addresses and pack if true.
7221 LLT GradTy =
7222 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
7223 LLT AddrTy =
7224 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
     // On subtargets with G16 the gradient type only counts when the opcode
     // actually has gradients; otherwise the type at GradientStart decides.
7225 const bool IsG16 =
7226 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7227 const bool IsA16 = AddrTy == S16;
     // Packed 16-bit atomics are deliberately not treated as D16.
7228 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7229
     // Number of data components selected by the dmask; stays 0 for atomics.
7230 int DMaskLanes = 0;
7231 if (!BaseOpcode->Atomic) {
7232 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
7233 if (BaseOpcode->Gather4) {
7234 DMaskLanes = 4;
7235 } else if (DMask != 0) {
7236 DMaskLanes = llvm::popcount(Value: DMask);
7237 } else if (!IsTFE && !BaseOpcode->Store) {
7238 // If dmask is 0, this is a no-op load. This can be eliminated.
7239 B.buildUndef(Res: MI.getOperand(i: 0));
7240 MI.eraseFromParent();
7241 return true;
7242 }
7243 }
7244
     // From here on MI is modified in place. The scope_exit object reports
     // changedInstr on every exit path below, including the `return false`
     // ones.
7245 Observer.changingInstr(MI);
7246 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7247
7248 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7249 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7250 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7251 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
     // Default to the load form; stores and no-return opcodes override it.
7252 unsigned NewOpcode = LoadOpcode;
7253 if (BaseOpcode->Store)
7254 NewOpcode = StoreOpcode;
7255 else if (BaseOpcode->NoReturn)
7256 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7257
7258 // Track that we legalized this
7259 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
7260
7261 // Expecting to get an error flag since TFC is on - and dmask is 0 Force
7262 // dmask to be at least 1 otherwise the instruction will fail
7263 if (IsTFE && DMask == 0) {
7264 DMask = 0x1;
7265 DMaskLanes = 1;
7266 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
7267 }
7268
7269 if (BaseOpcode->Atomic) {
7270 Register VData0 = MI.getOperand(i: 2).getReg();
     // NOTE: this Ty shadows the function-level Ty inside this block only.
7271 LLT Ty = MRI->getType(Reg: VData0);
7272
7273 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7274 if (Ty.isVector() && !IsAtomicPacked16Bit)
7275 return false;
7276
7277 if (BaseOpcode->AtomicX2) {
7278 Register VData1 = MI.getOperand(i: 3).getReg();
7279 // The two values are packed in one register.
7280 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
7281 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
7282 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
7283 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
7284 }
7285 }
7286
     // Never modified after this point; always equals Intr->NumVAddrs here.
7287 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7288
7289 // Rewrite the addressing register layout before doing anything else.
7290 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7291 // 16 bit gradients are supported, but are tied to the A16 control
7292 // so both gradients and addresses must be 16 bit
7293 return false;
7294 }
7295
7296 if (IsA16 && !ST.hasA16()) {
7297 // A16 not supported
7298 return false;
7299 }
7300
7301 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
     // Declared unsigned but only ever used as a boolean below.
7302 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7303
7304 if (IsA16 || IsG16) {
7305 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7306 // instructions expect VGPR_32
7307 SmallVector<Register, 4> PackedRegs;
7308
7309 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7310
7311 // See also below in the non-a16 branch
7312 const bool UseNSA = ST.hasNSAEncoding() &&
7313 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7314 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7315 const bool UsePartialNSA =
7316 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7317
7318 if (UsePartialNSA) {
7319 // Pack registers that would go over NSAMaxSize into last VAddr register
7320 LLT PackedAddrTy =
7321 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
7322 auto Concat = B.buildConcatVectors(
7323 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
7324 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
7325 PackedRegs.resize(N: NSAMaxSize);
7326 } else if (!UseNSA && PackedRegs.size() > 1) {
     // Non-NSA: all packed dwords are concatenated into a single register.
7327 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
7328 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
7329 PackedRegs[0] = Concat.getReg(Idx: 0);
7330 PackedRegs.resize(N: 1);
7331 }
7332
     // Write the packed registers into the leading vaddr slots and blank the
     // remaining register slots with $noreg. The slot position, not the count
     // of register operands seen, selects the PackedRegs entry.
7333 const unsigned NumPacked = PackedRegs.size();
7334 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7335 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
7336 if (!SrcOp.isReg()) {
7337 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7338 continue;
7339 }
7340
7341 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7342
7343 if (I - Intr->VAddrStart < NumPacked)
7344 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7345 else
7346 SrcOp.setReg(AMDGPU::NoRegister);
7347 }
7348 } else {
7349 // If the register allocator cannot place the address registers contiguously
7350 // without introducing moves, then using the non-sequential address encoding
7351 // is always preferable, since it saves VALU instructions and is usually a
7352 // wash in terms of code size or even better.
7353 //
7354 // However, we currently have no way of hinting to the register allocator
7355 // that MIMG addresses should be placed contiguously when it is possible to
7356 // do so, so force non-NSA for the common 2-address case as a heuristic.
7357 //
7358 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7359 // allocation when possible.
7360 //
7361 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7362 // set of the remaining addresses.
7363 const bool UseNSA = ST.hasNSAEncoding() &&
7364 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7365 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7366 const bool UsePartialNSA =
7367 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7368
7369 if (UsePartialNSA) {
     // Only the addresses from slot NSAMaxSize - 1 onwards are merged.
7370 convertImageAddrToPacked(B, MI,
7371 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7372 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
7373 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7374 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
7375 NumVAddrs: Intr->NumVAddrs);
7376 }
7377 }
7378
     // Append a trailing immediate recording the address mode:
     // bit 0 = A16, bit 1 = G16.
7379 int Flags = 0;
7380 if (IsA16)
7381 Flags |= 1;
7382 if (IsG16)
7383 Flags |= 2;
7384 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
7385
7386 if (BaseOpcode->NoReturn) { // No TFE for stores?
7387 // TODO: Handle dmask trim
7388 if (!Ty.isVector() || !IsD16)
7389 return true;
7390
     // Only D16 vector data needs repacking; the repacked register replaces
     // operand 1.
7391 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
7392 if (RepackedReg != VData) {
7393 MI.getOperand(i: 1).setReg(RepackedReg);
7394 }
7395
7396 return true;
7397 }
7398
     // Everything below handles opcodes that produce a result.
7399 Register DstReg = MI.getOperand(i: 0).getReg();
7400 const LLT EltTy = Ty.getScalarType();
7401 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7402
7403 // Confirm that the return type is large enough for the dmask specified
7404 if (NumElts < DMaskLanes)
7405 return false;
7406
7407 if (NumElts > 4 || DMaskLanes > 4)
7408 return false;
7409
7410 // Image atomic instructions are using DMask to specify how many bits
7411 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7412 // DMaskLanes for image atomic has default value '0'.
7413 // We must be sure that atomic variants (especially packed) will not be
7414 // truncated from v2s16 or v4s16 to s16 type.
7415 //
7416 // ChangeElementCount will be needed for image load where Ty is always scalar.
7417 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7418 const LLT AdjustedTy =
7419 DMaskLanes == 0
7420 ? Ty
7421 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
7422
7423 // The raw dword aligned data component of the load. The only legal cases
7424 // where this matters should be when using the packed D16 format, for
7425 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7426 LLT RoundedTy;
7427
7428 // S32 vector to cover all data, plus TFE result element.
7429 LLT TFETy;
7430
7431 // Register type to use for each loaded component. Will be S32 or V2S16.
7432 LLT RegTy;
7433
7434 if (IsD16 && ST.hasUnpackedD16VMem()) {
     // Unpacked D16: each 16-bit component occupies its own 32-bit register.
7435 RoundedTy =
7436 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
7437 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
7438 RegTy = S32;
7439 } else {
     // Round the data size up to a whole number of dwords while keeping the
     // original element size.
7440 unsigned EltSize = EltTy.getSizeInBits();
7441 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7442 unsigned RoundedSize = 32 * RoundedElts;
7443 RoundedTy = LLT::scalarOrVector(
7444 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
7445 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
7446 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7447 }
7448
7449 // The return type does not need adjustment.
7450 // TODO: Should we change s16 case to s32 or <2 x s16>?
7451 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7452 return true;
7453
7454 Register Dst1Reg;
7455
7456 // Insert after the instruction.
7457 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
7458
7459 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7460 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7461 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7462 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7463
7464 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
7465
     // MI now defines the widened register; the code below rebuilds DstReg
     // (and Dst1Reg for TFE) from it.
7466 MI.getOperand(i: 0).setReg(NewResultReg);
7467
7468 // In the IR, TFE is supposed to be used with a 2 element struct return
7469 // type. The instruction really returns these two values in one contiguous
7470 // register, with one additional dword beyond the loaded data. Rewrite the
7471 // return type to use a single register result.
7472
7473 if (IsTFE) {
7474 Dst1Reg = MI.getOperand(i: 1).getReg();
7475 if (MRI->getType(Reg: Dst1Reg) != S32)
7476 return false;
7477
7478 // TODO: Make sure the TFE operand bit is set.
7479 MI.removeOperand(OpNo: 1);
7480
7481 // Handle the easy case that requires no repack instructions.
7482 if (Ty == S32) {
7483 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
7484 return true;
7485 }
7486 }
7487
7488 // Now figure out how to copy the new result register back into the old
7489 // result.
     // Every slot starts out as Dst1Reg. With TFE only the first NumDataRegs
     // slots are replaced below, so the final unmerge result is written
     // straight into the original status register.
7490 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7491
7492 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7493
7494 if (ResultNumRegs == 1) {
7495 assert(!IsTFE);
7496 ResultRegs[0] = NewResultReg;
7497 } else {
7498 // We have to repack into a new vector of some kind.
7499 for (int I = 0; I != NumDataRegs; ++I)
7500 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
7501 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
7502
7503 // Drop the final TFE element to get the data part. The TFE result is
7504 // directly written to the right place already.
7505 if (IsTFE)
7506 ResultRegs.resize(N: NumDataRegs);
7507 }
7508
7509 // For an s16 scalar result, we form an s32 result with a truncate regardless
7510 // of packed vs. unpacked.
7511 if (IsD16 && !Ty.isVector()) {
7512 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
7513 return true;
7514 }
7515
7516 // Avoid a build/concat_vector of 1 entry.
7517 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7518 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
7519 return true;
7520 }
7521
7522 assert(Ty.isVector());
7523
7524 if (IsD16) {
7525 // For packed D16 results with TFE enabled, all the data components are
7526 // S32. Cast back to the expected type.
7527 //
7528 // TODO: We don't really need to use load s32 elements. We would only need one
7529 // cast for the TFE result if a multiple of v2s16 was used.
7530 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7531 for (Register &Reg : ResultRegs)
7532 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
7533 } else if (ST.hasUnpackedD16VMem()) {
     // Unpacked layout: each s32 register carries one 16-bit component.
7534 for (Register &Reg : ResultRegs)
7535 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
7536 }
7537 }
7538
     // Append NumElts copies of a single shared undef of type Ty to ResultRegs.
     // NOTE(review): the loop compares with `!=`, so a negative NumElts would
     // not terminate; the call sites appear to rely on ResultRegs never being
     // larger than the target count - confirm.
7539 auto padWithUndef = [&](LLT Ty, int NumElts) {
7540 if (NumElts == 0)
7541 return;
7542 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
7543 for (int I = 0; I != NumElts; ++I)
7544 ResultRegs.push_back(Elt: Undef);
7545 };
7546
7547 // Pad out any elements eliminated due to the dmask.
7548 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
7549 if (!ResTy.isVector()) {
     // Scalar pieces: one register per destination element.
7550 padWithUndef(ResTy, NumElts - ResultRegs.size());
7551 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
7552 return true;
7553 }
7554
7555 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
     // Number of <2 x s16> registers needed to cover the destination type.
7556 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7557
7558 // Deal with the one annoying legal case.
7559 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
7560 if (Ty == V3S16) {
7561 if (IsTFE) {
     // With TFE the data was split into v2s16 pieces above; reassemble them
     // into one register before trimming or padding to <3 x s16>.
7562 if (ResultRegs.size() == 1) {
7563 NewResultReg = ResultRegs[0];
7564 } else if (ResultRegs.size() == 2) {
7565 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
7566 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
7567 } else {
7568 return false;
7569 }
7570 }
7571
7572 if (MRI->getType(Reg: DstReg).getNumElements() <
7573 MRI->getType(Reg: NewResultReg).getNumElements()) {
7574 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
7575 } else {
7576 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
7577 }
7578 return true;
7579 }
7580
7581 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7582 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
7583 return true;
7584}
7585
7586bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7587 MachineInstr &MI) const {
7588 MachineIRBuilder &B = Helper.MIRBuilder;
7589 GISelChangeObserver &Observer = Helper.Observer;
7590
7591 Register OrigDst = MI.getOperand(i: 0).getReg();
7592 Register Dst;
7593 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
7594 unsigned Size = Ty.getSizeInBits();
7595 MachineFunction &MF = B.getMF();
7596 unsigned Opc = 0;
7597 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7598 assert(Size == 8 || Size == 16);
7599 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7600 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7601 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7602 // destination register.
7603 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
7604 } else {
7605 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7606 Dst = OrigDst;
7607 }
7608
7609 Observer.changingInstr(MI);
7610
7611 // Handle needing to s.buffer.load() a p8 value.
7612 if (hasBufferRsrcWorkaround(Ty)) {
7613 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
7614 B.setInsertPt(MBB&: B.getMBB(), II: MI);
7615 }
7616 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
7617 Ty = getBitcastRegisterType(Ty);
7618 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
7619 B.setInsertPt(MBB&: B.getMBB(), II: MI);
7620 }
7621
7622 // FIXME: We don't really need this intermediate instruction. The intrinsic
7623 // should be fixed to have a memory operand. Since it's readnone, we're not
7624 // allowed to add one.
7625 MI.setDesc(B.getTII().get(Opcode: Opc));
7626 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
7627
7628 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7629 const unsigned MemSize = (Size + 7) / 8;
7630 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7631 Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
7632 MachineMemOperand *MMO = MF.getMachineMemOperand(
7633 PtrInfo: MachinePointerInfo(),
7634 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7635 MachineMemOperand::MOInvariant,
7636 Size: MemSize, BaseAlignment: MemAlign);
7637 MI.addMemOperand(MF, MO: MMO);
7638 if (Dst != OrigDst) {
7639 MI.getOperand(i: 0).setReg(Dst);
7640 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
7641 B.buildTrunc(Res: OrigDst, Op: Dst);
7642 }
7643
7644 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7645 // always be legal. We may need to restore this to a 96-bit result if it turns
7646 // out this needs to be converted to a vector load during RegBankSelect.
7647 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7648 if (Ty.isVector())
7649 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
7650 else
7651 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
7652 }
7653
7654 Observer.changedInstr(MI);
7655 return true;
7656}
7657
7658bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7659 MachineInstr &MI) const {
7660 MachineIRBuilder &B = Helper.MIRBuilder;
7661 GISelChangeObserver &Observer = Helper.Observer;
7662 Observer.changingInstr(MI);
7663 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7664 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
7665 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
7666 Observer.changedInstr(MI);
7667 return true;
7668}
7669
7670// TODO: Move to selection
7671bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7672 MachineRegisterInfo &MRI,
7673 MachineIRBuilder &B) const {
7674 if (!ST.hasTrapHandler() ||
7675 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7676 return legalizeTrapEndpgm(MI, MRI, B);
7677
7678 return ST.supportsGetDoorbellID() ?
7679 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7680}
7681
7682bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7683 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7684 const DebugLoc &DL = MI.getDebugLoc();
7685 MachineBasicBlock &BB = B.getMBB();
7686 MachineFunction *MF = BB.getParent();
7687
7688 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
7689 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
7690 .addImm(Val: 0);
7691 MI.eraseFromParent();
7692 return true;
7693 }
7694
7695 // We need a block split to make the real endpgm a terminator. We also don't
7696 // want to break phis in successor blocks, so we can't just delete to the
7697 // end of the block.
7698 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
7699 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7700 MF->push_back(MBB: TrapBB);
7701 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
7702 .addImm(Val: 0);
7703 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
7704 .addMBB(MBB: TrapBB);
7705
7706 BB.addSuccessor(Succ: TrapBB);
7707 MI.eraseFromParent();
7708 return true;
7709}
7710
7711bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7712 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7713 MachineFunction &MF = B.getMF();
7714 const LLT S64 = LLT::scalar(SizeInBits: 64);
7715
7716 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7717 // For code object version 5, queue_ptr is passed through implicit kernarg.
7718 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
7719 AMDGPU::AMDHSA_COV5) {
7720 AMDGPUTargetLowering::ImplicitParameter Param =
7721 AMDGPUTargetLowering::QUEUE_PTR;
7722 uint64_t Offset =
7723 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
7724
7725 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7726 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7727
7728 if (!loadInputValue(DstReg: KernargPtrReg, B,
7729 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
7730 return false;
7731
7732 // TODO: can we be smarter about machine pointer info?
7733 MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
7734 MachineMemOperand *MMO = MF.getMachineMemOperand(
7735 PtrInfo: PtrInfo.getWithOffset(O: Offset),
7736 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7737 MachineMemOperand::MOInvariant,
7738 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
7739
7740 // Pointer address
7741 Register LoadAddr = MRI.createGenericVirtualRegister(
7742 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7743 B.buildObjectPtrOffset(Res: LoadAddr, Op0: KernargPtrReg,
7744 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
7745 // Load address
7746 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
7747 B.buildCopy(Res: SGPR01, Op: Temp);
7748 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7749 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7750 .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
7751 MI.eraseFromParent();
7752 return true;
7753 }
7754
7755 // Pass queue pointer to trap handler as input, and insert trap instruction
7756 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7757 Register LiveIn =
7758 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7759 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
7760 return false;
7761
7762 B.buildCopy(Res: SGPR01, Op: LiveIn);
7763 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7764 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7765 .addReg(RegNo: SGPR01, Flags: RegState::Implicit);
7766
7767 MI.eraseFromParent();
7768 return true;
7769}
7770
7771bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7772 MachineRegisterInfo &MRI,
7773 MachineIRBuilder &B) const {
7774 // We need to simulate the 's_trap 2' instruction on targets that run in
7775 // PRIV=1 (where it is treated as a nop).
7776 if (ST.hasPrivEnabledTrap2NopBug()) {
7777 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7778 DL: MI.getDebugLoc());
7779 MI.eraseFromParent();
7780 return true;
7781 }
7782
7783 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7784 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7785 MI.eraseFromParent();
7786 return true;
7787}
7788
7789bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7790 MachineRegisterInfo &MRI,
7791 MachineIRBuilder &B) const {
7792 // Is non-HSA path or trap-handler disabled? Then, report a warning
7793 // accordingly
7794 if (!ST.hasTrapHandler() ||
7795 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7796 Function &Fn = B.getMF().getFunction();
7797 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7798 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7799 } else {
7800 // Insert debug-trap instruction
7801 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7802 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7803 }
7804
7805 MI.eraseFromParent();
7806 return true;
7807}
7808
7809bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7810 MachineInstr &MI, MachineIRBuilder &B) const {
7811 MachineRegisterInfo &MRI = *B.getMRI();
7812 const LLT S16 = LLT::scalar(SizeInBits: 16);
7813 const LLT S32 = LLT::scalar(SizeInBits: 32);
7814 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
7815 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
7816
7817 Register DstReg = MI.getOperand(i: 0).getReg();
7818 Register NodePtr = MI.getOperand(i: 2).getReg();
7819 Register RayExtent = MI.getOperand(i: 3).getReg();
7820 Register RayOrigin = MI.getOperand(i: 4).getReg();
7821 Register RayDir = MI.getOperand(i: 5).getReg();
7822 Register RayInvDir = MI.getOperand(i: 6).getReg();
7823 Register TDescr = MI.getOperand(i: 7).getReg();
7824
7825 if (!ST.hasGFX10_AEncoding()) {
7826 Function &Fn = B.getMF().getFunction();
7827 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7828 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7829 return false;
7830 }
7831
7832 const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
7833 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
7834 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
7835 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
7836 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
7837 const unsigned NumVDataDwords = 4;
7838 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7839 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7840 const bool UseNSA =
7841 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7842
7843 const unsigned BaseOpcodes[2][2] = {
7844 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7845 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7846 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7847 int Opcode;
7848 if (UseNSA) {
7849 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7850 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7851 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7852 : AMDGPU::MIMGEncGfx10NSA,
7853 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7854 } else {
7855 assert(!IsGFX12Plus);
7856 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7857 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7858 : AMDGPU::MIMGEncGfx10Default,
7859 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7860 }
7861 assert(Opcode != -1);
7862
7863 SmallVector<Register, 12> Ops;
7864 if (UseNSA && IsGFX11Plus) {
7865 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7866 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7867 auto Merged = B.buildMergeLikeInstr(
7868 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
7869 Ops.push_back(Elt: Merged.getReg(Idx: 0));
7870 };
7871
7872 Ops.push_back(Elt: NodePtr);
7873 Ops.push_back(Elt: RayExtent);
7874 packLanes(RayOrigin);
7875
7876 if (IsA16) {
7877 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7878 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7879 auto MergedDir = B.buildMergeLikeInstr(
7880 Res: V3S32,
7881 Ops: {B.buildBitcast(
7882 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
7883 UnmergeRayDir.getReg(Idx: 0)}))
7884 .getReg(Idx: 0),
7885 B.buildBitcast(
7886 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
7887 UnmergeRayDir.getReg(Idx: 1)}))
7888 .getReg(Idx: 0),
7889 B.buildBitcast(
7890 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
7891 UnmergeRayDir.getReg(Idx: 2)}))
7892 .getReg(Idx: 0)});
7893 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
7894 } else {
7895 packLanes(RayDir);
7896 packLanes(RayInvDir);
7897 }
7898 } else {
7899 if (Is64) {
7900 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
7901 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7902 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7903 } else {
7904 Ops.push_back(Elt: NodePtr);
7905 }
7906 Ops.push_back(Elt: RayExtent);
7907
7908 auto packLanes = [&Ops, &S32, &B](Register Src) {
7909 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7910 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7911 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7912 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
7913 };
7914
7915 packLanes(RayOrigin);
7916 if (IsA16) {
7917 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7918 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7919 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
7920 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
7921 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
7922 B.buildMergeLikeInstr(Res: R1,
7923 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
7924 B.buildMergeLikeInstr(
7925 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
7926 B.buildMergeLikeInstr(
7927 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
7928 Ops.push_back(Elt: R1);
7929 Ops.push_back(Elt: R2);
7930 Ops.push_back(Elt: R3);
7931 } else {
7932 packLanes(RayDir);
7933 packLanes(RayInvDir);
7934 }
7935 }
7936
7937 if (!UseNSA) {
7938 // Build a single vector containing all the operands so far prepared.
7939 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
7940 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
7941 Ops.clear();
7942 Ops.push_back(Elt: MergedOps);
7943 }
7944
7945 auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7946 .addDef(RegNo: DstReg)
7947 .addImm(Val: Opcode);
7948
7949 for (Register R : Ops) {
7950 MIB.addUse(RegNo: R);
7951 }
7952
7953 MIB.addUse(RegNo: TDescr)
7954 .addImm(Val: IsA16 ? 1 : 0)
7955 .cloneMemRefs(OtherMI: MI);
7956
7957 MI.eraseFromParent();
7958 return true;
7959}
7960
7961bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7962 MachineInstr &MI, MachineIRBuilder &B) const {
7963 const LLT S32 = LLT::scalar(SizeInBits: 32);
7964 const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
7965
7966 Register DstReg = MI.getOperand(i: 0).getReg();
7967 Register DstOrigin = MI.getOperand(i: 1).getReg();
7968 Register DstDir = MI.getOperand(i: 2).getReg();
7969 Register NodePtr = MI.getOperand(i: 4).getReg();
7970 Register RayExtent = MI.getOperand(i: 5).getReg();
7971 Register InstanceMask = MI.getOperand(i: 6).getReg();
7972 Register RayOrigin = MI.getOperand(i: 7).getReg();
7973 Register RayDir = MI.getOperand(i: 8).getReg();
7974 Register Offsets = MI.getOperand(i: 9).getReg();
7975 Register TDescr = MI.getOperand(i: 10).getReg();
7976
7977 if (!ST.hasBVHDualAndBVH8Insts()) {
7978 Function &Fn = B.getMF().getFunction();
7979 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7980 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7981 return false;
7982 }
7983
7984 bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
7985 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7986 const unsigned NumVDataDwords = 10;
7987 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7988 int Opcode = AMDGPU::getMIMGOpcode(
7989 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7990 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7991 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7992 assert(Opcode != -1);
7993
7994 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7995 Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});
7996
7997 B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7998 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7999 .addDef(RegNo: DstReg)
8000 .addDef(RegNo: DstOrigin)
8001 .addDef(RegNo: DstDir)
8002 .addImm(Val: Opcode)
8003 .addUse(RegNo: NodePtr)
8004 .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
8005 .addUse(RegNo: RayOrigin)
8006 .addUse(RegNo: RayDir)
8007 .addUse(RegNo: Offsets)
8008 .addUse(RegNo: TDescr)
8009 .cloneMemRefs(OtherMI: MI);
8010
8011 MI.eraseFromParent();
8012 return true;
8013}
8014
8015bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
8016 MachineIRBuilder &B) const {
8017 const SITargetLowering *TLI = ST.getTargetLowering();
8018 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
8019 Register DstReg = MI.getOperand(i: 0).getReg();
8020 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
8021 MI.eraseFromParent();
8022 return true;
8023}
8024
8025bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
8026 MachineIRBuilder &B) const {
8027 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8028 if (!ST.hasArchitectedSGPRs())
8029 return false;
8030 LLT S32 = LLT::scalar(SizeInBits: 32);
8031 Register DstReg = MI.getOperand(i: 0).getReg();
8032 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
8033 auto LSB = B.buildConstant(Res: S32, Val: 25);
8034 auto Width = B.buildConstant(Res: S32, Val: 5);
8035 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
8036 MI.eraseFromParent();
8037 return true;
8038}
8039
8040bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
8041 MachineIRBuilder &B,
8042 AMDGPU::Hwreg::Id HwReg,
8043 unsigned LowBit,
8044 unsigned Width) const {
8045 MachineRegisterInfo &MRI = *B.getMRI();
8046 Register DstReg = MI.getOperand(i: 0).getReg();
8047 if (!MRI.getRegClassOrNull(Reg: DstReg))
8048 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32RegClass);
8049 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32_const)
8050 .addDef(RegNo: DstReg)
8051 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width));
8052 MI.eraseFromParent();
8053 return true;
8054}
8055
8056static constexpr unsigned FPEnvModeBitField =
8057 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
8058
8059static constexpr unsigned FPEnvTrapBitField =
8060 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
8061
8062bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
8063 MachineRegisterInfo &MRI,
8064 MachineIRBuilder &B) const {
8065 Register Src = MI.getOperand(i: 0).getReg();
8066 if (MRI.getType(Reg: Src) != S64)
8067 return false;
8068
8069 auto ModeReg =
8070 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
8071 /*HasSideEffects=*/true, /*isConvergent=*/false)
8072 .addImm(Val: FPEnvModeBitField);
8073 auto TrapReg =
8074 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
8075 /*HasSideEffects=*/true, /*isConvergent=*/false)
8076 .addImm(Val: FPEnvTrapBitField);
8077 B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
8078 MI.eraseFromParent();
8079 return true;
8080}
8081
8082bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
8083 MachineRegisterInfo &MRI,
8084 MachineIRBuilder &B) const {
8085 Register Src = MI.getOperand(i: 0).getReg();
8086 if (MRI.getType(Reg: Src) != S64)
8087 return false;
8088
8089 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
8090 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
8091 /*HasSideEffects=*/true, /*isConvergent=*/false)
8092 .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
8093 .addReg(RegNo: Unmerge.getReg(Idx: 0));
8094 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
8095 /*HasSideEffects=*/true, /*isConvergent=*/false)
8096 .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
8097 .addReg(RegNo: Unmerge.getReg(Idx: 1));
8098 MI.eraseFromParent();
8099 return true;
8100}
8101
8102bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
8103 MachineInstr &MI) const {
8104 MachineIRBuilder &B = Helper.MIRBuilder;
8105 MachineRegisterInfo &MRI = *B.getMRI();
8106
8107 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
8108 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
8109 switch (IntrID) {
8110 case Intrinsic::amdgcn_icmp: {
8111 // amdgcn.icmp(i1 src0, i1 0, NE) -> ballot(src0)
8112 // This is the only valid form of amdgcn.icmp with i1 inputs.
8113 Register Src0 = MI.getOperand(i: 2).getReg();
8114 LLT SrcTy = MRI.getType(Reg: Src0);
8115 if (SrcTy != LLT::scalar(SizeInBits: 1))
8116 return true; // Not i1, leave for default handling.
8117
8118 // Check that src1 is constant 0.
8119 Register Src1 = MI.getOperand(i: 3).getReg();
8120 auto Src1Const = getIConstantVRegValWithLookThrough(VReg: Src1, MRI);
8121 if (!Src1Const || Src1Const->Value != 0)
8122 return false; // Invalid i1 icmp form.
8123
8124 // Check that predicate is ICMP_NE.
8125 int64_t Pred = MI.getOperand(i: 4).getImm();
8126 if (Pred != CmpInst::ICMP_NE)
8127 return false; // Invalid i1 icmp form.
8128
8129 // Convert to ballot.
8130 Register Dst = MI.getOperand(i: 0).getReg();
8131 B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot, Res: Dst).addUse(RegNo: Src0);
8132 MI.eraseFromParent();
8133 return true;
8134 }
8135 case Intrinsic::sponentry:
8136 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8137 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8138 // that we can remove this cast.
8139 const LLT S32 = LLT::scalar(SizeInBits: 32);
8140 Register TmpReg = MRI.createGenericVirtualRegister(Ty: S32);
8141 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_SPONENTRY).addDef(RegNo: TmpReg);
8142
8143 Register DstReg = MI.getOperand(i: 0).getReg();
8144 B.buildIntToPtr(Dst: DstReg, Src: TmpReg);
8145 MI.eraseFromParent();
8146 } else {
8147 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8148 Size: 1, SPOffset: 0, /*IsImmutable=*/false);
8149 B.buildFrameIndex(Res: MI.getOperand(i: 0), Idx: FI);
8150 MI.eraseFromParent();
8151 }
8152 return true;
8153 case Intrinsic::amdgcn_if:
8154 case Intrinsic::amdgcn_else: {
8155 MachineInstr *Br = nullptr;
8156 MachineBasicBlock *UncondBrTarget = nullptr;
8157 bool Negated = false;
8158 if (MachineInstr *BrCond =
8159 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8160 const SIRegisterInfo *TRI
8161 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8162
8163 Register Def = MI.getOperand(i: 1).getReg();
8164 Register Use = MI.getOperand(i: 3).getReg();
8165
8166 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
8167
8168 if (Negated)
8169 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
8170
8171 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
8172 if (IntrID == Intrinsic::amdgcn_if) {
8173 B.buildInstr(Opcode: AMDGPU::SI_IF)
8174 .addDef(RegNo: Def)
8175 .addUse(RegNo: Use)
8176 .addMBB(MBB: UncondBrTarget);
8177 } else {
8178 B.buildInstr(Opcode: AMDGPU::SI_ELSE)
8179 .addDef(RegNo: Def)
8180 .addUse(RegNo: Use)
8181 .addMBB(MBB: UncondBrTarget);
8182 }
8183
8184 if (Br) {
8185 Br->getOperand(i: 0).setMBB(CondBrTarget);
8186 } else {
8187 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8188 // since we're swapping branch targets it needs to be reinserted.
8189 // FIXME: IRTranslator should probably not do this
8190 B.buildBr(Dest&: *CondBrTarget);
8191 }
8192
8193 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
8194 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
8195 MI.eraseFromParent();
8196 BrCond->eraseFromParent();
8197 return true;
8198 }
8199
8200 return false;
8201 }
8202 case Intrinsic::amdgcn_loop: {
8203 MachineInstr *Br = nullptr;
8204 MachineBasicBlock *UncondBrTarget = nullptr;
8205 bool Negated = false;
8206 if (MachineInstr *BrCond =
8207 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8208 const SIRegisterInfo *TRI
8209 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8210
8211 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
8212 Register Reg = MI.getOperand(i: 2).getReg();
8213
8214 if (Negated)
8215 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
8216
8217 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
8218 B.buildInstr(Opcode: AMDGPU::SI_LOOP)
8219 .addUse(RegNo: Reg)
8220 .addMBB(MBB: UncondBrTarget);
8221
8222 if (Br)
8223 Br->getOperand(i: 0).setMBB(CondBrTarget);
8224 else
8225 B.buildBr(Dest&: *CondBrTarget);
8226
8227 MI.eraseFromParent();
8228 BrCond->eraseFromParent();
8229 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
8230 return true;
8231 }
8232
8233 return false;
8234 }
8235 case Intrinsic::amdgcn_wave_reduce_min:
8236 case Intrinsic::amdgcn_wave_reduce_umin:
8237 case Intrinsic::amdgcn_wave_reduce_max:
8238 case Intrinsic::amdgcn_wave_reduce_umax:
8239 case Intrinsic::amdgcn_wave_reduce_add:
8240 case Intrinsic::amdgcn_wave_reduce_sub:
8241 case Intrinsic::amdgcn_wave_reduce_and:
8242 case Intrinsic::amdgcn_wave_reduce_or:
8243 case Intrinsic::amdgcn_wave_reduce_xor: {
8244 Register SrcReg = MI.getOperand(i: 2).getReg();
8245 if (MRI.getType(Reg: SrcReg) != LLT::scalar(SizeInBits: 16))
8246 return true;
8247 Register DstReg = MI.getOperand(i: 0).getReg();
8248 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8249 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8250 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8251 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8252 auto Ext = NeedsSignExt ? B.buildSExt(Res: LLT::scalar(SizeInBits: 32), Op: SrcReg)
8253 : B.buildZExt(Res: LLT::scalar(SizeInBits: 32), Op: SrcReg);
8254 auto NewDst = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
8255 B.buildIntrinsic(ID: IntrID, Res: ArrayRef<Register>{NewDst},
8256 /*hasSideEffects=*/HasSideEffects: false, /*isConvergent=*/true)
8257 .addUse(RegNo: Ext.getReg(Idx: 0))
8258 .addImm(Val: MI.getOperand(i: 3).getImm()); // strategy
8259 B.buildTrunc(Res: DstReg, Op: NewDst);
8260 MI.eraseFromParent();
8261 return true;
8262 }
8263 case Intrinsic::amdgcn_addrspacecast_nonnull:
8264 return legalizeAddrSpaceCast(MI, MRI, B);
8265 case Intrinsic::amdgcn_make_buffer_rsrc:
8266 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8267 case Intrinsic::amdgcn_kernarg_segment_ptr:
8268 if (!AMDGPU::isKernel(F: B.getMF().getFunction())) {
8269 // This only makes sense to call in a kernel, so just lower to null.
8270 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
8271 MI.eraseFromParent();
8272 return true;
8273 }
8274
8275 return legalizePreloadedArgIntrin(
8276 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8277 case Intrinsic::amdgcn_implicitarg_ptr:
8278 return legalizeImplicitArgPtr(MI, MRI, B);
8279 case Intrinsic::amdgcn_workitem_id_x:
8280 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
8281 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
8282 case Intrinsic::amdgcn_workitem_id_y:
8283 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
8284 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
8285 case Intrinsic::amdgcn_workitem_id_z:
8286 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
8287 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
8288 case Intrinsic::amdgcn_workgroup_id_x:
8289 return legalizeWorkGroupId(
8290 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
8291 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
8292 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
8293 case Intrinsic::amdgcn_workgroup_id_y:
8294 return legalizeWorkGroupId(
8295 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
8296 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
8297 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
8298 case Intrinsic::amdgcn_workgroup_id_z:
8299 return legalizeWorkGroupId(
8300 MI, B, WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
8301 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
8302 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
8303 case Intrinsic::amdgcn_cluster_id_x:
8304 return ST.hasClusters() &&
8305 legalizePreloadedArgIntrin(MI, MRI, B,
8306 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8307 case Intrinsic::amdgcn_cluster_id_y:
8308 return ST.hasClusters() &&
8309 legalizePreloadedArgIntrin(MI, MRI, B,
8310 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8311 case Intrinsic::amdgcn_cluster_id_z:
8312 return ST.hasClusters() &&
8313 legalizePreloadedArgIntrin(MI, MRI, B,
8314 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8315 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8316 return ST.hasClusters() &&
8317 legalizePreloadedArgIntrin(
8318 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
8319 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8320 return ST.hasClusters() &&
8321 legalizePreloadedArgIntrin(
8322 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
8323 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8324 return ST.hasClusters() &&
8325 legalizePreloadedArgIntrin(
8326 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
8327 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8328 return ST.hasClusters() &&
8329 legalizeConstHwRegRead(MI, B, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4);
8330 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8331 return ST.hasClusters() &&
8332 legalizePreloadedArgIntrin(
8333 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
8334 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8335 return ST.hasClusters() &&
8336 legalizePreloadedArgIntrin(
8337 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
8338 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8339 return ST.hasClusters() &&
8340 legalizePreloadedArgIntrin(
8341 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
8342 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8343 return ST.hasClusters() &&
8344 legalizePreloadedArgIntrin(
8345 MI, MRI, B,
8346 ArgType: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
8347 case Intrinsic::amdgcn_wave_id:
8348 return legalizeWaveID(MI, B);
8349 case Intrinsic::amdgcn_lds_kernel_id:
8350 return legalizePreloadedArgIntrin(MI, MRI, B,
8351 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8352 case Intrinsic::amdgcn_dispatch_ptr:
8353 return legalizePreloadedArgIntrin(MI, MRI, B,
8354 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
8355 case Intrinsic::amdgcn_queue_ptr:
8356 return legalizePreloadedArgIntrin(MI, MRI, B,
8357 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
8358 case Intrinsic::amdgcn_implicit_buffer_ptr:
8359 return legalizePreloadedArgIntrin(
8360 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8361 case Intrinsic::amdgcn_dispatch_id:
8362 return legalizePreloadedArgIntrin(MI, MRI, B,
8363 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
8364 case Intrinsic::r600_read_ngroups_x:
8365 // TODO: Emit error for hsa
8366 return legalizeKernargMemParameter(MI, B,
8367 Offset: SI::KernelInputOffsets::NGROUPS_X);
8368 case Intrinsic::r600_read_ngroups_y:
8369 return legalizeKernargMemParameter(MI, B,
8370 Offset: SI::KernelInputOffsets::NGROUPS_Y);
8371 case Intrinsic::r600_read_ngroups_z:
8372 return legalizeKernargMemParameter(MI, B,
8373 Offset: SI::KernelInputOffsets::NGROUPS_Z);
8374 case Intrinsic::r600_read_local_size_x:
8375 // TODO: Could insert G_ASSERT_ZEXT from s16
8376 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
8377 case Intrinsic::r600_read_local_size_y:
8378 // TODO: Could insert G_ASSERT_ZEXT from s16
8379 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
8380 // TODO: Could insert G_ASSERT_ZEXT from s16
8381 case Intrinsic::r600_read_local_size_z:
8382 return legalizeKernargMemParameter(MI, B,
8383 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
8384 case Intrinsic::amdgcn_fdiv_fast:
8385 return legalizeFDIVFastIntrin(MI, MRI, B);
8386 case Intrinsic::amdgcn_is_shared:
8387 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
8388 case Intrinsic::amdgcn_is_private:
8389 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
8390 case Intrinsic::amdgcn_wavefrontsize: {
8391 B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
8392 MI.eraseFromParent();
8393 return true;
8394 }
8395 case Intrinsic::amdgcn_s_buffer_load:
8396 return legalizeSBufferLoad(Helper, MI);
8397 case Intrinsic::amdgcn_raw_buffer_store:
8398 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8399 case Intrinsic::amdgcn_struct_buffer_store:
8400 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8401 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
8402 case Intrinsic::amdgcn_raw_buffer_store_format:
8403 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8404 case Intrinsic::amdgcn_struct_buffer_store_format:
8405 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8406 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
8407 case Intrinsic::amdgcn_raw_tbuffer_store:
8408 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8409 case Intrinsic::amdgcn_struct_tbuffer_store:
8410 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8411 return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
8412 case Intrinsic::amdgcn_raw_buffer_load:
8413 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8414 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8415 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8416 case Intrinsic::amdgcn_struct_buffer_load:
8417 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8418 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8419 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8420 return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
8421 case Intrinsic::amdgcn_raw_buffer_load_format:
8422 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8423 case Intrinsic::amdgcn_struct_buffer_load_format:
8424 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8425 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
8426 case Intrinsic::amdgcn_raw_tbuffer_load:
8427 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8428 case Intrinsic::amdgcn_struct_tbuffer_load:
8429 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8430 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
8431 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8433 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8434 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8435 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8437 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8438 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8439 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8440 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8441 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8442 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8443 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8445 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8447 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8449 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8450 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8451 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8453 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8454 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8455 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8457 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8458 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8459 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8461 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8462 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8463 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8464 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8465 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8466 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8467 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8469 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8470 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8471 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8473 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8475 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8476 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8477 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8479 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8481 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8483 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8484 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8485 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8487 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8488 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8489 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8491 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8493 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8495 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8496 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8497 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8498 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8499 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8500 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8501 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8503 return legalizeBufferAtomic(MI, B, IID: IntrID);
8504 case Intrinsic::amdgcn_rsq_clamp:
8505 return legalizeRsqClampIntrinsic(MI, MRI, B);
8506 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8507 return legalizeBVHIntersectRayIntrinsic(MI, B);
8508 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8509 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8510 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
8511 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8512 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8513 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8514 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8515 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8516 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8517 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8518 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8519 Register Index = MI.getOperand(i: 5).getReg();
8520 LLT S64 = LLT::scalar(SizeInBits: 64);
8521 LLT IndexArgTy = MRI.getType(Reg: Index);
8522 if (IndexArgTy != S64) {
8523 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: S64, Src: Index)
8524 : B.buildAnyExt(Res: S64, Op: Index);
8525 MI.getOperand(i: 5).setReg(NewIndex.getReg(Idx: 0));
8526 }
8527 return true;
8528 }
8529 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8530 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8534 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8535 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8536 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8537 Register Index = MI.getOperand(i: 5).getReg();
8538 LLT S32 = LLT::scalar(SizeInBits: 32);
8539 if (MRI.getType(Reg: Index) != S32)
8540 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
8541 return true;
8542 }
8543 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8544 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8545 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8546 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8547 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8548 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8549 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8550 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8551 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8552 Register Index = MI.getOperand(i: 7).getReg();
8553 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8554 ? LLT::scalar(SizeInBits: 64)
8555 : LLT::scalar(SizeInBits: 32);
8556 LLT IndexArgTy = MRI.getType(Reg: Index);
8557 if (IndexArgTy != IdxTy) {
8558 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(Dst: IdxTy, Src: Index)
8559 : B.buildAnyExt(Res: IdxTy, Op: Index);
8560 MI.getOperand(i: 7).setReg(NewIndex.getReg(Idx: 0));
8561 }
8562 return true;
8563 }
8564
8565 case Intrinsic::amdgcn_fmed3: {
8566 GISelChangeObserver &Observer = Helper.Observer;
8567
8568 // FIXME: This is to workaround the inability of tablegen match combiners to
8569 // match intrinsics in patterns.
8570 Observer.changingInstr(MI);
8571 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
8572 MI.removeOperand(OpNo: 1);
8573 Observer.changedInstr(MI);
8574 return true;
8575 }
8576 case Intrinsic::amdgcn_readlane:
8577 case Intrinsic::amdgcn_writelane:
8578 case Intrinsic::amdgcn_readfirstlane:
8579 case Intrinsic::amdgcn_permlane16:
8580 case Intrinsic::amdgcn_permlanex16:
8581 case Intrinsic::amdgcn_permlane64:
8582 case Intrinsic::amdgcn_set_inactive:
8583 case Intrinsic::amdgcn_set_inactive_chain_arg:
8584 case Intrinsic::amdgcn_mov_dpp8:
8585 case Intrinsic::amdgcn_update_dpp:
8586 case Intrinsic::amdgcn_permlane_bcast:
8587 case Intrinsic::amdgcn_permlane_up:
8588 case Intrinsic::amdgcn_permlane_down:
8589 case Intrinsic::amdgcn_permlane_xor:
8590 return legalizeLaneOp(Helper, MI, IID: IntrID);
8591 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8592 return legalizeSBufferPrefetch(Helper, MI);
8593 case Intrinsic::amdgcn_dead: {
8594 // TODO: Use poison instead of undef
8595 for (const MachineOperand &Def : MI.defs())
8596 B.buildUndef(Res: Def);
8597 MI.eraseFromParent();
8598 return true;
8599 }
8600 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8601 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8602 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8603 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8604 B.buildLoad(Res: MI.getOperand(i: 0), Addr: MI.getOperand(i: 2), MMO&: **MI.memoperands_begin());
8605 MI.eraseFromParent();
8606 return true;
8607 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8608 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8609 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8610 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8611 B.buildStore(Val: MI.getOperand(i: 2), Addr: MI.getOperand(i: 1), MMO&: **MI.memoperands_begin());
8612 MI.eraseFromParent();
8613 return true;
8614 case Intrinsic::amdgcn_av_load_b128:
8615 case Intrinsic::amdgcn_av_store_b128: {
8616 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
8617 if (!ST.hasFlatGlobalInsts()) {
8618 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8619 ? "llvm.amdgcn.av.load.b128"
8620 : "llvm.amdgcn.av.store.b128";
8621 Function &Fn = B.getMF().getFunction();
8622 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
8623 Fn, Twine(Name) + " not supported on subtarget", MI.getDebugLoc()));
8624 return false;
8625 }
8626 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8627 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8628 B.buildLoad(Res: MI.getOperand(i: 0), Addr: MI.getOperand(i: 2), MMO&: **MI.memoperands_begin());
8629 else
8630 B.buildStore(Val: MI.getOperand(i: 2), Addr: MI.getOperand(i: 1),
8631 MMO&: **MI.memoperands_begin());
8632 MI.eraseFromParent();
8633 return true;
8634 }
8635 case Intrinsic::amdgcn_flat_load_monitor_b32:
8636 case Intrinsic::amdgcn_flat_load_monitor_b64:
8637 case Intrinsic::amdgcn_flat_load_monitor_b128:
8638 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8639 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8640 .add(MO: MI.getOperand(i: 0))
8641 .add(MO: MI.getOperand(i: 2))
8642 .addMemOperand(MMO: *MI.memoperands_begin());
8643 MI.eraseFromParent();
8644 return true;
8645 case Intrinsic::amdgcn_global_load_monitor_b32:
8646 case Intrinsic::amdgcn_global_load_monitor_b64:
8647 case Intrinsic::amdgcn_global_load_monitor_b128:
8648 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8649 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8650 .add(MO: MI.getOperand(i: 0))
8651 .add(MO: MI.getOperand(i: 2))
8652 .addMemOperand(MMO: *MI.memoperands_begin());
8653 MI.eraseFromParent();
8654 return true;
8655 default: {
8656 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8657 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
8658 return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
8659 return true;
8660 }
8661 }
8662
8663 return true;
8664}
8665