//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <numeric>
#include <optional>

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
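///
/// Illustrative examples (values assumed for exposition): breaking
/// \p OrigTy = s70 into \p NarrowTy = s32 pieces yields {2, 1} with
/// \p LeftoverTy = s6; breaking v3s32 into v2s32 yields {1, 1} with
/// \p LeftoverTy = s32.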
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy =
        LLT::scalarOrVector(ElementCount::getFixed(LeftoverSize / EltSize),
                            OrigTy.getElementType());
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), VT(nullptr) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelValueTracking *VT)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), VT(VT) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

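// Assemble \p DstReg of type \p ResultTy from \p PartTy-sized \p PartRegs,
// plus an optional leftover piece. Illustrative example (assumed types): an
// s70 result assembled from two s32 parts and one s6 leftover register.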
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different numbers of elements and insert into
  // DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs);
    AllRegs.append(LeftoverRegs.begin(), LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
               MIRBuilder, MRI);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
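/// For example (illustrative), a v2s32 part plus an s32 leftover are broken
/// down into scalar elements and re-merged into a v3s32 \p DstReg.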
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (!MRI.getType(Leftover).isVector())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to
    // do anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

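// Worked example for buildLCMMergePieces (illustrative, assumed types): with
// DstTy = s96, NarrowTy = s64 and GCDTy = s32, LCMTy is s192, so NumParts = 3
// and NumSubParts = 2. The three original s32 pieces fill the first three of
// the six GCD-sized slots; the rest are filled according to PadStrategy.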
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can
  // reuse the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

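// For example (illustrative): remerging an s96 destination from s64-based
// LCM pieces builds an s192 merge and truncates it to s96; when LCMTy is a
// vector, the merge is instead unmerged into DstTy-sized chunks, with the
// first chunk defining DstReg.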
void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

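// Maps a generic opcode and scalar size to the matching runtime library call,
// e.g. G_FSIN with Size == 32 yields RTLIB::SIN_F32, which targets typically
// lower to a call to sinf.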
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_LROUND:
    RTLIBCASE(LROUND_F);
  case TargetOpcode::G_LLROUND:
    RTLIBCASE(LLROUND_F);
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FSINCOS:
    RTLIBCASE(SINCOS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    LostDebugLocObserver &LocObserver, MachineInstr *MI) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  if (!Name)
    return LegalizerHelper::UnableToLegalize;
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
}

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType, LostDebugLocObserver &LocObserver) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args,
                       LocObserver, &MI);
}

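// G_FSINCOS is lowered via the combined sincos library routine: results are
// returned through pointer arguments into two stack temporaries and then
// loaded back into the destination registers. Rough sketch of the emitted
// MIR for an s32 source (illustrative; names and call syntax assumed):
//
//   %sinaddr:_(p0) = G_FRAME_INDEX %stack.0
//   %cosaddr:_(p0) = G_FRAME_INDEX %stack.1
//   BL &sincosf (args: %src, %sinaddr, %cosaddr)
//   %dstsin:_(s32) = G_LOAD %sinaddr
//   %dstcos:_(s32) = G_LOAD %cosaddr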
LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
    MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType,
    LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = *MI.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstSin = MI.getOperand(0).getReg();
  Register DstCos = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstSin);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  Register StackPtrSin =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);
  Register StackPtrCos =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);

  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult =
      createLibcall(MIRBuilder, getRTLibDesc(MI.getOpcode(), Size),
                    {{0}, Type::getVoidTy(Ctx), 0},
                    {{Src, OpType, 0},
                     {StackPtrSin, PointerType::get(Ctx, AddrSpace), 1},
                     {StackPtrCos, PointerType::get(Ctx, AddrSpace), 2}},
                    LocObserver, &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);
  MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);

  MIRBuilder.buildLoad(DstSin, StackPtrSin, *LoadMMOSin);
  MIRBuilder.buildLoad(DstCos, StackPtrCos, *LoadMMOCos);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last, which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  const char *Name;
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    Name = TLI.getLibcallName(RTLibcall);
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Name = TLI.getMemcpyName();
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Name = TLI.getLibcallName(RTLibcall);
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Name = TLI.getLibcallName(RTLibcall);
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

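// For example (illustrative): a 4-byte G_ATOMICRMW_ADD with acquire ordering
// maps to RTLIB::OUTLINE_ATOMIC_LDADD4_ACQ, assuming the target provides the
// outline-atomics helper routines.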
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}

static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
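    // Only bit-clear (LDCLR) and add (LDADD) flavors exist among the outline
    // atomics, so G_ATOMICRMW_AND is emitted as ldclr(~Val) and
    // G_ATOMICRMW_SUB as ldadd(-Val) below.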
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

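// For example (illustrative): G_FPEXT from f32 to f64 resolves through
// RTLIB::getFPEXT to an __extendsfdf2-style routine on typical soft-float
// targets.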
static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType, LostDebugLocObserver &LocObserver,
                  const TargetLowering &TLI, bool IsSigned = false) {
  CallLowering::ArgInfo Arg = {MI.getOperand(1).getReg(), FromType, 0};
  if (FromType->isIntegerTy()) {
    if (TLI.shouldSignExtendTypeInLibCall(FromType, IsSigned))
      Arg.Flags[0].setSExt();
    else
      Arg.Flags[0].setZExt();
  }

  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0}, Arg, LocObserver,
                       &MI);
}

static RTLIB::Libcall
getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
  RTLIB::Libcall RTLibcall;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_GET_FPENV:
    RTLibcall = RTLIB::FEGETENV;
    break;
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_RESET_FPENV:
    RTLibcall = RTLIB::FESETENV;
    break;
  case TargetOpcode::G_GET_FPMODE:
    RTLibcall = RTLIB::FEGETMODE;
    break;
  case TargetOpcode::G_SET_FPMODE:
  case TargetOpcode::G_RESET_FPMODE:
    RTLibcall = RTLIB::FESETMODE;
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  }
  return RTLibcall;
}
// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as an integer value. To implement
// these intrinsics via the library functions, we need to use a temporary
// variable, for example:
//
//   %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//   %1:_(p0) = G_FRAME_INDEX %stack.0
//   BL &fegetmode
//   %0:_(s32) = G_LOAD %1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary where the library function will put the read state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}

// Similar to `createGetStateLibcall`, this function calls a library function
// using transient space on the stack. In this case the library function reads
// the content of the memory region.
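// For example (illustrative):
//
//   G_SET_FPMODE %0
//
// becomes:
//
//   %1:_(p0) = G_FRAME_INDEX %stack.0
//   G_STORE %0, %1
//   BL &fesetmode
//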
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary from which the library function will get the new state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}

/// Returns the corresponding libcall for the given Pred and
/// the ICMP predicate that should be generated to compare with #0
/// after the libcall.
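///
/// For example (soft-float, illustrative): FCMP_OEQ on a 64-bit operand maps
/// to the OEQ_F64 libcall (an __eqdf2-style routine) whose i32 result is then
/// compared ICMP_EQ against 0.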
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred)                                 \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return {RTLIB::LibcallPrefix##32, ICmpPred};                             \
    case 64:                                                                   \
      return {RTLIB::LibcallPrefix##64, ICmpPred};                             \
    case 128:                                                                  \
      return {RTLIB::LibcallPrefix##128, ICmpPred};                            \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Pred) {
  case CmpInst::FCMP_OEQ:
    RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
  case CmpInst::FCMP_UNE:
    RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
  case CmpInst::FCMP_OGE:
    RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
  case CmpInst::FCMP_OLT:
    RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
  case CmpInst::FCMP_OLE:
    RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
  case CmpInst::FCMP_OGT:
    RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
  case CmpInst::FCMP_UNO:
    RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
  default:
    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
  }
}

LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                   MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(&MI);

  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
  unsigned Size = OpLLT.getSizeInBits();
  if ((Size != 32 && Size != 64 && Size != 128) ||
      OpLLT != MRI.getType(Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(0);
  LLT DstTy = MRI.getType(DstReg);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP.
  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                const CmpInst::Predicate ICmpPred,
                                const DstOp &Res) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(32);
    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, &MI);
    if (!Status)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
        .getReg(0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond, Size);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found; the predicate must be generated as a combination
  // of libcalls.
  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
    if (Oeq && Uno)
      MIRBuilder.buildOr(DstReg, Oeq, Uno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We invert the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64, isel can even select the two compares into a single ccmp.
    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred), DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred), DstTy);

    if (NotOeq && NotUno)
      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Pred))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to the following, but saves some instructions.
    //   MIRBuilder.buildNot(
    //       PredTy,
    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
    //                            Op1, Op2));
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond), Size);
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}

// The function is used to legalize operations that set the default FP
// environment state. In the C library a call like `fesetmode(FE_DFL_MODE)` is
// used for that. On most targets supported by glibc, FE_DFL_MODE is defined
// as `((const femode_t *) -1)`. That assumption is used here. If it does not
// hold for some target, the target must provide custom lowering.
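// Under that assumption, on a 64-bit target G_RESET_FPMODE lowers to
// (illustrative):
//
//   %0:_(s64) = G_CONSTANT i64 -1
//   %1:_(p0) = G_INTTOPTR %0
//   BL &fesetmode
//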
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, &MI);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FSINCOS: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
  }
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND:
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    LLT LLTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ITy, 0},
                      {{MI.getOperand(1).getReg(), HLTy, 0}}, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    SmallVector<CallLowering::ArgInfo, 2> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    Args[1].Flags[0].setSExt();
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FCMP: {
    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Status;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder, Type::getIntNTy(Ctx, ToSize), FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
      return UnableToLegalize;
    bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, Type::getIntNTy(Ctx, FromSize),
                          LocObserver, TLI, IsSigned);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    // a lot of regressions in the emitted code (superfluous COPYs, artifact
    // combines not being hit). This seems to be a problem related to the
    // artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
1528 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1529 const APInt &Val = MI.getOperand(i: 1).getCImm()->getValue();
1530 unsigned TotalSize = Ty.getSizeInBits();
1531 unsigned NarrowSize = NarrowTy.getSizeInBits();
1532 int NumParts = TotalSize / NarrowSize;
1533
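    // Emit one G_CONSTANT per NarrowTy-sized piece, least significant piece
    // first; any remainder becomes a leftover piece below. Sketch for an s88
    // constant with NarrowTy = s32:
    //   %0:_(s32) = G_CONSTANT bits [0,32)
    //   %1:_(s32) = G_CONSTANT bits [32,64)
    //   %2:_(s24) = G_CONSTANT bits [64,88)   ; leftover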
1534 SmallVector<Register, 4> PartRegs;
1535 for (int I = 0; I != NumParts; ++I) {
1536 unsigned Offset = I * NarrowSize;
1537 auto K = MIRBuilder.buildConstant(Res: NarrowTy,
1538 Val: Val.lshr(shiftAmt: Offset).trunc(width: NarrowSize));
1539 PartRegs.push_back(Elt: K.getReg(Idx: 0));
1540 }
1541
1542 LLT LeftoverTy;
1543 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1544 SmallVector<Register, 1> LeftoverRegs;
1545 if (LeftoverBits != 0) {
1546 LeftoverTy = LLT::scalar(SizeInBits: LeftoverBits);
1547 auto K = MIRBuilder.buildConstant(
1548 Res: LeftoverTy,
1549 Val: Val.lshr(shiftAmt: NumParts * NarrowSize).trunc(width: LeftoverBits));
1550 LeftoverRegs.push_back(Elt: K.getReg(Idx: 0));
1551 }
1552
1553 insertParts(DstReg: MI.getOperand(i: 0).getReg(),
1554 ResultTy: Ty, PartTy: NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1555
1556 MI.eraseFromParent();
1557 return Legalized;
1558 }
1559 case TargetOpcode::G_SEXT:
1560 case TargetOpcode::G_ZEXT:
1561 case TargetOpcode::G_ANYEXT:
1562 return narrowScalarExt(MI, TypeIdx, Ty: NarrowTy);
1563 case TargetOpcode::G_TRUNC: {
1564 if (TypeIdx != 1)
1565 return UnableToLegalize;
1566
1567 uint64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1568 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1569 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1570 return UnableToLegalize;
1571 }
1572
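    // The source is exactly two NarrowTy pieces, and the truncated value is
    // just the low piece. Sketch for s128 -> s64 with NarrowTy = s64:
    //   %lo:_(s64), %hi:_(s64) = G_UNMERGE_VALUES %src:_(s128)
    //   %dst:_(s64) = COPY %lo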
1573 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
1574 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: Unmerge.getReg(Idx: 0));
1575 MI.eraseFromParent();
1576 return Legalized;
1577 }
1578 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1579 case TargetOpcode::G_FREEZE: {
1580 if (TypeIdx != 0)
1581 return UnableToLegalize;
1582
1583 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1584 // Should widen scalar first
1585 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1586 return UnableToLegalize;
1587
1588 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1).getReg());
1589 SmallVector<Register, 8> Parts;
1590 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1591 Parts.push_back(
1592 Elt: MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy}, SrcOps: {Unmerge.getReg(Idx: i)})
1593 .getReg(Idx: 0));
1594 }
1595
1596 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: Parts);
1597 MI.eraseFromParent();
1598 return Legalized;
1599 }
1600 case TargetOpcode::G_ADD:
1601 case TargetOpcode::G_SUB:
1602 case TargetOpcode::G_SADDO:
1603 case TargetOpcode::G_SSUBO:
1604 case TargetOpcode::G_SADDE:
1605 case TargetOpcode::G_SSUBE:
1606 case TargetOpcode::G_UADDO:
1607 case TargetOpcode::G_USUBO:
1608 case TargetOpcode::G_UADDE:
1609 case TargetOpcode::G_USUBE:
1610 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1611 case TargetOpcode::G_MUL:
1612 case TargetOpcode::G_UMULH:
1613 return narrowScalarMul(MI, Ty: NarrowTy);
1614 case TargetOpcode::G_EXTRACT:
1615 return narrowScalarExtract(MI, TypeIdx, Ty: NarrowTy);
1616 case TargetOpcode::G_INSERT:
1617 return narrowScalarInsert(MI, TypeIdx, Ty: NarrowTy);
1618 case TargetOpcode::G_LOAD: {
1619 auto &LoadMI = cast<GLoad>(Val&: MI);
1620 Register DstReg = LoadMI.getDstReg();
1621 LLT DstTy = MRI.getType(Reg: DstReg);
1622 if (DstTy.isVector())
1623 return UnableToLegalize;
1624
1625 if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1626 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1627 MIRBuilder.buildLoad(Res: TmpReg, Addr: LoadMI.getPointerReg(), MMO&: LoadMI.getMMO());
1628 MIRBuilder.buildAnyExt(Res: DstReg, Op: TmpReg);
1629 LoadMI.eraseFromParent();
1630 return Legalized;
1631 }
1632
1633 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx, NarrowTy);
1634 }
1635 case TargetOpcode::G_ZEXTLOAD:
1636 case TargetOpcode::G_SEXTLOAD: {
1637 auto &LoadMI = cast<GExtLoad>(Val&: MI);
1638 Register DstReg = LoadMI.getDstReg();
1639 Register PtrReg = LoadMI.getPointerReg();
1640
1641 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1642 auto &MMO = LoadMI.getMMO();
1643 unsigned MemSize = MMO.getSizeInBits().getValue();
1644
1645 if (MemSize == NarrowSize) {
1646 MIRBuilder.buildLoad(Res: TmpReg, Addr: PtrReg, MMO);
1647 } else if (MemSize < NarrowSize) {
1648 MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: TmpReg, Addr: PtrReg, MMO);
1649 } else if (MemSize > NarrowSize) {
1650 // FIXME: Need to split the load.
1651 return UnableToLegalize;
1652 }
1653
1654 if (isa<GZExtLoad>(Val: LoadMI))
1655 MIRBuilder.buildZExt(Res: DstReg, Op: TmpReg);
1656 else
1657 MIRBuilder.buildSExt(Res: DstReg, Op: TmpReg);
1658
1659 LoadMI.eraseFromParent();
1660 return Legalized;
1661 }
1662 case TargetOpcode::G_STORE: {
1663 auto &StoreMI = cast<GStore>(Val&: MI);
1664
1665 Register SrcReg = StoreMI.getValueReg();
1666 LLT SrcTy = MRI.getType(Reg: SrcReg);
1667 if (SrcTy.isVector())
1668 return UnableToLegalize;
1669
1670 int NumParts = SizeOp0 / NarrowSize;
1671 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1672 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1673 if (SrcTy.isVector() && LeftoverBits != 0)
1674 return UnableToLegalize;
1675
1676 if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1677 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1678 MIRBuilder.buildTrunc(Res: TmpReg, Op: SrcReg);
1679 MIRBuilder.buildStore(Val: TmpReg, Addr: StoreMI.getPointerReg(), MMO&: StoreMI.getMMO());
1680 StoreMI.eraseFromParent();
1681 return Legalized;
1682 }
1683
1684 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy);
1685 }
1686 case TargetOpcode::G_SELECT:
1687 return narrowScalarSelect(MI, TypeIdx, Ty: NarrowTy);
1688 case TargetOpcode::G_AND:
1689 case TargetOpcode::G_OR:
1690 case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
1700 return narrowScalarBasic(MI, TypeIdx, Ty: NarrowTy);
1701 }
1702 case TargetOpcode::G_SHL:
1703 case TargetOpcode::G_LSHR:
1704 case TargetOpcode::G_ASHR:
1705 return narrowScalarShift(MI, TypeIdx, Ty: NarrowTy);
1706 case TargetOpcode::G_CTLZ:
1707 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1708 case TargetOpcode::G_CTTZ:
1709 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1710 case TargetOpcode::G_CTPOP:
1711 if (TypeIdx == 1)
1712 switch (MI.getOpcode()) {
1713 case TargetOpcode::G_CTLZ:
1714 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1715 return narrowScalarCTLZ(MI, TypeIdx, Ty: NarrowTy);
1716 case TargetOpcode::G_CTTZ:
1717 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1718 return narrowScalarCTTZ(MI, TypeIdx, Ty: NarrowTy);
1719 case TargetOpcode::G_CTPOP:
1720 return narrowScalarCTPOP(MI, TypeIdx, Ty: NarrowTy);
1721 default:
1722 return UnableToLegalize;
1723 }
1724
1725 Observer.changingInstr(MI);
1726 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1727 Observer.changedInstr(MI);
1728 return Legalized;
1729 case TargetOpcode::G_INTTOPTR:
1730 if (TypeIdx != 1)
1731 return UnableToLegalize;
1732
1733 Observer.changingInstr(MI);
1734 narrowScalarSrc(MI, NarrowTy, OpIdx: 1);
1735 Observer.changedInstr(MI);
1736 return Legalized;
1737 case TargetOpcode::G_PTRTOINT:
1738 if (TypeIdx != 0)
1739 return UnableToLegalize;
1740
1741 Observer.changingInstr(MI);
1742 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1743 Observer.changedInstr(MI);
1744 return Legalized;
1745 case TargetOpcode::G_PHI: {
1746 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1747 // NarrowSize.
1748 if (SizeOp0 % NarrowSize != 0)
1749 return UnableToLegalize;
1750
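    // Split each incoming value into NarrowTy parts in its predecessor
    // block, build one G_PHI per part, then remerge after the PHIs. Sketch
    // for an s64 PHI with NarrowTy = s32 and two predecessors:
    //   %inK.0:_(s32), %inK.1:_(s32) = G_UNMERGE_VALUES %inK:_(s64)
    //   %lo:_(s32) = G_PHI %in0.0(%bb.0), %in1.0(%bb.1)
    //   %hi:_(s32) = G_PHI %in0.1(%bb.0), %in1.1(%bb.1)
    //   %dst:_(s64) = G_MERGE_VALUES %lo, %hi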
1751 unsigned NumParts = SizeOp0 / NarrowSize;
1752 SmallVector<Register, 2> DstRegs(NumParts);
1753 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1754 Observer.changingInstr(MI);
1755 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1756 MachineBasicBlock &OpMBB = *MI.getOperand(i: i + 1).getMBB();
1757 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
1758 extractParts(Reg: MI.getOperand(i).getReg(), Ty: NarrowTy, NumParts,
1759 VRegs&: SrcRegs[i / 2], MIRBuilder, MRI);
1760 }
1761 MachineBasicBlock &MBB = *MI.getParent();
1762 MIRBuilder.setInsertPt(MBB, II: MI);
1763 for (unsigned i = 0; i < NumParts; ++i) {
1764 DstRegs[i] = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1765 MachineInstrBuilder MIB =
1766 MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI).addDef(RegNo: DstRegs[i]);
1767 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1768 MIB.addUse(RegNo: SrcRegs[j / 2][i]).add(MO: MI.getOperand(i: j + 1));
1769 }
1770 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
1771 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
1772 Observer.changedInstr(MI);
1773 MI.eraseFromParent();
1774 return Legalized;
1775 }
1776 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1777 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1778 if (TypeIdx != 2)
1779 return UnableToLegalize;
1780
1781 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1782 Observer.changingInstr(MI);
1783 narrowScalarSrc(MI, NarrowTy, OpIdx);
1784 Observer.changedInstr(MI);
1785 return Legalized;
1786 }
1787 case TargetOpcode::G_ICMP: {
1788 Register LHS = MI.getOperand(i: 2).getReg();
1789 LLT SrcTy = MRI.getType(Reg: LHS);
1790 CmpInst::Predicate Pred =
1791 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
1792
1793 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1794 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1795 if (!extractParts(Reg: LHS, RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy, VRegs&: LHSPartRegs,
1796 LeftoverVRegs&: LHSLeftoverRegs, MIRBuilder, MRI))
1797 return UnableToLegalize;
1798
1799 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1800 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1801 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy&: Unused,
1802 VRegs&: RHSPartRegs, LeftoverVRegs&: RHSLeftoverRegs, MIRBuilder, MRI))
1803 return UnableToLegalize;
1804
1805 // We now have the LHS and RHS of the compare split into narrow-type
1806 // registers, plus potentially some leftover type.
1807 Register Dst = MI.getOperand(i: 0).getReg();
1808 LLT ResTy = MRI.getType(Reg: Dst);
1809 if (ICmpInst::isEquality(P: Pred)) {
1810 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1811 // them together. For each equal part, the result should be all 0s. For
1812 // each non-equal part, we'll get at least one 1.
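      // Sketch for an s64 compare split into two s32 parts:
      //   %x0:_(s32) = G_XOR %lhs0, %rhs0
      //   %x1:_(s32) = G_XOR %lhs1, %rhs1
      //   %or:_(s32) = G_OR %x0, %x1
      //   %dst:_(s1) = G_ICMP intpred(eq/ne), %or, %zero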
1813 auto Zero = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0);
1814 SmallVector<Register, 4> Xors;
1815 for (auto LHSAndRHS : zip(t&: LHSPartRegs, u&: RHSPartRegs)) {
1816 auto LHS = std::get<0>(t&: LHSAndRHS);
1817 auto RHS = std::get<1>(t&: LHSAndRHS);
1818 auto Xor = MIRBuilder.buildXor(Dst: NarrowTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1819 Xors.push_back(Elt: Xor);
1820 }
1821
1822 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1823 // to the desired narrow type so that we can OR them together later.
1824 SmallVector<Register, 4> WidenedXors;
1825 for (auto LHSAndRHS : zip(t&: LHSLeftoverRegs, u&: RHSLeftoverRegs)) {
1826 auto LHS = std::get<0>(t&: LHSAndRHS);
1827 auto RHS = std::get<1>(t&: LHSAndRHS);
1828 auto Xor = MIRBuilder.buildXor(Dst: LeftoverTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1829 LLT GCDTy = extractGCDType(Parts&: WidenedXors, DstTy: NarrowTy, NarrowTy: LeftoverTy, SrcReg: Xor);
1830 buildLCMMergePieces(DstTy: LeftoverTy, NarrowTy, GCDTy, VRegs&: WidenedXors,
1831 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1832 llvm::append_range(C&: Xors, R&: WidenedXors);
1833 }
1834
1835 // Now, for each part we broke up, we know if they are equal/not equal
1836 // based off the G_XOR. We can OR these all together and compare against
1837 // 0 to get the result.
1838 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1839 auto Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Xors[0], Src1: Xors[1]);
1840 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1841 Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Or, Src1: Xors[I]);
1842 MIRBuilder.buildICmp(Pred, Res: Dst, Op0: Or, Op1: Zero);
1843 } else {
1844 Register CmpIn;
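      // Chain the per-part compares from least to most significant: at each
      // step, if the current (more significant) parts are equal, keep the
      // running result from the lower parts; otherwise take the current
      // part's compare. Sketch for two parts with a signed predicate:
      //   %c0:_(s1) = G_ICMP intpred(ult), %lhs0, %rhs0
      //   %c1:_(s1) = G_ICMP intpred(slt), %lhs1, %rhs1
      //   %e1:_(s1) = G_ICMP intpred(eq),  %lhs1, %rhs1
      //   %dst:_(s1) = G_SELECT %e1, %c0, %c1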
1845 for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1846 Register CmpOut;
1847 CmpInst::Predicate PartPred;
1848
1849 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1850 PartPred = Pred;
1851 CmpOut = Dst;
1852 } else {
1853 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1854 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1855 }
1856
1857 if (!CmpIn) {
1858 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSPartRegs[I],
1859 Op1: RHSPartRegs[I]);
1860 } else {
1861 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSPartRegs[I],
1862 Op1: RHSPartRegs[I]);
1863 auto CmpEq = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1864 Op0: LHSPartRegs[I], Op1: RHSPartRegs[I]);
1865 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1866 }
1867
1868 CmpIn = CmpOut;
1869 }
1870
1871 for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1872 Register CmpOut;
1873 CmpInst::Predicate PartPred;
1874
        if (I == E - 1) {
1876 PartPred = Pred;
1877 CmpOut = Dst;
1878 } else {
1879 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1880 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1881 }
1882
1883 if (!CmpIn) {
1884 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSLeftoverRegs[I],
1885 Op1: RHSLeftoverRegs[I]);
1886 } else {
1887 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSLeftoverRegs[I],
1888 Op1: RHSLeftoverRegs[I]);
1889 auto CmpEq =
1890 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1891 Op0: LHSLeftoverRegs[I], Op1: RHSLeftoverRegs[I]);
1892 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1893 }
1894
1895 CmpIn = CmpOut;
1896 }
1897 }
1898 MI.eraseFromParent();
1899 return Legalized;
1900 }
1901 case TargetOpcode::G_FCMP:
1902 if (TypeIdx != 0)
1903 return UnableToLegalize;
1904
1905 Observer.changingInstr(MI);
1906 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1907 Observer.changedInstr(MI);
1908 return Legalized;
1909
1910 case TargetOpcode::G_SEXT_INREG: {
1911 if (TypeIdx != 0)
1912 return UnableToLegalize;
1913
1914 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
1915
    // So long as the new type has more bits than the bits we're extending,
    // we don't need to break it apart.
1918 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1919 Observer.changingInstr(MI);
1920 // We don't lose any non-extension bits by truncating the src and
1921 // sign-extending the dst.
1922 MachineOperand &MO1 = MI.getOperand(i: 1);
1923 auto TruncMIB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO1);
1924 MO1.setReg(TruncMIB.getReg(Idx: 0));
1925
1926 MachineOperand &MO2 = MI.getOperand(i: 0);
1927 Register DstExt = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1928 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1929 MIRBuilder.buildSExt(Res: MO2, Op: DstExt);
1930 MO2.setReg(DstExt);
1931 Observer.changedInstr(MI);
1932 return Legalized;
1933 }
1934
1935 // Break it apart. Components below the extension point are unmodified. The
1936 // component containing the extension point becomes a narrower SEXT_INREG.
1937 // Components above it are ashr'd from the component containing the
1938 // extension point.
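    // Sketch for a s96 G_SEXT_INREG of width 40 with NarrowTy = s32:
    //   part 0, bits [0,32):  unmodified source part
    //   part 1, bits [32,64): G_SEXT_INREG %src1, 8     ; 40 % 32 == 8
    //   part 2, bits [64,96): G_ASHR %part1, 31         ; replicated sign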
1939 if (SizeOp0 % NarrowSize != 0)
1940 return UnableToLegalize;
1941 int NumParts = SizeOp0 / NarrowSize;
1942
1943 // List the registers where the destination will be scattered.
1944 SmallVector<Register, 2> DstRegs;
1945 // List the registers where the source will be split.
1946 SmallVector<Register, 2> SrcRegs;
1947
1948 // Create all the temporary registers.
1949 for (int i = 0; i < NumParts; ++i) {
1950 Register SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1951
1952 SrcRegs.push_back(Elt: SrcReg);
1953 }
1954
1955 // Explode the big arguments into smaller chunks.
1956 MIRBuilder.buildUnmerge(Res: SrcRegs, Op: MI.getOperand(i: 1));
1957
1958 Register AshrCstReg =
1959 MIRBuilder.buildConstant(Res: NarrowTy, Val: NarrowTy.getScalarSizeInBits() - 1)
1960 .getReg(Idx: 0);
1961 Register FullExtensionReg;
1962 Register PartialExtensionReg;
1963
1964 // Do the operation on each small part.
1965 for (int i = 0; i < NumParts; ++i) {
1966 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1967 DstRegs.push_back(Elt: SrcRegs[i]);
1968 PartialExtensionReg = DstRegs.back();
1969 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1970 assert(PartialExtensionReg &&
1971 "Expected to visit partial extension before full");
1972 if (FullExtensionReg) {
1973 DstRegs.push_back(Elt: FullExtensionReg);
1974 continue;
1975 }
1976 DstRegs.push_back(
1977 Elt: MIRBuilder.buildAShr(Dst: NarrowTy, Src0: PartialExtensionReg, Src1: AshrCstReg)
1978 .getReg(Idx: 0));
1979 FullExtensionReg = DstRegs.back();
1980 } else {
1981 DstRegs.push_back(
1982 Elt: MIRBuilder
1983 .buildInstr(
1984 Opc: TargetOpcode::G_SEXT_INREG, DstOps: {NarrowTy},
1985 SrcOps: {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1986 .getReg(Idx: 0));
1987 PartialExtensionReg = DstRegs.back();
1988 }
1989 }
1990
1991 // Gather the destination registers into the final destination.
1992 Register DstReg = MI.getOperand(i: 0).getReg();
1993 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
1994 MI.eraseFromParent();
1995 return Legalized;
1996 }
1997 case TargetOpcode::G_BSWAP:
1998 case TargetOpcode::G_BITREVERSE: {
1999 if (SizeOp0 % NarrowSize != 0)
2000 return UnableToLegalize;
2001
2002 Observer.changingInstr(MI);
2003 SmallVector<Register, 2> SrcRegs, DstRegs;
2004 unsigned NumParts = SizeOp0 / NarrowSize;
2005 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
2006 MIRBuilder, MRI);
2007
2008 for (unsigned i = 0; i < NumParts; ++i) {
2009 auto DstPart = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
2010 SrcOps: {SrcRegs[NumParts - 1 - i]});
2011 DstRegs.push_back(Elt: DstPart.getReg(Idx: 0));
2012 }
2013
2014 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
2015
2016 Observer.changedInstr(MI);
2017 MI.eraseFromParent();
2018 return Legalized;
2019 }
2020 case TargetOpcode::G_PTR_ADD:
2021 case TargetOpcode::G_PTRMASK: {
2022 if (TypeIdx != 1)
2023 return UnableToLegalize;
2024 Observer.changingInstr(MI);
2025 narrowScalarSrc(MI, NarrowTy, OpIdx: 2);
2026 Observer.changedInstr(MI);
2027 return Legalized;
2028 }
2029 case TargetOpcode::G_FPTOUI:
2030 case TargetOpcode::G_FPTOSI:
2031 case TargetOpcode::G_FPTOUI_SAT:
2032 case TargetOpcode::G_FPTOSI_SAT:
2033 return narrowScalarFPTOI(MI, TypeIdx, Ty: NarrowTy);
2034 case TargetOpcode::G_FPEXT:
2035 if (TypeIdx != 0)
2036 return UnableToLegalize;
2037 Observer.changingInstr(MI);
2038 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_FPEXT);
2039 Observer.changedInstr(MI);
2040 return Legalized;
2041 case TargetOpcode::G_FLDEXP:
2042 case TargetOpcode::G_STRICT_FLDEXP:
2043 return narrowScalarFLDEXP(MI, TypeIdx, Ty: NarrowTy);
2044 case TargetOpcode::G_VSCALE: {
2045 Register Dst = MI.getOperand(i: 0).getReg();
2046 LLT Ty = MRI.getType(Reg: Dst);
2047
2048 // Assume VSCALE(1) fits into a legal integer
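    // Sketch for narrowing %dst:_(s64) = G_VSCALE C with NarrowTy = s32:
    //   %v:_(s32) = G_VSCALE 1
    //   %z:_(s64) = G_ZEXT %v
    //   %dst:_(s64) = G_MUL %z, C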
2049 const APInt One(NarrowTy.getSizeInBits(), 1);
2050 auto VScaleBase = MIRBuilder.buildVScale(Res: NarrowTy, MinElts: One);
2051 auto ZExt = MIRBuilder.buildZExt(Res: Ty, Op: VScaleBase);
2052 auto C = MIRBuilder.buildConstant(Res: Ty, Val: *MI.getOperand(i: 1).getCImm());
2053 MIRBuilder.buildMul(Dst, Src0: ZExt, Src1: C);
2054
2055 MI.eraseFromParent();
2056 return Legalized;
2057 }
2058 }
2059}
2060
2061Register LegalizerHelper::coerceToScalar(Register Val) {
2062 LLT Ty = MRI.getType(Reg: Val);
2063 if (Ty.isScalar())
2064 return Val;
2065
2066 const DataLayout &DL = MIRBuilder.getDataLayout();
2067 LLT NewTy = LLT::scalar(SizeInBits: Ty.getSizeInBits());
2068 if (Ty.isPointer()) {
2069 if (DL.isNonIntegralAddressSpace(AddrSpace: Ty.getAddressSpace()))
2070 return Register();
2071 return MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Val).getReg(Idx: 0);
2072 }
2073
2074 Register NewVal = Val;
2075
2076 assert(Ty.isVector());
2077 if (Ty.isPointerVector())
2078 NewVal = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2079 return MIRBuilder.buildBitcast(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2080}
2081
2082void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2083 unsigned OpIdx, unsigned ExtOpcode) {
2084 MachineOperand &MO = MI.getOperand(i: OpIdx);
2085 auto ExtB = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MO});
2086 MO.setReg(ExtB.getReg(Idx: 0));
2087}
2088
2089void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2090 unsigned OpIdx) {
2091 MachineOperand &MO = MI.getOperand(i: OpIdx);
2092 auto ExtB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO);
2093 MO.setReg(ExtB.getReg(Idx: 0));
2094}
2095
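// Replace the definition at OpIdx with a fresh WideTy register, then emit
// TruncOpcode after MI to narrow the wide result back into the original
// register.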
2096void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2097 unsigned OpIdx, unsigned TruncOpcode) {
2098 MachineOperand &MO = MI.getOperand(i: OpIdx);
2099 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2100 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2101 MIRBuilder.buildInstr(Opc: TruncOpcode, DstOps: {MO}, SrcOps: {DstExt});
2102 MO.setReg(DstExt);
2103}
2104
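// Replace the definition at OpIdx with a fresh NarrowTy register, then emit
// ExtOpcode after MI to rebuild the original wide value from the narrow
// result.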
2105void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2106 unsigned OpIdx, unsigned ExtOpcode) {
2107 MachineOperand &MO = MI.getOperand(i: OpIdx);
2108 Register DstTrunc = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2109 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2110 MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {MO}, SrcOps: {DstTrunc});
2111 MO.setReg(DstTrunc);
2112}
2113
2114void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2115 unsigned OpIdx) {
2116 MachineOperand &MO = MI.getOperand(i: OpIdx);
2117 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2118 Register Dst = MO.getReg();
2119 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2120 MO.setReg(DstExt);
2121 MIRBuilder.buildDeleteTrailingVectorElements(Res: Dst, Op0: DstExt);
2122}
2123
2124void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2125 unsigned OpIdx) {
2126 MachineOperand &MO = MI.getOperand(i: OpIdx);
2127 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO).getReg(Idx: 0));
2128}
2129
2130void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2131 MachineOperand &Op = MI.getOperand(i: OpIdx);
2132 Op.setReg(MIRBuilder.buildBitcast(Dst: CastTy, Src: Op).getReg(Idx: 0));
2133}
2134
2135void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2136 MachineOperand &MO = MI.getOperand(i: OpIdx);
2137 Register CastDst = MRI.createGenericVirtualRegister(Ty: CastTy);
2138 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2139 MIRBuilder.buildBitcast(Dst: MO, Src: CastDst);
2140 MO.setReg(CastDst);
2141}
2142
2143LegalizerHelper::LegalizeResult
2144LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
2145 LLT WideTy) {
2146 if (TypeIdx != 1)
2147 return UnableToLegalize;
2148
2149 auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
2150 if (DstTy.isVector())
2151 return UnableToLegalize;
2152
2153 LLT SrcTy = MRI.getType(Reg: Src1Reg);
2154 const int DstSize = DstTy.getSizeInBits();
2155 const int SrcSize = SrcTy.getSizeInBits();
2156 const int WideSize = WideTy.getSizeInBits();
2157 const int NumMerge = (DstSize + WideSize - 1) / WideSize;
2158
2159 unsigned NumOps = MI.getNumOperands();
2160 unsigned NumSrc = MI.getNumOperands() - 1;
2161 unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
2162
2163 if (WideSize >= DstSize) {
2164 // Directly pack the bits in the target type.
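    // Sketch for %dst:_(s24) = G_MERGE_VALUES %a:_(s8), %b:_(s8), %c:_(s8)
    // with WideTy = s32:
    //   %r0:_(s32) = G_ZEXT %a
    //   %r1:_(s32) = G_OR %r0, (G_SHL (G_ZEXT %b), 8)
    //   %r2:_(s32) = G_OR %r1, (G_SHL (G_ZEXT %c), 16)
    //   %dst:_(s24) = G_TRUNC %r2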
2165 Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src1Reg).getReg(Idx: 0);
2166
2167 for (unsigned I = 2; I != NumOps; ++I) {
2168 const unsigned Offset = (I - 1) * PartSize;
2169
2170 Register SrcReg = MI.getOperand(i: I).getReg();
2171 assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
2172
2173 auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);
2174
2175 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
2176 MRI.createGenericVirtualRegister(Ty: WideTy);
2177
2178 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
2179 auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
2180 MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
2181 ResultReg = NextResult;
2182 }
2183
2184 if (WideSize > DstSize)
2185 MIRBuilder.buildTrunc(Res: DstReg, Op: ResultReg);
2186 else if (DstTy.isPointer())
2187 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
2188
2189 MI.eraseFromParent();
2190 return Legalized;
2191 }
2192
2193 // Unmerge the original values to the GCD type, and recombine to the next
2194 // multiple greater than the original type.
2195 //
2196 // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
2197 // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
2198 // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
2199 // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
2200 // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
2201 // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
2202 // %12:_(s12) = G_MERGE_VALUES %10, %11
2203 //
2204 // Padding with undef if necessary:
2205 //
2206 // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
2207 // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
2208 // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
2209 // %7:_(s2) = G_IMPLICIT_DEF
2210 // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
2211 // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
2212 // %10:_(s12) = G_MERGE_VALUES %8, %9
2213
2214 const int GCD = std::gcd(m: SrcSize, n: WideSize);
2215 LLT GCDTy = LLT::scalar(SizeInBits: GCD);
2216
2217 SmallVector<Register, 8> NewMergeRegs;
2218 SmallVector<Register, 8> Unmerges;
2219 LLT WideDstTy = LLT::scalar(SizeInBits: NumMerge * WideSize);
2220
2221 // Decompose the original operands if they don't evenly divide.
2222 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
2223 Register SrcReg = MO.getReg();
2224 if (GCD == SrcSize) {
2225 Unmerges.push_back(Elt: SrcReg);
2226 } else {
2227 auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
2228 for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
2229 Unmerges.push_back(Elt: Unmerge.getReg(Idx: J));
2230 }
2231 }
2232
2233 // Pad with undef to the next size that is a multiple of the requested size.
2234 if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
2235 Register UndefReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
2236 for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
2237 Unmerges.push_back(Elt: UndefReg);
2238 }
2239
2240 const int PartsPerGCD = WideSize / GCD;
2241
2242 // Build merges of each piece.
2243 ArrayRef<Register> Slicer(Unmerges);
2244 for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(N: PartsPerGCD)) {
2245 auto Merge =
2246 MIRBuilder.buildMergeLikeInstr(Res: WideTy, Ops: Slicer.take_front(N: PartsPerGCD));
2247 NewMergeRegs.push_back(Elt: Merge.getReg(Idx: 0));
2248 }
2249
2250 // A truncate may be necessary if the requested type doesn't evenly divide the
2251 // original result type.
2252 if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
2253 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NewMergeRegs);
2254 } else {
2255 auto FinalMerge = MIRBuilder.buildMergeLikeInstr(Res: WideDstTy, Ops: NewMergeRegs);
2256 MIRBuilder.buildTrunc(Res: DstReg, Op: FinalMerge.getReg(Idx: 0));
2257 }
2258
2259 MI.eraseFromParent();
2260 return Legalized;
2261}
2262
2263LegalizerHelper::LegalizeResult
2264LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
2265 LLT WideTy) {
2266 if (TypeIdx != 0)
2267 return UnableToLegalize;
2268
2269 int NumDst = MI.getNumOperands() - 1;
2270 Register SrcReg = MI.getOperand(i: NumDst).getReg();
2271 LLT SrcTy = MRI.getType(Reg: SrcReg);
2272 if (SrcTy.isVector())
2273 return UnableToLegalize;
2274
2275 Register Dst0Reg = MI.getOperand(i: 0).getReg();
2276 LLT DstTy = MRI.getType(Reg: Dst0Reg);
2277 if (!DstTy.isScalar())
2278 return UnableToLegalize;
2279
2280 if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
2281 if (SrcTy.isPointer()) {
2282 const DataLayout &DL = MIRBuilder.getDataLayout();
2283 if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace())) {
2284 LLVM_DEBUG(
2285 dbgs() << "Not casting non-integral address space integer\n");
2286 return UnableToLegalize;
2287 }
2288
2289 SrcTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
2290 SrcReg = MIRBuilder.buildPtrToInt(Dst: SrcTy, Src: SrcReg).getReg(Idx: 0);
2291 }
2292
2293 // Widen SrcTy to WideTy. This does not affect the result, but since the
2294 // user requested this size, it is probably better handled than SrcTy and
2295 // should reduce the total number of legalization artifacts.
2296 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2297 SrcTy = WideTy;
2298 SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
2299 }
2300
    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
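    // Sketch for %0:_(s16), %1:_(s16) = G_UNMERGE_VALUES %src:_(s32):
    //   %0:_(s16) = G_TRUNC %src
    //   %sh:_(s32) = G_LSHR %src, 16
    //   %1:_(s16) = G_TRUNC %sh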
2303 unsigned DstSize = DstTy.getSizeInBits();
2304
2305 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
2306 for (int I = 1; I != NumDst; ++I) {
2307 auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: DstSize * I);
2308 auto Shr = MIRBuilder.buildLShr(Dst: SrcTy, Src0: SrcReg, Src1: ShiftAmt);
2309 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shr);
2310 }
2311
2312 MI.eraseFromParent();
2313 return Legalized;
2314 }
2315
2316 // Extend the source to a wider type.
2317 LLT LCMTy = getLCMType(OrigTy: SrcTy, TargetTy: WideTy);
2318
2319 Register WideSrc = SrcReg;
2320 if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
2321 // TODO: If this is an integral address space, cast to integer and anyext.
2322 if (SrcTy.isPointer()) {
2323 LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
2324 return UnableToLegalize;
2325 }
2326
2327 WideSrc = MIRBuilder.buildAnyExt(Res: LCMTy, Op: WideSrc).getReg(Idx: 0);
2328 }
2329
2330 auto Unmerge = MIRBuilder.buildUnmerge(Res: WideTy, Op: WideSrc);
2331
2332 // Create a sequence of unmerges and merges to the original results. Since we
2333 // may have widened the source, we will need to pad the results with dead defs
2334 // to cover the source register.
2335 // e.g. widen s48 to s64:
2336 // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
2337 //
2338 // =>
2339 // %4:_(s192) = G_ANYEXT %0:_(s96)
2340 // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
2341 // ; unpack to GCD type, with extra dead defs
2342 // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
2343 // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  // dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
2345 // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination
2346 // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
2347 const LLT GCDTy = getGCDType(OrigTy: WideTy, TargetTy: DstTy);
2348 const int NumUnmerge = Unmerge->getNumOperands() - 1;
2349 const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
2350
  // Directly unmerge to the destination without going through a GCD type
  // if possible.
2353 if (PartsPerRemerge == 1) {
2354 const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
2355
2356 for (int I = 0; I != NumUnmerge; ++I) {
2357 auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);
2358
2359 for (int J = 0; J != PartsPerUnmerge; ++J) {
2360 int Idx = I * PartsPerUnmerge + J;
2361 if (Idx < NumDst)
2362 MIB.addDef(RegNo: MI.getOperand(i: Idx).getReg());
2363 else {
2364 // Create dead def for excess components.
2365 MIB.addDef(RegNo: MRI.createGenericVirtualRegister(Ty: DstTy));
2366 }
2367 }
2368
2369 MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
2370 }
2371 } else {
2372 SmallVector<Register, 16> Parts;
2373 for (int J = 0; J != NumUnmerge; ++J)
2374 extractGCDType(Parts, GCDTy, SrcReg: Unmerge.getReg(Idx: J));
2375
2376 SmallVector<Register, 8> RemergeParts;
2377 for (int I = 0; I != NumDst; ++I) {
2378 for (int J = 0; J < PartsPerRemerge; ++J) {
2379 const int Idx = I * PartsPerRemerge + J;
2380 RemergeParts.emplace_back(Args&: Parts[Idx]);
2381 }
2382
2383 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: I).getReg(), Ops: RemergeParts);
2384 RemergeParts.clear();
2385 }
2386 }
2387
2388 MI.eraseFromParent();
2389 return Legalized;
2390}
2391
2392LegalizerHelper::LegalizeResult
2393LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
2394 LLT WideTy) {
2395 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
2396 unsigned Offset = MI.getOperand(i: 2).getImm();
2397
2398 if (TypeIdx == 0) {
2399 if (SrcTy.isVector() || DstTy.isVector())
2400 return UnableToLegalize;
2401
2402 SrcOp Src(SrcReg);
2403 if (SrcTy.isPointer()) {
2404 // Extracts from pointers can be handled only if they are really just
2405 // simple integers.
2406 const DataLayout &DL = MIRBuilder.getDataLayout();
2407 if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace()))
2408 return UnableToLegalize;
2409
2410 LLT SrcAsIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
2411 Src = MIRBuilder.buildPtrToInt(Dst: SrcAsIntTy, Src);
2412 SrcTy = SrcAsIntTy;
2413 }
2414
2415 if (DstTy.isPointer())
2416 return UnableToLegalize;
2417
2418 if (Offset == 0) {
2419 // Avoid a shift in the degenerate case.
2420 MIRBuilder.buildTrunc(Res: DstReg,
2421 Op: MIRBuilder.buildAnyExtOrTrunc(Res: WideTy, Op: Src));
2422 MI.eraseFromParent();
2423 return Legalized;
2424 }
2425
2426 // Do a shift in the source type.
2427 LLT ShiftTy = SrcTy;
2428 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2429 Src = MIRBuilder.buildAnyExt(Res: WideTy, Op: Src);
2430 ShiftTy = WideTy;
2431 }
2432
2433 auto LShr = MIRBuilder.buildLShr(
2434 Dst: ShiftTy, Src0: Src, Src1: MIRBuilder.buildConstant(Res: ShiftTy, Val: Offset));
2435 MIRBuilder.buildTrunc(Res: DstReg, Op: LShr);
2436 MI.eraseFromParent();
2437 return Legalized;
2438 }
2439
2440 if (SrcTy.isScalar()) {
2441 Observer.changingInstr(MI);
2442 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2443 Observer.changedInstr(MI);
2444 return Legalized;
2445 }
2446
2447 if (!SrcTy.isVector())
2448 return UnableToLegalize;
2449
2450 if (DstTy != SrcTy.getElementType())
2451 return UnableToLegalize;
2452
2453 if (Offset % SrcTy.getScalarSizeInBits() != 0)
2454 return UnableToLegalize;
2455
2456 Observer.changingInstr(MI);
2457 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2458
2459 MI.getOperand(i: 2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
2460 Offset);
2461 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0);
2462 Observer.changedInstr(MI);
2463 return Legalized;
2464}
2465
2466LegalizerHelper::LegalizeResult
2467LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2468 LLT WideTy) {
2469 if (TypeIdx != 0 || WideTy.isVector())
2470 return UnableToLegalize;
2471 Observer.changingInstr(MI);
2472 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2473 widenScalarDst(MI, WideTy);
2474 Observer.changedInstr(MI);
2475 return Legalized;
2476}
2477
2478LegalizerHelper::LegalizeResult
2479LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2480 LLT WideTy) {
2481 unsigned Opcode;
2482 unsigned ExtOpcode;
2483 std::optional<Register> CarryIn;
2484 switch (MI.getOpcode()) {
2485 default:
2486 llvm_unreachable("Unexpected opcode!");
2487 case TargetOpcode::G_SADDO:
2488 Opcode = TargetOpcode::G_ADD;
2489 ExtOpcode = TargetOpcode::G_SEXT;
2490 break;
2491 case TargetOpcode::G_SSUBO:
2492 Opcode = TargetOpcode::G_SUB;
2493 ExtOpcode = TargetOpcode::G_SEXT;
2494 break;
2495 case TargetOpcode::G_UADDO:
2496 Opcode = TargetOpcode::G_ADD;
2497 ExtOpcode = TargetOpcode::G_ZEXT;
2498 break;
2499 case TargetOpcode::G_USUBO:
2500 Opcode = TargetOpcode::G_SUB;
2501 ExtOpcode = TargetOpcode::G_ZEXT;
2502 break;
2503 case TargetOpcode::G_SADDE:
2504 Opcode = TargetOpcode::G_UADDE;
2505 ExtOpcode = TargetOpcode::G_SEXT;
2506 CarryIn = MI.getOperand(i: 4).getReg();
2507 break;
2508 case TargetOpcode::G_SSUBE:
2509 Opcode = TargetOpcode::G_USUBE;
2510 ExtOpcode = TargetOpcode::G_SEXT;
2511 CarryIn = MI.getOperand(i: 4).getReg();
2512 break;
2513 case TargetOpcode::G_UADDE:
2514 Opcode = TargetOpcode::G_UADDE;
2515 ExtOpcode = TargetOpcode::G_ZEXT;
2516 CarryIn = MI.getOperand(i: 4).getReg();
2517 break;
2518 case TargetOpcode::G_USUBE:
2519 Opcode = TargetOpcode::G_USUBE;
2520 ExtOpcode = TargetOpcode::G_ZEXT;
2521 CarryIn = MI.getOperand(i: 4).getReg();
2522 break;
2523 }
2524
2525 if (TypeIdx == 1) {
2526 unsigned BoolExtOp = MIRBuilder.getBoolExtOp(IsVec: WideTy.isVector(), IsFP: false);
2527
2528 Observer.changingInstr(MI);
2529 if (CarryIn)
2530 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: BoolExtOp);
2531 widenScalarDst(MI, WideTy, OpIdx: 1);
2532
2533 Observer.changedInstr(MI);
2534 return Legalized;
2535 }
2536
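  // Sketch of the check below for a s8 G_UADDO widened to s32: zero-extend
  // both inputs, add in s32, then overflow occurred iff the wide sum does
  // not round-trip through s8:
  //   %a:_(s32) = G_ZEXT %lhs
  //   %b:_(s32) = G_ZEXT %rhs
  //   %s:_(s32) = G_ADD %a, %b
  //   %rt:_(s32) = G_ZEXT (G_TRUNC %s)
  //   %ovf:_(s1) = G_ICMP intpred(ne), %s, %rt
  //   %res:_(s8) = G_TRUNC %s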
2537 auto LHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
2538 auto RHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 3)});
2539 // Do the arithmetic in the larger type.
2540 Register NewOp;
2541 if (CarryIn) {
2542 LLT CarryOutTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
2543 NewOp = MIRBuilder
2544 .buildInstr(Opc: Opcode, DstOps: {WideTy, CarryOutTy},
2545 SrcOps: {LHSExt, RHSExt, *CarryIn})
2546 .getReg(Idx: 0);
2547 } else {
2548 NewOp = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {WideTy}, SrcOps: {LHSExt, RHSExt}).getReg(Idx: 0);
2549 }
2550 LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
2551 auto TruncOp = MIRBuilder.buildTrunc(Res: OrigTy, Op: NewOp);
2552 auto ExtOp = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {TruncOp});
2553 // There is no overflow if the ExtOp is the same as NewOp.
2554 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 1), Op0: NewOp, Op1: ExtOp);
2555 // Now trunc the NewOp to the original result.
2556 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0), Op: NewOp);
2557 MI.eraseFromParent();
2558 return Legalized;
2559}
2560
2561LegalizerHelper::LegalizeResult
2562LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2563 LLT WideTy) {
2564 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2565 MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2566 MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2567 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2568 MI.getOpcode() == TargetOpcode::G_USHLSAT;
2569 // We can convert this to:
2570 // 1. Any extend iN to iM
2571 // 2. SHL by M-N
2572 // 3. [US][ADD|SUB|SHL]SAT
2573 // 4. L/ASHR by M-N
2574 //
2575 // It may be more efficient to lower this to a min and a max operation in
2576 // the higher precision arithmetic if the promoted operation isn't legal,
2577 // but this decision is up to the target's lowering request.
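  // For example (sketch), a s8 G_SADDSAT widened to s32: shift both inputs
  // left by 24 so the s32 saturation boundary coincides with the s8 one,
  // saturate in s32, then arithmetic-shift-right by 24 and truncate.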
2578 Register DstReg = MI.getOperand(i: 0).getReg();
2579
2580 unsigned NewBits = WideTy.getScalarSizeInBits();
2581 unsigned SHLAmount = NewBits - MRI.getType(Reg: DstReg).getScalarSizeInBits();
2582
2583 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2584 // must not left shift the RHS to preserve the shift amount.
2585 auto LHS = MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 1));
2586 auto RHS = IsShift ? MIRBuilder.buildZExt(Res: WideTy, Op: MI.getOperand(i: 2))
2587 : MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 2));
2588 auto ShiftK = MIRBuilder.buildConstant(Res: WideTy, Val: SHLAmount);
2589 auto ShiftL = MIRBuilder.buildShl(Dst: WideTy, Src0: LHS, Src1: ShiftK);
2590 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(Dst: WideTy, Src0: RHS, Src1: ShiftK);
2591
2592 auto WideInst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {WideTy},
2593 SrcOps: {ShiftL, ShiftR}, Flags: MI.getFlags());
2594
2595 // Use a shift that will preserve the number of sign bits when the trunc is
2596 // folded away.
2597 auto Result = IsSigned ? MIRBuilder.buildAShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK)
2598 : MIRBuilder.buildLShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK);
2599
2600 MIRBuilder.buildTrunc(Res: DstReg, Op: Result);
2601 MI.eraseFromParent();
2602 return Legalized;
2603}
2604
2605LegalizerHelper::LegalizeResult
2606LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2607 LLT WideTy) {
2608 if (TypeIdx == 1) {
2609 Observer.changingInstr(MI);
2610 widenScalarDst(MI, WideTy, OpIdx: 1);
2611 Observer.changedInstr(MI);
2612 return Legalized;
2613 }
2614
2615 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2616 auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2617 LLT SrcTy = MRI.getType(Reg: LHS);
2618 LLT OverflowTy = MRI.getType(Reg: OriginalOverflow);
2619 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2620
2621 // To determine if the result overflowed in the larger type, we extend the
2622 // input to the larger type, do the multiply (checking if it overflows),
2623 // then also check the high bits of the result to see if overflow happened
2624 // there.
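  // For example (sketch), widening a s16 G_UMULO to s24: the 24-bit product
  // can still exceed 24 bits, so check both the wide G_UMULO overflow bit
  // and whether the high 8 bits are a zero-extension of the low 16.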
2625 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2626 auto LeftOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {LHS});
2627 auto RightOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {RHS});
2628
  // Multiplication cannot overflow if WideTy is >= 2 * the original width,
  // so we don't need to check the overflow result of the wider-type Mulo.
2631 bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2632
2633 unsigned MulOpc =
2634 WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2635
2636 MachineInstrBuilder Mulo;
2637 if (WideMulCanOverflow)
2638 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy, OverflowTy},
2639 SrcOps: {LeftOperand, RightOperand});
2640 else
2641 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy}, SrcOps: {LeftOperand, RightOperand});
2642
2643 auto Mul = Mulo->getOperand(i: 0);
2644 MIRBuilder.buildTrunc(Res: Result, Op: Mul);
2645
2646 MachineInstrBuilder ExtResult;
2647 // Overflow occurred if it occurred in the larger type, or if the high part
2648 // of the result does not zero/sign-extend the low part. Check this second
2649 // possibility first.
2650 if (IsSigned) {
2651 // For signed, overflow occurred when the high part does not sign-extend
2652 // the low part.
2653 ExtResult = MIRBuilder.buildSExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2654 } else {
2655 // Unsigned overflow occurred when the high part does not zero-extend the
2656 // low part.
2657 ExtResult = MIRBuilder.buildZExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2658 }
2659
2660 if (WideMulCanOverflow) {
2661 auto Overflow =
2662 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OverflowTy, Op0: Mul, Op1: ExtResult);
2663 // Finally check if the multiplication in the larger type itself overflowed.
2664 MIRBuilder.buildOr(Dst: OriginalOverflow, Src0: Mulo->getOperand(i: 1), Src1: Overflow);
2665 } else {
2666 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OriginalOverflow, Op0: Mul, Op1: ExtResult);
2667 }
2668 MI.eraseFromParent();
2669 return Legalized;
2670}
2671
2672LegalizerHelper::LegalizeResult
2673LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2674 unsigned Opcode = MI.getOpcode();
2675 switch (Opcode) {
2676 default:
2677 return UnableToLegalize;
2678 case TargetOpcode::G_ATOMICRMW_XCHG:
2679 case TargetOpcode::G_ATOMICRMW_ADD:
2680 case TargetOpcode::G_ATOMICRMW_SUB:
2681 case TargetOpcode::G_ATOMICRMW_AND:
2682 case TargetOpcode::G_ATOMICRMW_OR:
2683 case TargetOpcode::G_ATOMICRMW_XOR:
2684 case TargetOpcode::G_ATOMICRMW_MIN:
2685 case TargetOpcode::G_ATOMICRMW_MAX:
2686 case TargetOpcode::G_ATOMICRMW_UMIN:
2687 case TargetOpcode::G_ATOMICRMW_UMAX:
2688 assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2689 Observer.changingInstr(MI);
2690 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2691 widenScalarDst(MI, WideTy, OpIdx: 0);
2692 Observer.changedInstr(MI);
2693 return Legalized;
2694 case TargetOpcode::G_ATOMIC_CMPXCHG:
2695 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2696 Observer.changingInstr(MI);
2697 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2698 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2699 widenScalarDst(MI, WideTy, OpIdx: 0);
2700 Observer.changedInstr(MI);
2701 return Legalized;
2702 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2703 if (TypeIdx == 0) {
2704 Observer.changingInstr(MI);
2705 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2706 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: TargetOpcode::G_ANYEXT);
2707 widenScalarDst(MI, WideTy, OpIdx: 0);
2708 Observer.changedInstr(MI);
2709 return Legalized;
2710 }
2711 assert(TypeIdx == 1 &&
2712 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2713 Observer.changingInstr(MI);
2714 widenScalarDst(MI, WideTy, OpIdx: 1);
2715 Observer.changedInstr(MI);
2716 return Legalized;
2717 case TargetOpcode::G_EXTRACT:
2718 return widenScalarExtract(MI, TypeIdx, WideTy);
2719 case TargetOpcode::G_INSERT:
2720 return widenScalarInsert(MI, TypeIdx, WideTy);
2721 case TargetOpcode::G_MERGE_VALUES:
2722 return widenScalarMergeValues(MI, TypeIdx, WideTy);
2723 case TargetOpcode::G_UNMERGE_VALUES:
2724 return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2725 case TargetOpcode::G_SADDO:
2726 case TargetOpcode::G_SSUBO:
2727 case TargetOpcode::G_UADDO:
2728 case TargetOpcode::G_USUBO:
2729 case TargetOpcode::G_SADDE:
2730 case TargetOpcode::G_SSUBE:
2731 case TargetOpcode::G_UADDE:
2732 case TargetOpcode::G_USUBE:
2733 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2734 case TargetOpcode::G_UMULO:
2735 case TargetOpcode::G_SMULO:
2736 return widenScalarMulo(MI, TypeIdx, WideTy);
2737 case TargetOpcode::G_SADDSAT:
2738 case TargetOpcode::G_SSUBSAT:
2739 case TargetOpcode::G_SSHLSAT:
2740 case TargetOpcode::G_UADDSAT:
2741 case TargetOpcode::G_USUBSAT:
2742 case TargetOpcode::G_USHLSAT:
2743 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2744 case TargetOpcode::G_CTTZ:
2745 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2746 case TargetOpcode::G_CTLZ:
2747 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2748 case TargetOpcode::G_CTPOP: {
2749 if (TypeIdx == 0) {
2750 Observer.changingInstr(MI);
2751 widenScalarDst(MI, WideTy, OpIdx: 0);
2752 Observer.changedInstr(MI);
2753 return Legalized;
2754 }
2755
2756 Register SrcReg = MI.getOperand(i: 1).getReg();
2757
2758 // First extend the input.
2759 unsigned ExtOpc = Opcode == TargetOpcode::G_CTTZ ||
2760 Opcode == TargetOpcode::G_CTTZ_ZERO_UNDEF
2761 ? TargetOpcode::G_ANYEXT
2762 : TargetOpcode::G_ZEXT;
2763 auto MIBSrc = MIRBuilder.buildInstr(Opc: ExtOpc, DstOps: {WideTy}, SrcOps: {SrcReg});
2764 LLT CurTy = MRI.getType(Reg: SrcReg);
2765 unsigned NewOpc = Opcode;
2766 if (NewOpc == TargetOpcode::G_CTTZ) {
2767 // The count is the same in the larger type except if the original
2768 // value was zero. This can be handled by setting the bit just off
2769 // the top of the original type.
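      // e.g. for s8 widened to s32, OR in bit 8 so a zero input yields
      // cttz == 8, the correct result for the original s8 value.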
2770 auto TopBit =
2771 APInt::getOneBitSet(numBits: WideTy.getSizeInBits(), BitNo: CurTy.getSizeInBits());
2772 MIBSrc = MIRBuilder.buildOr(
2773 Dst: WideTy, Src0: MIBSrc, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: TopBit));
      // Now that we know the operand is non-zero, use the more relaxed
      // opcode.
2775 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2776 }
2777
2778 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2779
2780 if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // An optimization: the result is the CTLZ after left-shifting by the
      // difference in size between WideTy and CurTy, that is,
      //   MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
      //   Result = ctlz MIBSrc
2785 MIBSrc = MIRBuilder.buildShl(Dst: WideTy, Src0: MIBSrc,
2786 Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2787 }
2788
2789 // Perform the operation at the larger size.
2790 auto MIBNewOp = MIRBuilder.buildInstr(Opc: NewOpc, DstOps: {WideTy}, SrcOps: {MIBSrc});
    // This is already the correct result for CTPOP and CTTZ.
    if (Opcode == TargetOpcode::G_CTLZ) {
      // The correct result is NewOp - (difference in size between WideTy and
      // CurTy).
2794 MIBNewOp = MIRBuilder.buildSub(
2795 Dst: WideTy, Src0: MIBNewOp, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2796 }
2797
2798 MIRBuilder.buildZExtOrTrunc(Res: MI.getOperand(i: 0), Op: MIBNewOp);
2799 MI.eraseFromParent();
2800 return Legalized;
2801 }
2802 case TargetOpcode::G_BSWAP: {
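    // Swapping at the wider width leaves the bytes of interest at the high
    // end, so shift them back down before truncating. Sketch for s16
    // widened to s32:
    //   %w:_(s32) = G_BSWAP (G_ANYEXT %src)   ; bytes land in bits [16,32)
    //   %dst:_(s16) = G_TRUNC (G_LSHR %w, 16)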
2803 Observer.changingInstr(MI);
2804 Register DstReg = MI.getOperand(i: 0).getReg();
2805
2806 Register ShrReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2807 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2808 Register ShiftAmtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2809 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2810
2811 MI.getOperand(i: 0).setReg(DstExt);
2812
2813 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2814
2815 LLT Ty = MRI.getType(Reg: DstReg);
2816 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2817 MIRBuilder.buildConstant(Res: ShiftAmtReg, Val: DiffBits);
2818 MIRBuilder.buildLShr(Dst: ShrReg, Src0: DstExt, Src1: ShiftAmtReg);
2819
2820 MIRBuilder.buildTrunc(Res: DstReg, Op: ShrReg);
2821 Observer.changedInstr(MI);
2822 return Legalized;
2823 }
2824 case TargetOpcode::G_BITREVERSE: {
2825 Observer.changingInstr(MI);
2826
2827 Register DstReg = MI.getOperand(i: 0).getReg();
2828 LLT Ty = MRI.getType(Reg: DstReg);
2829 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2830
2831 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2832 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2833 MI.getOperand(i: 0).setReg(DstExt);
2834 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2835
2836 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: DiffBits);
2837 auto Shift = MIRBuilder.buildLShr(Dst: WideTy, Src0: DstExt, Src1: ShiftAmt);
2838 MIRBuilder.buildTrunc(Res: DstReg, Op: Shift);
2839 Observer.changedInstr(MI);
2840 return Legalized;
2841 }
2842 case TargetOpcode::G_FREEZE:
2843 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
2844 Observer.changingInstr(MI);
2845 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2846 widenScalarDst(MI, WideTy);
2847 Observer.changedInstr(MI);
2848 return Legalized;
2849
2850 case TargetOpcode::G_ABS:
2851 Observer.changingInstr(MI);
2852 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2853 widenScalarDst(MI, WideTy);
2854 Observer.changedInstr(MI);
2855 return Legalized;
2856
2857 case TargetOpcode::G_ADD:
2858 case TargetOpcode::G_AND:
2859 case TargetOpcode::G_MUL:
2860 case TargetOpcode::G_OR:
2861 case TargetOpcode::G_XOR:
2862 case TargetOpcode::G_SUB:
2863 case TargetOpcode::G_SHUFFLE_VECTOR:
    // Perform operation at larger width (any extension is fine here, high
    // bits don't affect the result) and then truncate the result back to the
    // original type.
2867 Observer.changingInstr(MI);
2868 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2869 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2870 widenScalarDst(MI, WideTy);
2871 Observer.changedInstr(MI);
2872 return Legalized;
2873
2874 case TargetOpcode::G_SBFX:
2875 case TargetOpcode::G_UBFX:
2876 Observer.changingInstr(MI);
2877
2878 if (TypeIdx == 0) {
2879 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2880 widenScalarDst(MI, WideTy);
2881 } else {
2882 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2883 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2884 }
2885
2886 Observer.changedInstr(MI);
2887 return Legalized;
2888
2889 case TargetOpcode::G_SHL:
2890 Observer.changingInstr(MI);
2891
2892 if (TypeIdx == 0) {
2893 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2894 widenScalarDst(MI, WideTy);
2895 } else {
2896 assert(TypeIdx == 1);
2897 // The "number of bits to shift" operand must preserve its value as an
2898 // unsigned integer:
2899 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2900 }
2901
2902 Observer.changedInstr(MI);
2903 return Legalized;
2904
2905 case TargetOpcode::G_ROTR:
2906 case TargetOpcode::G_ROTL:
2907 if (TypeIdx != 1)
2908 return UnableToLegalize;
2909
2910 Observer.changingInstr(MI);
2911 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2912 Observer.changedInstr(MI);
2913 return Legalized;
2914
2915 case TargetOpcode::G_SDIV:
2916 case TargetOpcode::G_SREM:
2917 case TargetOpcode::G_SMIN:
2918 case TargetOpcode::G_SMAX:
2919 Observer.changingInstr(MI);
2920 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2921 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2922 widenScalarDst(MI, WideTy);
2923 Observer.changedInstr(MI);
2924 return Legalized;
2925
2926 case TargetOpcode::G_SDIVREM:
2927 Observer.changingInstr(MI);
2928 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2929 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
2930 widenScalarDst(MI, WideTy);
2931 widenScalarDst(MI, WideTy, OpIdx: 1);
2932 Observer.changedInstr(MI);
2933 return Legalized;
2934
2935 case TargetOpcode::G_ASHR:
2936 case TargetOpcode::G_LSHR:
2937 Observer.changingInstr(MI);
2938
2939 if (TypeIdx == 0) {
2940 unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
2941 : TargetOpcode::G_ZEXT;
2942
2943 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: CvtOp);
2944 widenScalarDst(MI, WideTy);
2945 } else {
2946 assert(TypeIdx == 1);
2947 // The "number of bits to shift" operand must preserve its value as an
2948 // unsigned integer:
2949 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2950 }
2951
2952 Observer.changedInstr(MI);
2953 return Legalized;
2954 case TargetOpcode::G_UDIV:
2955 case TargetOpcode::G_UREM:
2956 Observer.changingInstr(MI);
2957 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
2958 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2959 widenScalarDst(MI, WideTy);
2960 Observer.changedInstr(MI);
2961 return Legalized;
2962 case TargetOpcode::G_UDIVREM:
2963 Observer.changingInstr(MI);
2964 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2965 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2966 widenScalarDst(MI, WideTy);
2967 widenScalarDst(MI, WideTy, OpIdx: 1);
2968 Observer.changedInstr(MI);
2969 return Legalized;
2970 case TargetOpcode::G_UMIN:
2971 case TargetOpcode::G_UMAX: {
2972 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
2973
2974 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
2975 unsigned ExtOpc =
2976 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty, Ctx),
2977 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx))
2978 ? TargetOpcode::G_SEXT
2979 : TargetOpcode::G_ZEXT;
2980
2981 Observer.changingInstr(MI);
2982 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: ExtOpc);
2983 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: ExtOpc);
2984 widenScalarDst(MI, WideTy);
2985 Observer.changedInstr(MI);
2986 return Legalized;
2987 }
2988
2989 case TargetOpcode::G_SELECT:
2990 Observer.changingInstr(MI);
2991 if (TypeIdx == 0) {
2992 // Perform operation at larger width (any extension is fine here, high
2993 // bits don't affect the result) and then truncate the result back to the
2994 // original type.
2995 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2996 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2997 widenScalarDst(MI, WideTy);
2998 } else {
2999 bool IsVec = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector();
3000 // Explicit extension is required here since high bits affect the result.
3001 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec, IsFP: false));
3002 }
3003 Observer.changedInstr(MI);
3004 return Legalized;
3005
3006 case TargetOpcode::G_FPTOSI:
3007 case TargetOpcode::G_FPTOUI:
3008 case TargetOpcode::G_INTRINSIC_LRINT:
3009 case TargetOpcode::G_INTRINSIC_LLRINT:
3010 case TargetOpcode::G_IS_FPCLASS:
3011 Observer.changingInstr(MI);
3012
3013 if (TypeIdx == 0)
3014 widenScalarDst(MI, WideTy);
3015 else
3016 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3017
3018 Observer.changedInstr(MI);
3019 return Legalized;
3020 case TargetOpcode::G_SITOFP:
3021 Observer.changingInstr(MI);
3022
3023 if (TypeIdx == 0)
3024 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3025 else
3026 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
3027
3028 Observer.changedInstr(MI);
3029 return Legalized;
3030 case TargetOpcode::G_UITOFP:
3031 Observer.changingInstr(MI);
3032
3033 if (TypeIdx == 0)
3034 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3035 else
3036 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3037
3038 Observer.changedInstr(MI);
3039 return Legalized;
3040 case TargetOpcode::G_FPTOSI_SAT:
3041 case TargetOpcode::G_FPTOUI_SAT:
3042 Observer.changingInstr(MI);
3043
3044 if (TypeIdx == 0) {
3045 Register OldDst = MI.getOperand(i: 0).getReg();
3046 LLT Ty = MRI.getType(Reg: OldDst);
3047 Register ExtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
3048 Register NewDst;
3049 MI.getOperand(i: 0).setReg(ExtReg);
3050 uint64_t ShortBits = Ty.getScalarSizeInBits();
3051 uint64_t WideBits = WideTy.getScalarSizeInBits();
3052 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
3053 if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
3054 // z = i16 fptosi_sat(a)
3055 // ->
3056 // x = i32 fptosi_sat(a)
3057 // y = smin(x, 32767)
3058 // z = smax(y, -32768)
3059 auto MaxVal = MIRBuilder.buildConstant(
3060 Res: WideTy, Val: APInt::getSignedMaxValue(numBits: ShortBits).sext(width: WideBits));
3061 auto MinVal = MIRBuilder.buildConstant(
3062 Res: WideTy, Val: APInt::getSignedMinValue(numBits: ShortBits).sext(width: WideBits));
3063 Register MidReg =
3064 MIRBuilder.buildSMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3065 NewDst = MIRBuilder.buildSMax(Dst: WideTy, Src0: MidReg, Src1: MinVal).getReg(Idx: 0);
3066 } else {
3067 // z = i16 fptoui_sat(a)
3068 // ->
3069 // x = i32 fptoui_sat(a)
3070        // y = umin(x, 65535)
3071 auto MaxVal = MIRBuilder.buildConstant(
3072 Res: WideTy, Val: APInt::getAllOnes(numBits: ShortBits).zext(width: WideBits));
3073 NewDst = MIRBuilder.buildUMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3074 }
3075 MIRBuilder.buildTrunc(Res: OldDst, Op: NewDst);
3076 } else
3077 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3078
3079 Observer.changedInstr(MI);
3080 return Legalized;
3081 case TargetOpcode::G_LOAD:
3082 case TargetOpcode::G_SEXTLOAD:
3083 case TargetOpcode::G_ZEXTLOAD:
3084 Observer.changingInstr(MI);
3085 widenScalarDst(MI, WideTy);
3086 Observer.changedInstr(MI);
3087 return Legalized;
3088
3089 case TargetOpcode::G_STORE: {
3090 if (TypeIdx != 0)
3091 return UnableToLegalize;
3092
3093 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3094 assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
3095 if (!Ty.isScalar()) {
3096 // We need to widen the vector element type.
3097 Observer.changingInstr(MI);
3098 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ANYEXT);
3099 // We also need to adjust the MMO to turn this into a truncating store.
3100 MachineMemOperand &MMO = **MI.memoperands_begin();
3101 MachineFunction &MF = MIRBuilder.getMF();
3102 auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty);
3103 MI.setMemRefs(MF, MemRefs: {NewMMO});
3104 Observer.changedInstr(MI);
3105 return Legalized;
3106 }
3107
3108 Observer.changingInstr(MI);
3109
3110 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
3111 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
3112 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: ExtType);
3113
3114 Observer.changedInstr(MI);
3115 return Legalized;
3116 }
3117 case TargetOpcode::G_CONSTANT: {
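    // The constant is re-materialized at the wide type with whichever
    // extension the target prefers. A sketch, assuming the target picks
    // G_SEXT for an illustrative s8 -> s32 widening:
    //   %c:_(s8) = G_CONSTANT i8 -1
    // becomes
    //   %c32:_(s32) = G_CONSTANT i32 -1
    //   %c:_(s8) = G_TRUNC %c32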
3118 MachineOperand &SrcMO = MI.getOperand(i: 1);
3119 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3120 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
3121 SmallTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
3122 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
3123 ExtOpc == TargetOpcode::G_ANYEXT) &&
3124 "Illegal Extend");
3125 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3126 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
3127 ? SrcVal.sext(width: WideTy.getSizeInBits())
3128 : SrcVal.zext(width: WideTy.getSizeInBits());
3129 Observer.changingInstr(MI);
3130 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3131
3132 widenScalarDst(MI, WideTy);
3133 Observer.changedInstr(MI);
3134 return Legalized;
3135 }
3136 case TargetOpcode::G_FCONSTANT: {
3137 // To avoid changing the bits of the constant due to extension to a larger
3138 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
3139 MachineOperand &SrcMO = MI.getOperand(i: 1);
3140 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
3141 MIRBuilder.setInstrAndDebugLoc(MI);
3142 auto IntCst = MIRBuilder.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val);
3143 widenScalarDst(MI&: *IntCst, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3144 MI.eraseFromParent();
3145 return Legalized;
3146 }
3147 case TargetOpcode::G_IMPLICIT_DEF: {
3148 Observer.changingInstr(MI);
3149 widenScalarDst(MI, WideTy);
3150 Observer.changedInstr(MI);
3151 return Legalized;
3152 }
3153 case TargetOpcode::G_BRCOND:
3154 Observer.changingInstr(MI);
3155 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec: false, IsFP: false));
3156 Observer.changedInstr(MI);
3157 return Legalized;
3158
3159 case TargetOpcode::G_FCMP:
3160 Observer.changingInstr(MI);
3161 if (TypeIdx == 0)
3162 widenScalarDst(MI, WideTy);
3163 else {
3164 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3165 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_FPEXT);
3166 }
3167 Observer.changedInstr(MI);
3168 return Legalized;
3169
3170 case TargetOpcode::G_ICMP:
3171 Observer.changingInstr(MI);
3172 if (TypeIdx == 0)
3173 widenScalarDst(MI, WideTy);
3174 else {
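      // The extension must match the signedness of the comparison. A sketch
      // for a signed predicate with an illustrative s8 -> s32 widening:
      //   %c:_(s1) = G_ICMP intpred(slt), %a:_(s8), %b:_(s8)
      // becomes
      //   %a32:_(s32) = G_SEXT %a
      //   %b32:_(s32) = G_SEXT %b
      //   %c:_(s1) = G_ICMP intpred(slt), %a32, %b32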
3175 LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 2).getReg());
3176 CmpInst::Predicate Pred =
3177 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
3178
3179 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3180 unsigned ExtOpcode =
3181 (CmpInst::isSigned(predicate: Pred) ||
3182 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty: SrcTy, Ctx),
3183 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx)))
3184 ? TargetOpcode::G_SEXT
3185 : TargetOpcode::G_ZEXT;
3186 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode);
3187 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode);
3188 }
3189 Observer.changedInstr(MI);
3190 return Legalized;
3191
3192 case TargetOpcode::G_PTR_ADD:
3193 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
3194 Observer.changingInstr(MI);
3195 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3196 Observer.changedInstr(MI);
3197 return Legalized;
3198
3199 case TargetOpcode::G_PHI: {
3200 assert(TypeIdx == 0 && "Expecting only Idx 0");
3201
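    // Each incoming value is extended in its predecessor block, and the
    // truncate is placed after all the PHIs. A sketch with an illustrative
    // s16 -> s32 widening:
    //   %p:_(s16) = G_PHI %a:_(s16), %bb.1, %b:_(s16), %bb.2
    // becomes
    //   (in %bb.1) %a32:_(s32) = G_ANYEXT %a
    //   (in %bb.2) %b32:_(s32) = G_ANYEXT %b
    //   %p32:_(s32) = G_PHI %a32:_(s32), %bb.1, %b32:_(s32), %bb.2
    //   %p:_(s16) = G_TRUNC %p32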
3202 Observer.changingInstr(MI);
3203 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
3204 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
3205 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
3206 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3207 }
3208
3209 MachineBasicBlock &MBB = *MI.getParent();
3210 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
3211 widenScalarDst(MI, WideTy);
3212 Observer.changedInstr(MI);
3213 return Legalized;
3214 }
3215 case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
3216 if (TypeIdx == 0) {
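      // Widening the result also widens the vector operand's element type.
      // A sketch with an illustrative s8 -> s32 widening:
      //   %e:_(s8) = G_EXTRACT_VECTOR_ELT %v:_(<4 x s8>), %idx
      // becomes
      //   %v32:_(<4 x s32>) = G_ANYEXT %v
      //   %e32:_(s32) = G_EXTRACT_VECTOR_ELT %v32, %idx
      //   %e:_(s8) = G_TRUNC %e32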
3217 Register VecReg = MI.getOperand(i: 1).getReg();
3218 LLT VecTy = MRI.getType(Reg: VecReg);
3219 Observer.changingInstr(MI);
3220
3221 widenScalarSrc(
3222 MI, WideTy: LLT::vector(EC: VecTy.getElementCount(), ScalarSizeInBits: WideTy.getSizeInBits()), OpIdx: 1,
3223 ExtOpcode: TargetOpcode::G_ANYEXT);
3224
3225 widenScalarDst(MI, WideTy, OpIdx: 0);
3226 Observer.changedInstr(MI);
3227 return Legalized;
3228 }
3229
3230 if (TypeIdx != 2)
3231 return UnableToLegalize;
3232 Observer.changingInstr(MI);
3233 // TODO: Probably should be zext
3234 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3235 Observer.changedInstr(MI);
3236 return Legalized;
3237 }
3238 case TargetOpcode::G_INSERT_VECTOR_ELT: {
3239 if (TypeIdx == 0) {
3240 Observer.changingInstr(MI);
3241 const LLT WideEltTy = WideTy.getElementType();
3242
3243 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3244 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3245 widenScalarDst(MI, WideTy, OpIdx: 0);
3246 Observer.changedInstr(MI);
3247 return Legalized;
3248 }
3249
3250 if (TypeIdx == 1) {
3251 Observer.changingInstr(MI);
3252
3253 Register VecReg = MI.getOperand(i: 1).getReg();
3254 LLT VecTy = MRI.getType(Reg: VecReg);
3255 LLT WideVecTy = LLT::vector(EC: VecTy.getElementCount(), ScalarTy: WideTy);
3256
3257 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3258 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3259 widenScalarDst(MI, WideTy: WideVecTy, OpIdx: 0);
3260 Observer.changedInstr(MI);
3261 return Legalized;
3262 }
3263
3264 if (TypeIdx == 2) {
3265 Observer.changingInstr(MI);
3266 // TODO: Probably should be zext
3267 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
3268 Observer.changedInstr(MI);
3269 return Legalized;
3270 }
3271
3272 return UnableToLegalize;
3273 }
3274 case TargetOpcode::G_FADD:
3275 case TargetOpcode::G_FMUL:
3276 case TargetOpcode::G_FSUB:
3277 case TargetOpcode::G_FMA:
3278 case TargetOpcode::G_FMAD:
3279 case TargetOpcode::G_FNEG:
3280 case TargetOpcode::G_FABS:
3281 case TargetOpcode::G_FCANONICALIZE:
3282 case TargetOpcode::G_FMINNUM:
3283 case TargetOpcode::G_FMAXNUM:
3284 case TargetOpcode::G_FMINNUM_IEEE:
3285 case TargetOpcode::G_FMAXNUM_IEEE:
3286 case TargetOpcode::G_FMINIMUM:
3287 case TargetOpcode::G_FMAXIMUM:
3288 case TargetOpcode::G_FMINIMUMNUM:
3289 case TargetOpcode::G_FMAXIMUMNUM:
3290 case TargetOpcode::G_FDIV:
3291 case TargetOpcode::G_FREM:
3292 case TargetOpcode::G_FCEIL:
3293 case TargetOpcode::G_FFLOOR:
3294 case TargetOpcode::G_FCOS:
3295 case TargetOpcode::G_FSIN:
3296 case TargetOpcode::G_FTAN:
3297 case TargetOpcode::G_FACOS:
3298 case TargetOpcode::G_FASIN:
3299 case TargetOpcode::G_FATAN:
3300 case TargetOpcode::G_FATAN2:
3301 case TargetOpcode::G_FCOSH:
3302 case TargetOpcode::G_FSINH:
3303 case TargetOpcode::G_FTANH:
3304 case TargetOpcode::G_FLOG10:
3305 case TargetOpcode::G_FLOG:
3306 case TargetOpcode::G_FLOG2:
3307 case TargetOpcode::G_FRINT:
3308 case TargetOpcode::G_FNEARBYINT:
3309 case TargetOpcode::G_FSQRT:
3310 case TargetOpcode::G_FEXP:
3311 case TargetOpcode::G_FEXP2:
3312 case TargetOpcode::G_FEXP10:
3313 case TargetOpcode::G_FPOW:
3314 case TargetOpcode::G_INTRINSIC_TRUNC:
3315 case TargetOpcode::G_INTRINSIC_ROUND:
3316 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
3317 assert(TypeIdx == 0);
3318 Observer.changingInstr(MI);
3319
3320 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3321 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_FPEXT);
3322
3323 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3324 Observer.changedInstr(MI);
3325 return Legalized;
3326 case TargetOpcode::G_FPOWI:
3327 case TargetOpcode::G_FLDEXP:
3328 case TargetOpcode::G_STRICT_FLDEXP: {
3329 if (TypeIdx == 0) {
3330 if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
3331 return UnableToLegalize;
3332
3333 Observer.changingInstr(MI);
3334 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3335 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3336 Observer.changedInstr(MI);
3337 return Legalized;
3338 }
3339
3340 if (TypeIdx == 1) {
3341 // For some reason SelectionDAG tries to promote to a libcall without
3342 // actually changing the integer type for promotion.
3343 Observer.changingInstr(MI);
3344 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3345 Observer.changedInstr(MI);
3346 return Legalized;
3347 }
3348
3349 return UnableToLegalize;
3350 }
3351 case TargetOpcode::G_FFREXP: {
3352 Observer.changingInstr(MI);
3353
3354 if (TypeIdx == 0) {
3355 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3356 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3357 } else {
3358 widenScalarDst(MI, WideTy, OpIdx: 1);
3359 }
3360
3361 Observer.changedInstr(MI);
3362 return Legalized;
3363 }
3364 case TargetOpcode::G_INTTOPTR:
3365 if (TypeIdx != 1)
3366 return UnableToLegalize;
3367
3368 Observer.changingInstr(MI);
3369 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3370 Observer.changedInstr(MI);
3371 return Legalized;
3372 case TargetOpcode::G_PTRTOINT:
3373 if (TypeIdx != 0)
3374 return UnableToLegalize;
3375
3376 Observer.changingInstr(MI);
3377 widenScalarDst(MI, WideTy, OpIdx: 0);
3378 Observer.changedInstr(MI);
3379 return Legalized;
3380 case TargetOpcode::G_BUILD_VECTOR: {
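    // When TypeIdx == 1 only the sources are widened, and the opcode becomes
    // G_BUILD_VECTOR_TRUNC so the result vector keeps its type. A sketch with
    // an illustrative s8 -> s32 widening:
    //   %v:_(<2 x s8>) = G_BUILD_VECTOR %a:_(s8), %b:_(s8)
    // becomes
    //   %a32:_(s32) = G_ANYEXT %a
    //   %b32:_(s32) = G_ANYEXT %b
    //   %v:_(<2 x s8>) = G_BUILD_VECTOR_TRUNC %a32, %b32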
3381 Observer.changingInstr(MI);
3382
3383 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3384 for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3385 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3386
3387 // Avoid changing the result vector type if the source element type was
3388 // requested.
3389 if (TypeIdx == 1) {
3390 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::G_BUILD_VECTOR_TRUNC));
3391 } else {
3392 widenScalarDst(MI, WideTy, OpIdx: 0);
3393 }
3394
3395 Observer.changedInstr(MI);
3396 return Legalized;
3397 }
3398 case TargetOpcode::G_SEXT_INREG:
3399 if (TypeIdx != 0)
3400 return UnableToLegalize;
3401
3402 Observer.changingInstr(MI);
3403 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3404 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3405 Observer.changedInstr(MI);
3406 return Legalized;
3407 case TargetOpcode::G_PTRMASK: {
3408 if (TypeIdx != 1)
3409 return UnableToLegalize;
3410 Observer.changingInstr(MI);
3411 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3412 Observer.changedInstr(MI);
3413 return Legalized;
3414 }
3415 case TargetOpcode::G_VECREDUCE_ADD: {
3416 if (TypeIdx != 1)
3417 return UnableToLegalize;
3418 Observer.changingInstr(MI);
3419 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3420 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3421 Observer.changedInstr(MI);
3422 return Legalized;
3423 }
3424 case TargetOpcode::G_VECREDUCE_FADD:
3425 case TargetOpcode::G_VECREDUCE_FMUL:
3426 case TargetOpcode::G_VECREDUCE_FMIN:
3427 case TargetOpcode::G_VECREDUCE_FMAX:
3428 case TargetOpcode::G_VECREDUCE_FMINIMUM:
3429 case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
3430 if (TypeIdx != 0)
3431 return UnableToLegalize;
3432 Observer.changingInstr(MI);
3433 Register VecReg = MI.getOperand(i: 1).getReg();
3434 LLT VecTy = MRI.getType(Reg: VecReg);
3435 LLT WideVecTy = VecTy.isVector()
3436 ? LLT::vector(EC: VecTy.getElementCount(), ScalarTy: WideTy)
3437 : WideTy;
3438 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3439 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3440 Observer.changedInstr(MI);
3441 return Legalized;
3442 }
3443 case TargetOpcode::G_VSCALE: {
3444 MachineOperand &SrcMO = MI.getOperand(i: 1);
3445 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3446 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3447    // The CImm is always a signed value.
3448 const APInt Val = SrcVal.sext(width: WideTy.getSizeInBits());
3449 Observer.changingInstr(MI);
3450 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3451 widenScalarDst(MI, WideTy);
3452 Observer.changedInstr(MI);
3453 return Legalized;
3454 }
3455 case TargetOpcode::G_SPLAT_VECTOR: {
3456 if (TypeIdx != 1)
3457 return UnableToLegalize;
3458
3459 Observer.changingInstr(MI);
3460 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3461 Observer.changedInstr(MI);
3462 return Legalized;
3463 }
3464 case TargetOpcode::G_INSERT_SUBVECTOR: {
3465 if (TypeIdx != 0)
3466 return UnableToLegalize;
3467
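  // The operands are zero-extended to the wide element type, the insert is
  // performed there, and an inequality comparison against zero recovers the
  // boolean vector. A sketch, assuming an illustrative widening of
  // <vscale x 16 x s1> to <vscale x 16 x s8>:
  //   %bigz:_(<vscale x 16 x s8>) = G_ZEXT %big
  //   %subz:_(<vscale x 8 x s8>) = G_ZEXT %sub
  //   %ins:_(<vscale x 16 x s8>) = G_INSERT_SUBVECTOR %bigz, %subz, N
  //   %zero:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR 0
  //   %dst:_(<vscale x 16 x s1>) = G_ICMP intpred(ne), %ins, %zero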
3468 GInsertSubvector &IS = cast<GInsertSubvector>(Val&: MI);
3469 Register BigVec = IS.getBigVec();
3470 Register SubVec = IS.getSubVec();
3471
3472 LLT SubVecTy = MRI.getType(Reg: SubVec);
3473 LLT SubVecWideTy = SubVecTy.changeElementType(NewEltTy: WideTy.getElementType());
3474
3475 // Widen the G_INSERT_SUBVECTOR
3476 auto BigZExt = MIRBuilder.buildZExt(Res: WideTy, Op: BigVec);
3477 auto SubZExt = MIRBuilder.buildZExt(Res: SubVecWideTy, Op: SubVec);
3478 auto WideInsert = MIRBuilder.buildInsertSubvector(Res: WideTy, Src0: BigZExt, Src1: SubZExt,
3479 Index: IS.getIndexImm());
3480
3481 // Truncate back down
3482 auto SplatZero = MIRBuilder.buildSplatVector(
3483 Res: WideTy, Val: MIRBuilder.buildConstant(Res: WideTy.getElementType(), Val: 0));
3484 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: IS.getReg(Idx: 0), Op0: WideInsert,
3485 Op1: SplatZero);
3486
3487 MI.eraseFromParent();
3488
3489 return Legalized;
3490 }
3491 }
3492}
3493
3494static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3495 MachineIRBuilder &B, Register Src, LLT Ty) {
3496 auto Unmerge = B.buildUnmerge(Res: Ty, Op: Src);
3497 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3498 Pieces.push_back(Elt: Unmerge.getReg(Idx: I));
3499}
3500
3501static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3502 MachineIRBuilder &MIRBuilder) {
3503 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3504 MachineFunction &MF = MIRBuilder.getMF();
3505 const DataLayout &DL = MIRBuilder.getDataLayout();
3506 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3507 LLT AddrPtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
3508 LLT DstLLT = MRI.getType(Reg: DstReg);
3509
3510 Align Alignment(DL.getABITypeAlign(Ty: ConstVal->getType()));
3511
3512 auto Addr = MIRBuilder.buildConstantPool(
3513 Res: AddrPtrTy,
3514 Idx: MF.getConstantPool()->getConstantPoolIndex(C: ConstVal, Alignment));
3515
3516 MachineMemOperand *MMO =
3517 MF.getMachineMemOperand(PtrInfo: MachinePointerInfo::getConstantPool(MF),
3518 f: MachineMemOperand::MOLoad, MemTy: DstLLT, base_alignment: Alignment);
3519
3520 MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: DstReg, Addr, MMO&: *MMO);
3521}
3522
3523LegalizerHelper::LegalizeResult
3524LegalizerHelper::lowerConstant(MachineInstr &MI) {
3525 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3526 const Constant *ConstantVal = ConstOperand.getCImm();
3527
3528 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3529 MI.eraseFromParent();
3530
3531 return Legalized;
3532}
3533
3534LegalizerHelper::LegalizeResult
3535LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3536 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3537 const Constant *ConstantVal = ConstOperand.getFPImm();
3538
3539 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3540 MI.eraseFromParent();
3541
3542 return Legalized;
3543}
3544
3545LegalizerHelper::LegalizeResult
3546LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3547 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3548 if (SrcTy.isVector()) {
3549 LLT SrcEltTy = SrcTy.getElementType();
3550 SmallVector<Register, 8> SrcRegs;
3551
3552 if (DstTy.isVector()) {
3553 int NumDstElt = DstTy.getNumElements();
3554 int NumSrcElt = SrcTy.getNumElements();
3555
3556 LLT DstEltTy = DstTy.getElementType();
3557 LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3558 LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3559
3560 // If there's an element size mismatch, insert intermediate casts to match
3561 // the result element type.
3562 if (NumSrcElt < NumDstElt) { // Source element type is larger.
3563 // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3564 //
3565 // =>
3566 //
3567        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3568        // %4:_(<2 x s8>) = G_BITCAST %2
3569        // %5:_(<2 x s8>) = G_BITCAST %3
3570        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
3571 DstCastTy = LLT::fixed_vector(NumElements: NumDstElt / NumSrcElt, ScalarTy: DstEltTy);
3572 SrcPartTy = SrcEltTy;
3573 } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3574 //
3575 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3576 //
3577 // =>
3578 //
3579        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3580        // %4:_(s16) = G_BITCAST %2
3581        // %5:_(s16) = G_BITCAST %3
3582        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
3583 SrcPartTy = LLT::fixed_vector(NumElements: NumSrcElt / NumDstElt, ScalarTy: SrcEltTy);
3584 DstCastTy = DstEltTy;
3585 }
3586
3587 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcPartTy);
3588 for (Register &SrcReg : SrcRegs)
3589 SrcReg = MIRBuilder.buildBitcast(Dst: DstCastTy, Src: SrcReg).getReg(Idx: 0);
3590 } else
3591 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcEltTy);
3592
3593 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3594 MI.eraseFromParent();
3595 return Legalized;
3596 }
3597
3598 if (DstTy.isVector()) {
3599 SmallVector<Register, 8> SrcRegs;
3600 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: DstTy.getElementType());
3601 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3602 MI.eraseFromParent();
3603 return Legalized;
3604 }
3605
3606 return UnableToLegalize;
3607}
3608
3609/// Figure out the bit offset into a register when coercing a vector index for
3610/// the wide element type. This is only for the case when promoting a vector
3611/// to one with larger elements.
3612///
3614/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3615/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
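///
/// E.g. for s8 elements packed into s32 lanes (a sketch), Log2EltRatio is 2,
/// so %offset_idx = %idx & 3 and %offset_bits = %offset_idx << 3.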
3616static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3617 Register Idx,
3618 unsigned NewEltSize,
3619 unsigned OldEltSize) {
3620 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3621 LLT IdxTy = B.getMRI()->getType(Reg: Idx);
3622
3623 // Now figure out the amount we need to shift to get the target bits.
3624 auto OffsetMask = B.buildConstant(
3625 Res: IdxTy, Val: ~(APInt::getAllOnes(numBits: IdxTy.getSizeInBits()) << Log2EltRatio));
3626 auto OffsetIdx = B.buildAnd(Dst: IdxTy, Src0: Idx, Src1: OffsetMask);
3627 return B.buildShl(Dst: IdxTy, Src0: OffsetIdx,
3628 Src1: B.buildConstant(Res: IdxTy, Val: Log2_32(Value: OldEltSize))).getReg(Idx: 0);
3629}
3630
3631/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3632/// is casting to a vector with a smaller element size, perform multiple element
3633/// extracts and merge the results. If this is coercing to a vector with larger
3634/// elements, index the bitcasted vector and extract the target element with bit
3635/// operations. This is intended to force the indexing in the native register
3636/// size for architectures that can dynamically index the register file.
3637LegalizerHelper::LegalizeResult
3638LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3639 LLT CastTy) {
3640 if (TypeIdx != 1)
3641 return UnableToLegalize;
3642
3643 auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3644
3645 LLT SrcEltTy = SrcVecTy.getElementType();
3646 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3647 unsigned OldNumElts = SrcVecTy.getNumElements();
3648
3649 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3650 Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
3651
3652 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3653 const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3654 if (NewNumElts > OldNumElts) {
3655 // Decreasing the vector element size
3656 //
3657 // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3658 // =>
3659 // v4i32:castx = bitcast x:v2i64
3660 //
3661 // i64 = bitcast
3662 // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3663    //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
3664 //
3665 if (NewNumElts % OldNumElts != 0)
3666 return UnableToLegalize;
3667
3668 // Type of the intermediate result vector.
3669 const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3670 LLT MidTy =
3671 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: NewEltsPerOldElt), ScalarTy: NewEltTy);
3672
3673 auto NewEltsPerOldEltK = MIRBuilder.buildConstant(Res: IdxTy, Val: NewEltsPerOldElt);
3674
3675 SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3676 auto NewBaseIdx = MIRBuilder.buildMul(Dst: IdxTy, Src0: Idx, Src1: NewEltsPerOldEltK);
3677
3678 for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3679 auto IdxOffset = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
3680 auto TmpIdx = MIRBuilder.buildAdd(Dst: IdxTy, Src0: NewBaseIdx, Src1: IdxOffset);
3681 auto Elt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec, Idx: TmpIdx);
3682 NewOps[I] = Elt.getReg(Idx: 0);
3683 }
3684
3685 auto NewVec = MIRBuilder.buildBuildVector(Res: MidTy, Ops: NewOps);
3686 MIRBuilder.buildBitcast(Dst, Src: NewVec);
3687 MI.eraseFromParent();
3688 return Legalized;
3689 }
3690
3691 if (NewNumElts < OldNumElts) {
3692 if (NewEltSize % OldEltSize != 0)
3693 return UnableToLegalize;
3694
3695 // This only depends on powers of 2 because we use bit tricks to figure out
3696 // the bit offset we need to shift to get the target element. A general
3697 // expansion could emit division/multiply.
3698 if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
3699 return UnableToLegalize;
3700
3701 // Increasing the vector element size.
3702 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3703 //
3704 // =>
3705 //
3706 // %cast = G_BITCAST %vec
3707 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3708 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3709 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3710 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3711 // %elt_bits = G_LSHR %wide_elt, %offset_bits
3712 // %elt = G_TRUNC %elt_bits
3713
3714 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3715 auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);
3716
3717 // Divide to get the index in the wider element type.
3718 auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);
3719
3720 Register WideElt = CastVec;
3721 if (CastTy.isVector()) {
3722 WideElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
3723 Idx: ScaledIdx).getReg(Idx: 0);
3724 }
3725
3726 // Compute the bit offset into the register of the target element.
3727 Register OffsetBits = getBitcastWiderVectorElementOffset(
3728 B&: MIRBuilder, Idx, NewEltSize, OldEltSize);
3729
3730 // Shift the wide element to get the target element.
3731 auto ExtractedBits = MIRBuilder.buildLShr(Dst: NewEltTy, Src0: WideElt, Src1: OffsetBits);
3732 MIRBuilder.buildTrunc(Res: Dst, Op: ExtractedBits);
3733 MI.eraseFromParent();
3734 return Legalized;
3735 }
3736
3737 return UnableToLegalize;
3738}
3739
3740/// Emit code to insert \p InsertReg into \p TargetReg at bit offset
3741/// \p OffsetBits, while preserving the other bits of \p TargetReg.
3742///
3743/// (ZExt(InsertReg) << OffsetBits) | (TargetReg & ~(LowBitsMask(InsertSize) << OffsetBits))
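///
/// For example, a sketch with a 32-bit TargetReg, an 8-bit InsertReg, and
/// OffsetBits = 8:
///   mask   = 0x0000FF00
///   result = (TargetReg & ~mask) | (ZExt(InsertReg) << 8)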
3744static Register buildBitFieldInsert(MachineIRBuilder &B,
3745 Register TargetReg, Register InsertReg,
3746 Register OffsetBits) {
3747 LLT TargetTy = B.getMRI()->getType(Reg: TargetReg);
3748 LLT InsertTy = B.getMRI()->getType(Reg: InsertReg);
3749 auto ZextVal = B.buildZExt(Res: TargetTy, Op: InsertReg);
3750 auto ShiftedInsertVal = B.buildShl(Dst: TargetTy, Src0: ZextVal, Src1: OffsetBits);
3751
3752 // Produce a bitmask of the value to insert
3753 auto EltMask = B.buildConstant(
3754 Res: TargetTy, Val: APInt::getLowBitsSet(numBits: TargetTy.getSizeInBits(),
3755 loBitsSet: InsertTy.getSizeInBits()));
3756 // Shift it into position
3757 auto ShiftedMask = B.buildShl(Dst: TargetTy, Src0: EltMask, Src1: OffsetBits);
3758 auto InvShiftedMask = B.buildNot(Dst: TargetTy, Src0: ShiftedMask);
3759
3760 // Clear out the bits in the wide element
3761 auto MaskedOldElt = B.buildAnd(Dst: TargetTy, Src0: TargetReg, Src1: InvShiftedMask);
3762
3763 // The value to insert has all zeros already, so stick it into the masked
3764 // wide element.
3765 return B.buildOr(Dst: TargetTy, Src0: MaskedOldElt, Src1: ShiftedInsertVal).getReg(Idx: 0);
3766}
3767
3768/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3769/// is increasing the element size, perform the indexing in the target element
3770/// type, and use bit operations to insert at the element position. This is
3771/// intended for architectures that can dynamically index the register file and
3772/// want to force indexing in the native register size.
3773LegalizerHelper::LegalizeResult
3774LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3775 LLT CastTy) {
3776 if (TypeIdx != 0)
3777 return UnableToLegalize;
3778
3779 auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3780 MI.getFirst4RegLLTs();
3781 LLT VecTy = DstTy;
3782
3783 LLT VecEltTy = VecTy.getElementType();
3784 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3785 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3786 const unsigned OldEltSize = VecEltTy.getSizeInBits();
3787
3788 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3789 unsigned OldNumElts = VecTy.getNumElements();
3790
3791 Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
3792 if (NewNumElts < OldNumElts) {
3793 if (NewEltSize % OldEltSize != 0)
3794 return UnableToLegalize;
3795
3796 // This only depends on powers of 2 because we use bit tricks to figure out
3797 // the bit offset we need to shift to get the target element. A general
3798 // expansion could emit division/multiply.
3799 if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
3800 return UnableToLegalize;
3801
3802 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3803 auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);
3804
3805 // Divide to get the index in the wider element type.
3806 auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);
3807
3808 Register ExtractedElt = CastVec;
3809 if (CastTy.isVector()) {
3810 ExtractedElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
3811 Idx: ScaledIdx).getReg(Idx: 0);
3812 }
3813
3814 // Compute the bit offset into the register of the target element.
3815 Register OffsetBits = getBitcastWiderVectorElementOffset(
3816 B&: MIRBuilder, Idx, NewEltSize, OldEltSize);
3817
3818 Register InsertedElt = buildBitFieldInsert(B&: MIRBuilder, TargetReg: ExtractedElt,
3819 InsertReg: Val, OffsetBits);
3820 if (CastTy.isVector()) {
3821 InsertedElt = MIRBuilder.buildInsertVectorElement(
3822 Res: CastTy, Val: CastVec, Elt: InsertedElt, Idx: ScaledIdx).getReg(Idx: 0);
3823 }
3824
3825 MIRBuilder.buildBitcast(Dst, Src: InsertedElt);
3826 MI.eraseFromParent();
3827 return Legalized;
3828 }
3829
3830 return UnableToLegalize;
3831}
3832
3833// This attempts to handle G_CONCAT_VECTORS with illegal operands,
3834// particularly those with smaller-than-legal operands.
3835//
3836// <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3837//
3838// ===>
3839//
3840// s32 = G_BITCAST <4 x s8>
3841// s32 = G_BITCAST <4 x s8>
3842// s32 = G_BITCAST <4 x s8>
3843// s32 = G_BITCAST <4 x s8>
3844// <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3845// <16 x s8> = G_BITCAST <4 x s32>
3846LegalizerHelper::LegalizeResult
3847LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3848 LLT CastTy) {
3849  // Bail out unless this really is a G_CONCAT_VECTORS.
3850 auto ConcatMI = dyn_cast<GConcatVectors>(Val: &MI);
3851 if (!ConcatMI) {
3852 return UnableToLegalize;
3853 }
3854
3855  // Compute the scalar type the sources will be bitcast to.
3856 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3857 LLT SrcScalTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
3858
3859  // Check that the resulting G_BUILD_VECTOR is legal.
3860 if (!LI.isLegal(Query: {TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3861 return UnableToLegalize;
3862 }
3863
3864 // Bitcast the sources
3865 SmallVector<Register> BitcastRegs;
3866 for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3867 BitcastRegs.push_back(
3868 Elt: MIRBuilder.buildBitcast(Dst: SrcScalTy, Src: ConcatMI->getSourceReg(I: i))
3869 .getReg(Idx: 0));
3870 }
3871
3872 // Build the scalar values into a vector
3873 Register BuildReg =
3874 MIRBuilder.buildBuildVector(Res: CastTy, Ops: BitcastRegs).getReg(Idx: 0);
3875 MIRBuilder.buildBitcast(Dst: DstReg, Src: BuildReg);
3876
3877 MI.eraseFromParent();
3878 return Legalized;
3879}
3880
3881// This bitcasts a shuffle vector to a different type currently of the same
3882// element size. Mostly used to legalize ptr vectors, where ptrtoint/inttoptr
3883// will be used instead.
3884//
3885// <16 x p0> = G_SHUFFLE_VECTOR <4 x p0>, <4 x p0>, mask
3886// ===>
3887// <4 x s64> = G_PTRTOINT <4 x p0>
3888// <4 x s64> = G_PTRTOINT <4 x p0>
3889// <16 x s64> = G_SHUFFLE_VECTOR <4 x s64>, <4 x s64>, mask
3890// <16 x p0> = G_INTTOPTR <16 x s64>
3891LegalizerHelper::LegalizeResult
3892LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3893 LLT CastTy) {
3894 auto ShuffleMI = cast<GShuffleVector>(Val: &MI);
3895 LLT DstTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 0));
3896 LLT SrcTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 1));
3897
3898 // We currently only handle vectors of the same size.
3899 if (TypeIdx != 0 ||
3900 CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
3901 CastTy.getElementCount() != DstTy.getElementCount())
3902 return UnableToLegalize;
3903
3904 LLT NewSrcTy = SrcTy.changeElementType(NewEltTy: CastTy.getScalarType());
3905
3906 auto Inp1 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 1));
3907 auto Inp2 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 2));
3908 auto Shuf =
3909 MIRBuilder.buildShuffleVector(Res: CastTy, Src1: Inp1, Src2: Inp2, Mask: ShuffleMI->getMask());
3910 MIRBuilder.buildCast(Dst: ShuffleMI->getReg(Idx: 0), Src: Shuf);
3911
3912 MI.eraseFromParent();
3913 return Legalized;
3914}
3915
3916/// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
3917///
3918/// <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
3919///
3920/// ===>
3921///
3922/// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
3923/// <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i8>, N / 8
3924/// <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
3925LegalizerHelper::LegalizeResult
3926LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
3927 LLT CastTy) {
3928 auto ES = cast<GExtractSubvector>(Val: &MI);
3929
3930 if (!CastTy.isVector())
3931 return UnableToLegalize;
3932
3933 if (TypeIdx != 0)
3934 return UnableToLegalize;
3935
3936 Register Dst = ES->getReg(Idx: 0);
3937 Register Src = ES->getSrcVec();
3938 uint64_t Idx = ES->getIndexImm();
3939
3940 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3941
3942 LLT DstTy = MRI.getType(Reg: Dst);
3943 LLT SrcTy = MRI.getType(Reg: Src);
3944 ElementCount DstTyEC = DstTy.getElementCount();
3945 ElementCount SrcTyEC = SrcTy.getElementCount();
3946 auto DstTyMinElts = DstTyEC.getKnownMinValue();
3947 auto SrcTyMinElts = SrcTyEC.getKnownMinValue();
3948
3949 if (DstTy == CastTy)
3950 return Legalized;
3951
3952 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
3953 return UnableToLegalize;
3954
3955 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
3956 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
3957 if (CastEltSize < DstEltSize)
3958 return UnableToLegalize;
3959
3960 auto AdjustAmt = CastEltSize / DstEltSize;
3961 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
3962 SrcTyMinElts % AdjustAmt != 0)
3963 return UnableToLegalize;
3964
3965 Idx /= AdjustAmt;
3966 SrcTy = LLT::vector(EC: SrcTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
3967 auto CastVec = MIRBuilder.buildBitcast(Dst: SrcTy, Src);
3968 auto PromotedES = MIRBuilder.buildExtractSubvector(Res: CastTy, Src: CastVec, Index: Idx);
3969 MIRBuilder.buildBitcast(Dst, Src: PromotedES);
3970
3971 ES->eraseFromParent();
3972 return Legalized;
3973}
3974
3975/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
3976///
3977/// <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
3978/// <vscale x 8 x i1>,
3979/// N
3980///
3981/// ===>
3982///
3983/// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
3984/// <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
3985/// <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
3986/// <vscale x 1 x i8>, N / 8
3987/// <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
3988LegalizerHelper::LegalizeResult
3989LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
3990 LLT CastTy) {
3991 auto ES = cast<GInsertSubvector>(Val: &MI);
3992
3993 if (!CastTy.isVector())
3994 return UnableToLegalize;
3995
3996 if (TypeIdx != 0)
3997 return UnableToLegalize;
3998
3999 Register Dst = ES->getReg(Idx: 0);
4000 Register BigVec = ES->getBigVec();
4001 Register SubVec = ES->getSubVec();
4002 uint64_t Idx = ES->getIndexImm();
4003
4004 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4005
4006 LLT DstTy = MRI.getType(Reg: Dst);
4007 LLT BigVecTy = MRI.getType(Reg: BigVec);
4008 LLT SubVecTy = MRI.getType(Reg: SubVec);
4009
4010 if (DstTy == CastTy)
4011 return Legalized;
4012
4013 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
4014 return UnableToLegalize;
4015
4016 ElementCount DstTyEC = DstTy.getElementCount();
4017 ElementCount BigVecTyEC = BigVecTy.getElementCount();
4018 ElementCount SubVecTyEC = SubVecTy.getElementCount();
4019 auto DstTyMinElts = DstTyEC.getKnownMinValue();
4020 auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
4021 auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();
4022
4023 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
4024 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
4025 if (CastEltSize < DstEltSize)
4026 return UnableToLegalize;
4027
4028 auto AdjustAmt = CastEltSize / DstEltSize;
4029 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
4030 BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
4031 return UnableToLegalize;
4032
4033 Idx /= AdjustAmt;
4034 BigVecTy = LLT::vector(EC: BigVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
4035 SubVecTy = LLT::vector(EC: SubVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
4036 auto CastBigVec = MIRBuilder.buildBitcast(Dst: BigVecTy, Src: BigVec);
4037 auto CastSubVec = MIRBuilder.buildBitcast(Dst: SubVecTy, Src: SubVec);
4038 auto PromotedIS =
4039 MIRBuilder.buildInsertSubvector(Res: CastTy, Src0: CastBigVec, Src1: CastSubVec, Index: Idx);
4040 MIRBuilder.buildBitcast(Dst, Src: PromotedIS);
4041
4042 ES->eraseFromParent();
4043 return Legalized;
4044}
4045
4046LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
4047 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
4048 Register DstReg = LoadMI.getDstReg();
4049 Register PtrReg = LoadMI.getPointerReg();
4050 LLT DstTy = MRI.getType(Reg: DstReg);
4051 MachineMemOperand &MMO = LoadMI.getMMO();
4052 LLT MemTy = MMO.getMemoryType();
4053 MachineFunction &MF = MIRBuilder.getMF();
4054
4055 unsigned MemSizeInBits = MemTy.getSizeInBits();
4056 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
4057
4058 if (MemSizeInBits != MemStoreSizeInBits) {
4059 if (MemTy.isVector())
4060 return UnableToLegalize;
4061
4062 // Promote to a byte-sized load if not loading an integral number of
4063 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
4064 LLT WideMemTy = LLT::scalar(SizeInBits: MemStoreSizeInBits);
4065 MachineMemOperand *NewMMO =
4066 MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideMemTy);
4067
4068 Register LoadReg = DstReg;
4069 LLT LoadTy = DstTy;
4070
4071    // If this wasn't already an extending load, we need to widen the result
4072    // register to avoid creating a load with a result narrower than the memory.
4073 if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
4074 LoadTy = WideMemTy;
4075 LoadReg = MRI.createGenericVirtualRegister(Ty: WideMemTy);
4076 }
4077
4078 if (isa<GSExtLoad>(Val: LoadMI)) {
4079 auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
4080 MIRBuilder.buildSExtInReg(Res: LoadReg, Op: NewLoad, ImmOp: MemSizeInBits);
4081 } else if (isa<GZExtLoad>(Val: LoadMI) || WideMemTy == LoadTy) {
4082 auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
4083 // The extra bits are guaranteed to be zero, since we stored them that
4084 // way. A zext load from Wide thus automatically gives zext from MemVT.
4085 MIRBuilder.buildAssertZExt(Res: LoadReg, Op: NewLoad, Size: MemSizeInBits);
4086 } else {
4087 MIRBuilder.buildLoad(Res: LoadReg, Addr: PtrReg, MMO&: *NewMMO);
4088 }
4089
4090 if (DstTy != LoadTy)
4091 MIRBuilder.buildTrunc(Res: DstReg, Op: LoadReg);
4092
4093 LoadMI.eraseFromParent();
4094 return Legalized;
4095 }
4096
4097 // Big endian lowering not implemented.
4098 if (MIRBuilder.getDataLayout().isBigEndian())
4099 return UnableToLegalize;
4100
4101 // This load needs splitting into power of 2 sized loads.
4102 //
4103  // Our strategy here is to generate extending loads for the smaller
4104  // pieces at the next power-of-2 result type, combine the two loaded
4105  // values, and then truncate back down to the non-pow-2 type.
4107 // E.g. v1 = i24 load =>
4108 // v2 = i32 zextload (2 byte)
4109 // v3 = i32 load (1 byte)
4110 // v4 = i32 shl v3, 16
4111 // v5 = i32 or v4, v2
4112 // v1 = i24 trunc v5
4113 // By doing this we generate the correct truncate which should get
4114 // combined away as an artifact with a matching extend.
4115
4116 uint64_t LargeSplitSize, SmallSplitSize;
4117
4118 if (!isPowerOf2_32(Value: MemSizeInBits)) {
4119 // This load needs splitting into power of 2 sized loads.
4120 LargeSplitSize = llvm::bit_floor(Value: MemSizeInBits);
4121 SmallSplitSize = MemSizeInBits - LargeSplitSize;
4122 } else {
4123 // This is already a power of 2, but we still need to split this in half.
4124 //
4125 // Assume we're being asked to decompose an unaligned load.
4126 // TODO: If this requires multiple splits, handle them all at once.
4127 auto &Ctx = MF.getFunction().getContext();
4128 if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
4129 return UnableToLegalize;
4130
4131 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
4132 }
4133
4134 if (MemTy.isVector()) {
4135 // TODO: Handle vector extloads
4136 if (MemTy != DstTy)
4137 return UnableToLegalize;
4138
4139 Align Alignment = LoadMI.getAlign();
4140 // Given an alignment larger than the size of the memory, we can increase
4141 // the size of the load without needing to scalarize it.
4142 if (Alignment.value() * 8 > MemSizeInBits &&
4143 isPowerOf2_64(Value: DstTy.getScalarSizeInBits())) {
4144 LLT MoreTy = LLT::fixed_vector(NumElements: NextPowerOf2(A: DstTy.getNumElements()),
4145 ScalarTy: DstTy.getElementType());
4146 MachineMemOperand *NewMMO = MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: MoreTy);
4147 auto NewLoad = MIRBuilder.buildLoad(Res: MoreTy, Addr: PtrReg, MMO&: *NewMMO);
4148 MIRBuilder.buildDeleteTrailingVectorElements(Res: LoadMI.getReg(Idx: 0),
4149 Op0: NewLoad.getReg(Idx: 0));
4150 LoadMI.eraseFromParent();
4151 return Legalized;
4152 }
4153
4154 // TODO: We can do better than scalarizing the vector and at least split it
4155 // in half.
4156 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx: 0, NarrowTy: DstTy.getElementType());
4157 }
4158
4159 MachineMemOperand *LargeMMO =
4160 MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
4161 MachineMemOperand *SmallMMO =
4162 MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
4163
4164 LLT PtrTy = MRI.getType(Reg: PtrReg);
4165 unsigned AnyExtSize = PowerOf2Ceil(A: DstTy.getSizeInBits());
4166 LLT AnyExtTy = LLT::scalar(SizeInBits: AnyExtSize);
4167 auto LargeLoad = MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_ZEXTLOAD, Res: AnyExtTy,
4168 Addr: PtrReg, MMO&: *LargeMMO);
4169
4170 auto OffsetCst = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()),
4171 Val: LargeSplitSize / 8);
4172 Register PtrAddReg = MRI.createGenericVirtualRegister(Ty: PtrTy);
4173 auto SmallPtr = MIRBuilder.buildPtrAdd(Res: PtrAddReg, Op0: PtrReg, Op1: OffsetCst);
4174 auto SmallLoad = MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: AnyExtTy,
4175 Addr: SmallPtr, MMO&: *SmallMMO);
4176
4177 auto ShiftAmt = MIRBuilder.buildConstant(Res: AnyExtTy, Val: LargeSplitSize);
4178 auto Shift = MIRBuilder.buildShl(Dst: AnyExtTy, Src0: SmallLoad, Src1: ShiftAmt);
4179
4180 if (AnyExtTy == DstTy)
4181 MIRBuilder.buildOr(Dst: DstReg, Src0: Shift, Src1: LargeLoad);
4182 else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
4183 auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
4184 MIRBuilder.buildTrunc(Res: DstReg, Op: {Or});
4185 } else {
4186 assert(DstTy.isPointer() && "expected pointer");
4187 auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
4188
4189    // FIXME: We currently consider this to be illegal for non-integral address
4190    // spaces, but we still need a way to reinterpret the bits.
4191 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
4192 }
4193
4194 LoadMI.eraseFromParent();
4195 return Legalized;
4196}
4197
4198LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
4199 // Lower a non-power of 2 store into multiple pow-2 stores.
4200 // E.g. split an i24 store into an i16 store + i8 store.
4201 // We do this by first extending the stored value to the next largest power
4202 // of 2 type, and then using truncating stores to store the components.
4203  // As with G_LOAD, this generates an extend that can be artifact-combined
4204  // away instead of leaving behind extracts.
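  // E.g. an s24 store (a sketch mirroring the strategy above):
  //   %v32:_(s32) = G_ANYEXT %v24
  //   G_STORE %v32, %ptr            ; 2-byte truncating store (low bits)
  //   %hi:_(s32) = G_LSHR %v32, 16
  //   G_STORE %hi, %ptr + 2         ; 1-byte truncating store (high byte)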
4205 Register SrcReg = StoreMI.getValueReg();
4206 Register PtrReg = StoreMI.getPointerReg();
4207 LLT SrcTy = MRI.getType(Reg: SrcReg);
4208 MachineFunction &MF = MIRBuilder.getMF();
4209 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4210 LLT MemTy = MMO.getMemoryType();
4211
4212 unsigned StoreWidth = MemTy.getSizeInBits();
4213 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
4214
4215 if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
4216 // Promote to a byte-sized store with upper bits zero if not
4217 // storing an integral number of bytes. For example, promote
4218 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
4219 LLT WideTy = LLT::scalar(SizeInBits: StoreSizeInBits);
4220
4221 if (StoreSizeInBits > SrcTy.getSizeInBits()) {
4222 // Avoid creating a store with a narrower source than result.
4223 SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
4224 SrcTy = WideTy;
4225 }
4226
4227 auto ZextInReg = MIRBuilder.buildZExtInReg(Res: SrcTy, Op: SrcReg, ImmOp: StoreWidth);
4228
4229 MachineMemOperand *NewMMO =
4230 MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideTy);
4231 MIRBuilder.buildStore(Val: ZextInReg, Addr: PtrReg, MMO&: *NewMMO);
4232 StoreMI.eraseFromParent();
4233 return Legalized;
4234 }
4235
4236 if (MemTy.isVector()) {
4237 if (MemTy != SrcTy)
4238 return scalarizeVectorBooleanStore(MI&: StoreMI);
4239
4240 // TODO: We can do better than scalarizing the vector and at least split it
4241 // in half.
4242 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy: SrcTy.getElementType());
4243 }
4244
4245 unsigned MemSizeInBits = MemTy.getSizeInBits();
4246 uint64_t LargeSplitSize, SmallSplitSize;
4247
4248 if (!isPowerOf2_32(Value: MemSizeInBits)) {
4249 LargeSplitSize = llvm::bit_floor<uint64_t>(Value: MemTy.getSizeInBits());
4250 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
4251 } else {
4252 auto &Ctx = MF.getFunction().getContext();
4253 if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
4254 return UnableToLegalize; // Don't know what we're being asked to do.
4255
4256 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
4257 }
4258
4259 // Extend to the next pow-2. If this store was itself the result of lowering,
4260 // e.g. an s56 store being broken into s32 + s24, we might have a stored type
4261 // that's wider than the stored size.
4262 unsigned AnyExtSize = PowerOf2Ceil(A: MemTy.getSizeInBits());
4263 const LLT NewSrcTy = LLT::scalar(SizeInBits: AnyExtSize);
4264
4265 if (SrcTy.isPointer()) {
4266 const LLT IntPtrTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
4267 SrcReg = MIRBuilder.buildPtrToInt(Dst: IntPtrTy, Src: SrcReg).getReg(Idx: 0);
4268 }
4269
4270 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(Res: NewSrcTy, Op: SrcReg);
4271
4272 // Obtain the smaller value by shifting away the larger value.
4273 auto ShiftAmt = MIRBuilder.buildConstant(Res: NewSrcTy, Val: LargeSplitSize);
4274 auto SmallVal = MIRBuilder.buildLShr(Dst: NewSrcTy, Src0: ExtVal, Src1: ShiftAmt);
4275
4276 // Generate the PtrAdd and truncating stores.
4277 LLT PtrTy = MRI.getType(Reg: PtrReg);
4278 auto OffsetCst = MIRBuilder.buildConstant(
4279 Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: LargeSplitSize / 8);
4280 auto SmallPtr =
4281 MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: PtrReg, Op1: OffsetCst);
4282
4283 MachineMemOperand *LargeMMO =
4284 MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
4285 MachineMemOperand *SmallMMO =
4286 MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
4287 MIRBuilder.buildStore(Val: ExtVal, Addr: PtrReg, MMO&: *LargeMMO);
4288 MIRBuilder.buildStore(Val: SmallVal, Addr: SmallPtr, MMO&: *SmallMMO);
4289 StoreMI.eraseFromParent();
4290 return Legalized;
4291}
4292
4293LegalizerHelper::LegalizeResult
4294LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
4295 Register SrcReg = StoreMI.getValueReg();
4296 Register PtrReg = StoreMI.getPointerReg();
4297 LLT SrcTy = MRI.getType(Reg: SrcReg);
4298 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4299 LLT MemTy = MMO.getMemoryType();
4300 LLT MemScalarTy = MemTy.getElementType();
4301 MachineFunction &MF = MIRBuilder.getMF();
4302
4303 assert(SrcTy.isVector() && "Expect a vector store type");
4304
4305 if (!MemScalarTy.isByteSized()) {
4306 // We need to build an integer scalar of the vector bit pattern.
4307 // It's not legal for us to add padding when storing a vector.
4308 unsigned NumBits = MemTy.getSizeInBits();
4309 LLT IntTy = LLT::scalar(SizeInBits: NumBits);
4310 auto CurrVal = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
4311 LLT IdxTy = TLI.getVectorIdxLLT(DL: MF.getDataLayout());
4312
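    // Pack the elements into the integer, lane by lane. E.g. for a <4 x s1>
    // store this builds an s4 value (a sketch, little-endian lane order):
    //   int = (e3 << 3) | (e2 << 2) | (e1 << 1) | (e0 << 0)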
4313 for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
4314 auto Elt = MIRBuilder.buildExtractVectorElement(
4315 Res: SrcTy.getElementType(), Val: SrcReg, Idx: MIRBuilder.buildConstant(Res: IdxTy, Val: I));
4316 auto Trunc = MIRBuilder.buildTrunc(Res: MemScalarTy, Op: Elt);
4317 auto ZExt = MIRBuilder.buildZExt(Res: IntTy, Op: Trunc);
4318 unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
4319 ? (MemTy.getNumElements() - 1) - I
4320 : I;
4321 auto ShiftAmt = MIRBuilder.buildConstant(
4322 Res: IntTy, Val: ShiftIntoIdx * MemScalarTy.getSizeInBits());
4323 auto Shifted = MIRBuilder.buildShl(Dst: IntTy, Src0: ZExt, Src1: ShiftAmt);
4324 CurrVal = MIRBuilder.buildOr(Dst: IntTy, Src0: CurrVal, Src1: Shifted);
4325 }
4326 auto PtrInfo = MMO.getPointerInfo();
4327 auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo, Ty: IntTy);
4328 MIRBuilder.buildStore(Val: CurrVal, Addr: PtrReg, MMO&: *NewMMO);
4329 StoreMI.eraseFromParent();
4330 return Legalized;
4331 }
4332
4333 // TODO: implement simple scalarization.
4334 return UnableToLegalize;
4335}
4336
4337LegalizerHelper::LegalizeResult
4338LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
4339 switch (MI.getOpcode()) {
4340 case TargetOpcode::G_LOAD: {
4341 if (TypeIdx != 0)
4342 return UnableToLegalize;
4343 MachineMemOperand &MMO = **MI.memoperands_begin();
4344
4345 // Not sure how to interpret a bitcast of an extending load.
4346 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4347 return UnableToLegalize;
4348
4349 Observer.changingInstr(MI);
4350 bitcastDst(MI, CastTy, OpIdx: 0);
4351 MMO.setType(CastTy);
4352 // The range metadata is no longer valid when reinterpreted as a different
4353 // type.
4354 MMO.clearRanges();
4355 Observer.changedInstr(MI);
4356 return Legalized;
4357 }
4358 case TargetOpcode::G_STORE: {
4359 if (TypeIdx != 0)
4360 return UnableToLegalize;
4361
4362 MachineMemOperand &MMO = **MI.memoperands_begin();
4363
4364 // Not sure how to interpret a bitcast of a truncating store.
4365 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4366 return UnableToLegalize;
4367
4368 Observer.changingInstr(MI);
4369 bitcastSrc(MI, CastTy, OpIdx: 0);
4370 MMO.setType(CastTy);
4371 Observer.changedInstr(MI);
4372 return Legalized;
4373 }
4374 case TargetOpcode::G_SELECT: {
4375 if (TypeIdx != 0)
4376 return UnableToLegalize;
4377
4378 if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector()) {
4379 LLVM_DEBUG(
4380 dbgs() << "bitcast action not implemented for vector select\n");
4381 return UnableToLegalize;
4382 }
4383
4384 Observer.changingInstr(MI);
4385 bitcastSrc(MI, CastTy, OpIdx: 2);
4386 bitcastSrc(MI, CastTy, OpIdx: 3);
4387 bitcastDst(MI, CastTy, OpIdx: 0);
4388 Observer.changedInstr(MI);
4389 return Legalized;
4390 }
4391 case TargetOpcode::G_AND:
4392 case TargetOpcode::G_OR:
4393 case TargetOpcode::G_XOR: {
4394 Observer.changingInstr(MI);
4395 bitcastSrc(MI, CastTy, OpIdx: 1);
4396 bitcastSrc(MI, CastTy, OpIdx: 2);
4397 bitcastDst(MI, CastTy, OpIdx: 0);
4398 Observer.changedInstr(MI);
4399 return Legalized;
4400 }
4401 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4402 return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
4403 case TargetOpcode::G_INSERT_VECTOR_ELT:
4404 return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
4405 case TargetOpcode::G_CONCAT_VECTORS:
4406 return bitcastConcatVector(MI, TypeIdx, CastTy);
4407 case TargetOpcode::G_SHUFFLE_VECTOR:
4408 return bitcastShuffleVector(MI, TypeIdx, CastTy);
4409 case TargetOpcode::G_EXTRACT_SUBVECTOR:
4410 return bitcastExtractSubvector(MI, TypeIdx, CastTy);
4411 case TargetOpcode::G_INSERT_SUBVECTOR:
4412 return bitcastInsertSubvector(MI, TypeIdx, CastTy);
4413 default:
4414 return UnableToLegalize;
4415 }
4416}
4417
4418// Legalize an instruction by changing the opcode in place.
4419void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
4420 Observer.changingInstr(MI);
4421 MI.setDesc(MIRBuilder.getTII().get(Opcode: NewOpcode));
4422 Observer.changedInstr(MI);
4423}
4424
4425LegalizerHelper::LegalizeResult
4426LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
4427 using namespace TargetOpcode;
4428
4429 switch(MI.getOpcode()) {
4430 default:
4431 return UnableToLegalize;
4432 case TargetOpcode::G_FCONSTANT:
4433 return lowerFConstant(MI);
4434 case TargetOpcode::G_BITCAST:
4435 return lowerBitcast(MI);
4436 case TargetOpcode::G_SREM:
4437 case TargetOpcode::G_UREM: {
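    // Lower the remainder in terms of division: rem = x - (x / y) * y.
    // e.g. (illustrative sketch) for G_UREM:
    //   %q:_(sN)   = G_UDIV %x, %y
    //   %p:_(sN)   = G_MUL %q, %y
    //   %rem:_(sN) = G_SUB %x, %p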
4438 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4439 auto Quot =
4440 MIRBuilder.buildInstr(Opc: MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, DstOps: {Ty},
4441 SrcOps: {MI.getOperand(i: 1), MI.getOperand(i: 2)});
4442
4443 auto Prod = MIRBuilder.buildMul(Dst: Ty, Src0: Quot, Src1: MI.getOperand(i: 2));
4444 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: Prod);
4445 MI.eraseFromParent();
4446 return Legalized;
4447 }
4448 case TargetOpcode::G_SADDO:
4449 case TargetOpcode::G_SSUBO:
4450 return lowerSADDO_SSUBO(MI);
4451 case TargetOpcode::G_UMULH:
4452 case TargetOpcode::G_SMULH:
4453 return lowerSMULH_UMULH(MI);
4454 case TargetOpcode::G_SMULO:
4455 case TargetOpcode::G_UMULO: {
4456 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
4457 // result.
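    // e.g. (illustrative sketch, s32 signed case):
    //   %res:_(s32), %ovf:_(s1) = G_SMULO %lhs, %rhs
    // becomes
    //   %res:_(s32)  = G_MUL %lhs, %rhs
    //   %hi:_(s32)   = G_SMULH %lhs, %rhs
    //   %sign:_(s32) = G_ASHR %res, 31
    //   %ovf:_(s1)   = G_ICMP intpred(ne), %hi, %sign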
4458 auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
4459 LLT Ty = MRI.getType(Reg: Res);
4460
4461 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
4462 ? TargetOpcode::G_SMULH
4463 : TargetOpcode::G_UMULH;
4464
4465 Observer.changingInstr(MI);
4466 const auto &TII = MIRBuilder.getTII();
4467 MI.setDesc(TII.get(Opcode: TargetOpcode::G_MUL));
4468 MI.removeOperand(OpNo: 1);
4469 Observer.changedInstr(MI);
4470
4471 auto HiPart = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {Ty}, SrcOps: {LHS, RHS});
4472 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
4473
4474 // Move insert point forward so we can use the Res register if needed.
4475 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
4476
4477 // For *signed* multiply, overflow is detected by checking:
4478 // (hi != (lo >> bitwidth-1))
4479 if (Opcode == TargetOpcode::G_SMULH) {
4480 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: Ty.getSizeInBits() - 1);
4481 auto Shifted = MIRBuilder.buildAShr(Dst: Ty, Src0: Res, Src1: ShiftAmt);
4482 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Shifted);
4483 } else {
4484 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Zero);
4485 }
4486 return Legalized;
4487 }
4488 case TargetOpcode::G_FNEG: {
4489 auto [Res, SubByReg] = MI.getFirst2Regs();
4490 LLT Ty = MRI.getType(Reg: Res);
4491
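    // Flip only the sign bit by XORing with the sign mask, e.g. for s32 the
    // mask is 0x80000000 (illustrative).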
4492 auto SignMask = MIRBuilder.buildConstant(
4493 Res: Ty, Val: APInt::getSignMask(BitWidth: Ty.getScalarSizeInBits()));
4494 MIRBuilder.buildXor(Dst: Res, Src0: SubByReg, Src1: SignMask);
4495 MI.eraseFromParent();
4496 return Legalized;
4497 }
4498 case TargetOpcode::G_FSUB:
4499 case TargetOpcode::G_STRICT_FSUB: {
4500 auto [Res, LHS, RHS] = MI.getFirst3Regs();
4501 LLT Ty = MRI.getType(Reg: Res);
4502
4503 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
4504 auto Neg = MIRBuilder.buildFNeg(Dst: Ty, Src0: RHS);
4505
4506 if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
4507 MIRBuilder.buildStrictFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
4508 else
4509 MIRBuilder.buildFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
4510
4511 MI.eraseFromParent();
4512 return Legalized;
4513 }
4514 case TargetOpcode::G_FMAD:
4515 return lowerFMad(MI);
4516 case TargetOpcode::G_FFLOOR:
4517 return lowerFFloor(MI);
4518 case TargetOpcode::G_LROUND:
4519 case TargetOpcode::G_LLROUND: {
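    // lround/llround of x is lowered here as fptosi(round(x)).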
4520 Register DstReg = MI.getOperand(i: 0).getReg();
4521 Register SrcReg = MI.getOperand(i: 1).getReg();
4522 LLT SrcTy = MRI.getType(Reg: SrcReg);
4523 auto Round = MIRBuilder.buildInstr(Opc: TargetOpcode::G_INTRINSIC_ROUND, DstOps: {SrcTy},
4524 SrcOps: {SrcReg});
4525 MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
4526 MI.eraseFromParent();
4527 return Legalized;
4528 }
4529 case TargetOpcode::G_INTRINSIC_ROUND:
4530 return lowerIntrinsicRound(MI);
4531 case TargetOpcode::G_FRINT: {
4532 // Since round even is the assumed rounding mode for unconstrained FP
4533 // operations, rint and roundeven are the same operation.
4534 changeOpcode(MI, NewOpcode: TargetOpcode::G_INTRINSIC_ROUNDEVEN);
4535 return Legalized;
4536 }
4537 case TargetOpcode::G_INTRINSIC_LRINT:
4538 case TargetOpcode::G_INTRINSIC_LLRINT: {
4539 Register DstReg = MI.getOperand(i: 0).getReg();
4540 Register SrcReg = MI.getOperand(i: 1).getReg();
4541 LLT SrcTy = MRI.getType(Reg: SrcReg);
4542 auto Round =
4543 MIRBuilder.buildInstr(Opc: TargetOpcode::G_FRINT, DstOps: {SrcTy}, SrcOps: {SrcReg});
4544 MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
4545 MI.eraseFromParent();
4546 return Legalized;
4547 }
4548 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
4549 auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
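    // Lower to a plain G_ATOMIC_CMPXCHG plus an equality check of the loaded
    // value against CmpVal, e.g. (illustrative sketch):
    //   %old:_(s32) = G_ATOMIC_CMPXCHG %addr, %cmp, %new
    //   %succ:_(s1) = G_ICMP intpred(eq), %old, %cmp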
4550 Register NewOldValRes = MRI.cloneVirtualRegister(VReg: OldValRes);
4551 MIRBuilder.buildAtomicCmpXchg(OldValRes: NewOldValRes, Addr, CmpVal, NewVal,
4552 MMO&: **MI.memoperands_begin());
4553 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: SuccessRes, Op0: NewOldValRes, Op1: CmpVal);
4554 MIRBuilder.buildCopy(Res: OldValRes, Op: NewOldValRes);
4555 MI.eraseFromParent();
4556 return Legalized;
4557 }
4558 case TargetOpcode::G_LOAD:
4559 case TargetOpcode::G_SEXTLOAD:
4560 case TargetOpcode::G_ZEXTLOAD:
4561 return lowerLoad(LoadMI&: cast<GAnyLoad>(Val&: MI));
4562 case TargetOpcode::G_STORE:
4563 return lowerStore(StoreMI&: cast<GStore>(Val&: MI));
4564 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
4565 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
4566 case TargetOpcode::G_CTLZ:
4567 case TargetOpcode::G_CTTZ:
4568 case TargetOpcode::G_CTPOP:
4569 return lowerBitCount(MI);
4570 case G_UADDO: {
4571 auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
4572
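    // Carry out iff the unsigned add wrapped, i.e. NewRes = LHS + RHS is
    // (unsigned) less than RHS.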
4573 Register NewRes = MRI.cloneVirtualRegister(VReg: Res);
4574
4575 MIRBuilder.buildAdd(Dst: NewRes, Src0: LHS, Src1: RHS);
4576 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CarryOut, Op0: NewRes, Op1: RHS);
4577
4578 MIRBuilder.buildCopy(Res, Op: NewRes);
4579
4580 MI.eraseFromParent();
4581 return Legalized;
4582 }
4583 case G_UADDE: {
4584 auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
4585 const LLT CondTy = MRI.getType(Reg: CarryOut);
4586 const LLT Ty = MRI.getType(Reg: Res);
4587
4588 Register NewRes = MRI.cloneVirtualRegister(VReg: Res);
4589
4590 // Initial add of the two operands.
4591 auto TmpRes = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);
4592
4593 // Initial check for carry.
4594 auto Carry = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CondTy, Op0: TmpRes, Op1: LHS);
4595
4596 // Add the sum and the carry.
4597 auto ZExtCarryIn = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
4598 MIRBuilder.buildAdd(Dst: NewRes, Src0: TmpRes, Src1: ZExtCarryIn);
4599
4600 // Second check for carry. We can only carry if the initial sum is all 1s
4601 // and the carry is set, resulting in a new sum of 0.
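    // e.g. (illustrative, s8): LHS = 0xFF, RHS = 0x00 gives TmpRes = 0xFF and
    // Carry = 0; adding CarryIn = 1 wraps NewRes to 0x00, so Carry2 = 1 and
    // CarryOut = 1.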
4602 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
4603 auto ResEqZero =
4604 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: NewRes, Op1: Zero);
4605 auto Carry2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: ResEqZero, Src1: CarryIn);
4606 MIRBuilder.buildOr(Dst: CarryOut, Src0: Carry, Src1: Carry2);
4607
4608 MIRBuilder.buildCopy(Res, Op: NewRes);
4609
4610 MI.eraseFromParent();
4611 return Legalized;
4612 }
4613 case G_USUBO: {
4614 auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
4615
4616 MIRBuilder.buildSub(Dst: Res, Src0: LHS, Src1: RHS);
4617 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: BorrowOut, Op0: LHS, Op1: RHS);
4618
4619 MI.eraseFromParent();
4620 return Legalized;
4621 }
4622 case G_USUBE: {
4623 auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
4624 const LLT CondTy = MRI.getType(Reg: BorrowOut);
4625 const LLT Ty = MRI.getType(Reg: Res);
4626
4627 // Initial subtract of the two operands.
4628 auto TmpRes = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS);
4629
4630 // Initial check for borrow.
4631 auto Borrow = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: CondTy, Op0: TmpRes, Op1: LHS);
4632
4633 // Subtract the borrow from the first subtract.
4634 auto ZExtBorrowIn = MIRBuilder.buildZExt(Res: Ty, Op: BorrowIn);
4635 MIRBuilder.buildSub(Dst: Res, Src0: TmpRes, Src1: ZExtBorrowIn);
4636
4637 // Second check for borrow. We can only borrow if the initial difference is
4638 // 0 and the borrow is set, resulting in a new difference of all 1s.
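    // e.g. (illustrative, s8): LHS == RHS gives TmpRes = 0x00 and Borrow = 0;
    // subtracting BorrowIn = 1 wraps Res to 0xFF, so Borrow2 = 1 and
    // BorrowOut = 1.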
4639 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
4640 auto TmpResEqZero =
4641 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: TmpRes, Op1: Zero);
4642 auto Borrow2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: TmpResEqZero, Src1: BorrowIn);
4643 MIRBuilder.buildOr(Dst: BorrowOut, Src0: Borrow, Src1: Borrow2);
4644
4645 MI.eraseFromParent();
4646 return Legalized;
4647 }
4648 case G_UITOFP:
4649 return lowerUITOFP(MI);
4650 case G_SITOFP:
4651 return lowerSITOFP(MI);
4652 case G_FPTOUI:
4653 return lowerFPTOUI(MI);
4654 case G_FPTOSI:
4655 return lowerFPTOSI(MI);
4656 case G_FPTOUI_SAT:
4657 case G_FPTOSI_SAT:
4658 return lowerFPTOINT_SAT(MI);
4659 case G_FPTRUNC:
4660 return lowerFPTRUNC(MI);
4661 case G_FPOWI:
4662 return lowerFPOWI(MI);
4663 case G_SMIN:
4664 case G_SMAX:
4665 case G_UMIN:
4666 case G_UMAX:
4667 return lowerMinMax(MI);
4668 case G_SCMP:
4669 case G_UCMP:
4670 return lowerThreewayCompare(MI);
4671 case G_FCOPYSIGN:
4672 return lowerFCopySign(MI);
4673 case G_FMINNUM:
4674 case G_FMAXNUM:
4675 case G_FMINIMUMNUM:
4676 case G_FMAXIMUMNUM:
4677 return lowerFMinNumMaxNum(MI);
4678 case G_MERGE_VALUES:
4679 return lowerMergeValues(MI);
4680 case G_UNMERGE_VALUES:
4681 return lowerUnmergeValues(MI);
4682 case TargetOpcode::G_SEXT_INREG: {
4683 assert(MI.getOperand(2).isImm() && "Expected immediate");
4684 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
4685
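    // Lower to a left shift followed by an arithmetic right shift, e.g.
    // (illustrative sketch) G_SEXT_INREG %src, 8 on s32:
    //   %shl:_(s32) = G_SHL %src, 24
    //   %dst:_(s32) = G_ASHR %shl, 24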
4686 auto [DstReg, SrcReg] = MI.getFirst2Regs();
4687 LLT DstTy = MRI.getType(Reg: DstReg);
4688 Register TmpRes = MRI.createGenericVirtualRegister(Ty: DstTy);
4689
4690 auto MIBSz = MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - SizeInBits);
4691 MIRBuilder.buildShl(Dst: TmpRes, Src0: SrcReg, Src1: MIBSz->getOperand(i: 0));
4692 MIRBuilder.buildAShr(Dst: DstReg, Src0: TmpRes, Src1: MIBSz->getOperand(i: 0));
4693 MI.eraseFromParent();
4694 return Legalized;
4695 }
4696 case G_EXTRACT_VECTOR_ELT:
4697 case G_INSERT_VECTOR_ELT:
4698 return lowerExtractInsertVectorElt(MI);
4699 case G_SHUFFLE_VECTOR:
4700 return lowerShuffleVector(MI);
4701 case G_VECTOR_COMPRESS:
4702 return lowerVECTOR_COMPRESS(MI);
4703 case G_DYN_STACKALLOC:
4704 return lowerDynStackAlloc(MI);
4705 case G_STACKSAVE:
4706 return lowerStackSave(MI);
4707 case G_STACKRESTORE:
4708 return lowerStackRestore(MI);
4709 case G_EXTRACT:
4710 return lowerExtract(MI);
4711 case G_INSERT:
4712 return lowerInsert(MI);
4713 case G_BSWAP:
4714 return lowerBswap(MI);
4715 case G_BITREVERSE:
4716 return lowerBitreverse(MI);
4717 case G_READ_REGISTER:
4718 case G_WRITE_REGISTER:
4719 return lowerReadWriteRegister(MI);
4720 case G_UADDSAT:
4721 case G_USUBSAT: {
4722    // Try to make a reasonable guess about which lowering strategy to use. The
4723    // target can override this by requesting custom lowering and calling the
4724    // implementation functions directly.
4725 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4726 if (LI.isLegalOrCustom(Query: {G_UMIN, Ty}))
4727 return lowerAddSubSatToMinMax(MI);
4728 return lowerAddSubSatToAddoSubo(MI);
4729 }
4730 case G_SADDSAT:
4731 case G_SSUBSAT: {
4732 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4733
4734 // FIXME: It would probably make more sense to see if G_SADDO is preferred,
4735 // since it's a shorter expansion. However, we would need to figure out the
4736 // preferred boolean type for the carry out for the query.
4737 if (LI.isLegalOrCustom(Query: {G_SMIN, Ty}) && LI.isLegalOrCustom(Query: {G_SMAX, Ty}))
4738 return lowerAddSubSatToMinMax(MI);
4739 return lowerAddSubSatToAddoSubo(MI);
4740 }
4741 case G_SSHLSAT:
4742 case G_USHLSAT:
4743 return lowerShlSat(MI);
4744 case G_ABS:
4745 return lowerAbsToAddXor(MI);
4746 case G_FABS:
4747 return lowerFAbs(MI);
4748 case G_SELECT:
4749 return lowerSelect(MI);
4750 case G_IS_FPCLASS:
4751 return lowerISFPCLASS(MI);
4752 case G_SDIVREM:
4753 case G_UDIVREM:
4754 return lowerDIVREM(MI);
4755 case G_FSHL:
4756 case G_FSHR:
4757 return lowerFunnelShift(MI);
4758 case G_ROTL:
4759 case G_ROTR:
4760 return lowerRotate(MI);
4761 case G_MEMSET:
4762 case G_MEMCPY:
4763 case G_MEMMOVE:
4764 return lowerMemCpyFamily(MI);
4765 case G_MEMCPY_INLINE:
4766 return lowerMemcpyInline(MI);
4767 case G_ZEXT:
4768 case G_SEXT:
4769 case G_ANYEXT:
4770 return lowerEXT(MI);
4771 case G_TRUNC:
4772 return lowerTRUNC(MI);
4773 GISEL_VECREDUCE_CASES_NONSEQ
4774 return lowerVectorReduction(MI);
4775 case G_VAARG:
4776 return lowerVAArg(MI);
4777 }
4778}
4779
4780Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4781 Align MinAlign) const {
4782 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4783 // datalayout for the preferred alignment. Also there should be a target hook
4784 // for this to allow targets to reduce the alignment and ignore the
4785 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4786 // the type.
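  // e.g. a 12-byte LLT is given Align(16), unless MinAlign is larger.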
4787 return std::max(a: Align(PowerOf2Ceil(A: Ty.getSizeInBytes())), b: MinAlign);
4788}
4789
4790MachineInstrBuilder
4791LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4792 MachinePointerInfo &PtrInfo) {
4793 MachineFunction &MF = MIRBuilder.getMF();
4794 const DataLayout &DL = MIRBuilder.getDataLayout();
4795 int FrameIdx = MF.getFrameInfo().CreateStackObject(Size: Bytes, Alignment, isSpillSlot: false);
4796
4797 unsigned AddrSpace = DL.getAllocaAddrSpace();
4798 LLT FramePtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
4799
4800 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI: FrameIdx);
4801 return MIRBuilder.buildFrameIndex(Res: FramePtrTy, Idx: FrameIdx);
4802}
4803
4804MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4805 const SrcOp &Val) {
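  // Reinterpret Val's bits as Res's type by spilling to a stack temporary
  // aligned for both types and reloading it.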
4806 LLT SrcTy = Val.getLLTTy(MRI);
4807 Align StackTypeAlign =
4808 std::max(a: getStackTemporaryAlignment(Ty: SrcTy),
4809 b: getStackTemporaryAlignment(Ty: Res.getLLTTy(MRI)));
4810 MachinePointerInfo PtrInfo;
4811 auto StackTemp =
4812 createStackTemporary(Bytes: SrcTy.getSizeInBytes(), Alignment: StackTypeAlign, PtrInfo);
4813
4814 MIRBuilder.buildStore(Val, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4815 return MIRBuilder.buildLoad(Res, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4816}
4817
4818static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4819 LLT VecTy) {
4820 LLT IdxTy = B.getMRI()->getType(Reg: IdxReg);
4821 unsigned NElts = VecTy.getNumElements();
4822
4823 int64_t IdxVal;
4824 if (mi_match(R: IdxReg, MRI: *B.getMRI(), P: m_ICst(Cst&: IdxVal))) {
4825 if (IdxVal < VecTy.getNumElements())
4826 return IdxReg;
4827 // If a constant index would be out of bounds, clamp it as well.
4828 }
4829
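  // e.g. (illustrative): 4 elements -> Idx & 3; 3 elements -> umin(Idx, 2).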
4830 if (isPowerOf2_32(Value: NElts)) {
4831 APInt Imm = APInt::getLowBitsSet(numBits: IdxTy.getSizeInBits(), loBitsSet: Log2_32(Value: NElts));
4832 return B.buildAnd(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: Imm)).getReg(Idx: 0);
4833 }
4834
4835 return B.buildUMin(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: NElts - 1))
4836 .getReg(Idx: 0);
4837}
4838
4839Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
4840 Register Index) {
4841 LLT EltTy = VecTy.getElementType();
4842
4843 // Calculate the element offset and add it to the pointer.
4844 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
4845 assert(EltSize * 8 == EltTy.getSizeInBits() &&
4846 "Converting bits to bytes lost precision");
4847
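  // e.g. (illustrative) for <4 x s32>: EltPtr = VecPtr + clamp(Index) * 4.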
4848 Index = clampVectorIndex(B&: MIRBuilder, IdxReg: Index, VecTy);
4849
4850 // Convert index to the correct size for the address space.
4851 const DataLayout &DL = MIRBuilder.getDataLayout();
4852 unsigned AS = MRI.getType(Reg: VecPtr).getAddressSpace();
4853 unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
4854 LLT IdxTy = MRI.getType(Reg: Index).changeElementSize(NewEltSize: IndexSizeInBits);
4855 if (IdxTy != MRI.getType(Reg: Index))
4856 Index = MIRBuilder.buildSExtOrTrunc(Res: IdxTy, Op: Index).getReg(Idx: 0);
4857
4858 auto Mul = MIRBuilder.buildMul(Dst: IdxTy, Src0: Index,
4859 Src1: MIRBuilder.buildConstant(Res: IdxTy, Val: EltSize));
4860
4861 LLT PtrTy = MRI.getType(Reg: VecPtr);
4862 return MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VecPtr, Op1: Mul).getReg(Idx: 0);
4863}
4864
4865#ifndef NDEBUG
4866/// Check that all vector operands have the same number of elements. All other
4867/// operands should be listed in \p NonVecOpIndices.
4868static bool hasSameNumEltsOnAllVectorOperands(
4869 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4870 std::initializer_list<unsigned> NonVecOpIndices) {
4871 if (MI.getNumMemOperands() != 0)
4872 return false;
4873
4874 LLT VecTy = MRI.getType(MI.getReg(0));
4875 if (!VecTy.isVector())
4876 return false;
4877 unsigned NumElts = VecTy.getNumElements();
4878
4879 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
4880 MachineOperand &Op = MI.getOperand(OpIdx);
4881 if (!Op.isReg()) {
4882 if (!is_contained(NonVecOpIndices, OpIdx))
4883 return false;
4884 continue;
4885 }
4886
4887 LLT Ty = MRI.getType(Op.getReg());
4888 if (!Ty.isVector()) {
4889 if (!is_contained(NonVecOpIndices, OpIdx))
4890 return false;
4891 continue;
4892 }
4893
4894 if (Ty.getNumElements() != NumElts)
4895 return false;
4896 }
4897
4898 return true;
4899}
4900#endif
4901
4902/// Fill \p DstOps with DstOps that, combined, cover the same number of
4903/// elements as \p Ty. Each DstOp is either a scalar when \p NumElts == 1, or
4904/// a vector with \p NumElts elements. When Ty.getNumElements() is not a
4905/// multiple of \p NumElts, the last (leftover) DstOp has fewer elements.
4906static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4907 unsigned NumElts) {
4908 LLT LeftoverTy;
4909 assert(Ty.isVector() && "Expected vector type");
4910 LLT EltTy = Ty.getElementType();
4911 LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElements: NumElts, ScalarTy: EltTy);
4912 int NumParts, NumLeftover;
4913 std::tie(args&: NumParts, args&: NumLeftover) =
4914 getNarrowTypeBreakDown(OrigTy: Ty, NarrowTy, LeftoverTy);
4915
4916 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4917 for (int i = 0; i < NumParts; ++i) {
4918 DstOps.push_back(Elt: NarrowTy);
4919 }
4920
4921 if (LeftoverTy.isValid()) {
4922 assert(NumLeftover == 1 && "expected exactly one leftover");
4923 DstOps.push_back(Elt: LeftoverTy);
4924 }
4925}
4926
4927/// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N
4928/// SrcOps made from \p Op, according to the operand's type.
4929static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4930 MachineOperand &Op) {
4931 for (unsigned i = 0; i < N; ++i) {
4932 if (Op.isReg())
4933 Ops.push_back(Elt: Op.getReg());
4934 else if (Op.isImm())
4935 Ops.push_back(Elt: Op.getImm());
4936 else if (Op.isPredicate())
4937 Ops.push_back(Elt: static_cast<CmpInst::Predicate>(Op.getPredicate()));
4938 else
4939 llvm_unreachable("Unsupported type");
4940 }
4941}
4942
4943// Handle splitting vector operations which need to have the same number of
4944// elements in each type index, but each type index may have a different element
4945// type.
4946//
4947// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4948// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4949// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4950//
4951// Also handles some irregular breakdown cases:
4952// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4953// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4954// s64 = G_SHL s64, s32
4955LegalizerHelper::LegalizeResult
4956LegalizerHelper::fewerElementsVectorMultiEltType(
4957 GenericMachineInstr &MI, unsigned NumElts,
4958 std::initializer_list<unsigned> NonVecOpIndices) {
4959 assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
4960 "Non-compatible opcode or not specified non-vector operands");
4961 unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();
4962
4963 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4964 unsigned NumDefs = MI.getNumDefs();
4965
4966  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
4967  // Build instructions with DstOps so the instruction found by CSE can be used
4968  // directly; when building with a vreg dest, CSE copies it into that vreg.
4969 SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
4970 // Output registers will be taken from created instructions.
4971 SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
4972 for (unsigned i = 0; i < NumDefs; ++i) {
4973 makeDstOps(DstOps&: OutputOpsPieces[i], Ty: MRI.getType(Reg: MI.getReg(Idx: i)), NumElts);
4974 }
4975
4976 // Split vector input operands into sub-vectors with NumElts elts + Leftover.
4977 // Operands listed in NonVecOpIndices will be used as is without splitting;
4978 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
4979 // scalar condition (op 1), immediate in sext_inreg (op 2).
4980 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
4981 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4982 ++UseIdx, ++UseNo) {
4983 if (is_contained(Set: NonVecOpIndices, Element: UseIdx)) {
4984 broadcastSrcOp(Ops&: InputOpsPieces[UseNo], N: OutputOpsPieces[0].size(),
4985 Op&: MI.getOperand(i: UseIdx));
4986 } else {
4987 SmallVector<Register, 8> SplitPieces;
4988 extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: SplitPieces, MIRBuilder,
4989 MRI);
4990 llvm::append_range(C&: InputOpsPieces[UseNo], R&: SplitPieces);
4991 }
4992 }
4993
4994 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4995
4996 // Take i-th piece of each input operand split and build sub-vector/scalar
4997 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
4998 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4999 SmallVector<DstOp, 2> Defs;
5000 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
5001 Defs.push_back(Elt: OutputOpsPieces[DstNo][i]);
5002
5003 SmallVector<SrcOp, 3> Uses;
5004 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
5005 Uses.push_back(Elt: InputOpsPieces[InputNo][i]);
5006
5007 auto I = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: Defs, SrcOps: Uses, Flags: MI.getFlags());
5008 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
5009 OutputRegs[DstNo].push_back(Elt: I.getReg(Idx: DstNo));
5010 }
5011
5012 // Merge small outputs into MI's output for each def operand.
5013 if (NumLeftovers) {
5014 for (unsigned i = 0; i < NumDefs; ++i)
5015 mergeMixedSubvectors(DstReg: MI.getReg(Idx: i), PartRegs: OutputRegs[i]);
5016 } else {
5017 for (unsigned i = 0; i < NumDefs; ++i)
5018 MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: i), Ops: OutputRegs[i]);
5019 }
5020
5021 MI.eraseFromParent();
5022 return Legalized;
5023}
5024
5025LegalizerHelper::LegalizeResult
5026LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
5027 unsigned NumElts) {
5028 unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();
5029
5030 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
5031 unsigned NumDefs = MI.getNumDefs();
5032
5033 SmallVector<DstOp, 8> OutputOpsPieces;
5034 SmallVector<Register, 8> OutputRegs;
5035 makeDstOps(DstOps&: OutputOpsPieces, Ty: MRI.getType(Reg: MI.getReg(Idx: 0)), NumElts);
5036
5037  // The instructions that split a register will be inserted in the basic
5038  // block where that register is defined (given by the next operand).
5039 SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
5040 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
5041 UseIdx += 2, ++UseNo) {
5042 MachineBasicBlock &OpMBB = *MI.getOperand(i: UseIdx + 1).getMBB();
5043 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
5044 extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: InputOpsPieces[UseNo],
5045 MIRBuilder, MRI);
5046 }
5047
5048 // Build PHIs with fewer elements.
5049 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
5050 MIRBuilder.setInsertPt(MBB&: *MI.getParent(), II: MI);
5051 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
5052 auto Phi = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI);
5053 Phi.addDef(
5054 RegNo: MRI.createGenericVirtualRegister(Ty: OutputOpsPieces[i].getLLTTy(MRI)));
5055 OutputRegs.push_back(Elt: Phi.getReg(Idx: 0));
5056
5057 for (unsigned j = 0; j < NumInputs / 2; ++j) {
5058 Phi.addUse(RegNo: InputOpsPieces[j][i]);
5059 Phi.add(MO: MI.getOperand(i: 1 + j * 2 + 1));
5060 }
5061 }
5062
5063 // Set the insert point after the existing PHIs
5064 MachineBasicBlock &MBB = *MI.getParent();
5065 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
5066
5067 // Merge small outputs into MI's def.
5068 if (NumLeftovers) {
5069 mergeMixedSubvectors(DstReg: MI.getReg(Idx: 0), PartRegs: OutputRegs);
5070 } else {
5071 MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: 0), Ops: OutputRegs);
5072 }
5073
5074 MI.eraseFromParent();
5075 return Legalized;
5076}
5077
5078LegalizerHelper::LegalizeResult
5079LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
5080 unsigned TypeIdx,
5081 LLT NarrowTy) {
5082 const int NumDst = MI.getNumOperands() - 1;
5083 const Register SrcReg = MI.getOperand(i: NumDst).getReg();
5084 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5085 LLT SrcTy = MRI.getType(Reg: SrcReg);
5086
5087 if (TypeIdx != 1 || NarrowTy == DstTy)
5088 return UnableToLegalize;
5089
5090  // Requires compatible types. Otherwise SrcReg should have been defined by a
5091  // merge-like instruction that would have been artifact-combined. Most likely
5092  // the instruction that defines SrcReg has to perform more/fewer elements
5093  // legalization compatible with NarrowTy.
5094 assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5095 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5096
5097 if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5098 (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
5099 return UnableToLegalize;
5100
5101  // This is most likely DstTy (smaller than register size) packed in SrcTy
5102  // (larger than register size); since the unmerge was not combined, it will
5103  // be lowered to bit-sequence extracts from a register. Unpack SrcTy into
5104  // NarrowTy (register size) pieces first, then each NarrowTy piece into DstTy.
5105
5106 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
5107 //
5108 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
5109 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
5110 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
5111 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: SrcReg);
5112 const int NumUnmerge = Unmerge->getNumOperands() - 1;
5113 const int PartsPerUnmerge = NumDst / NumUnmerge;
5114
5115 for (int I = 0; I != NumUnmerge; ++I) {
5116 auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);
5117
5118 for (int J = 0; J != PartsPerUnmerge; ++J)
5119 MIB.addDef(RegNo: MI.getOperand(i: I * PartsPerUnmerge + J).getReg());
5120 MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
5121 }
5122
5123 MI.eraseFromParent();
5124 return Legalized;
5125}
5126
5127LegalizerHelper::LegalizeResult
5128LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
5129 LLT NarrowTy) {
5130 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5131  // Requires compatible types. Otherwise the user of DstReg did not perform
5132  // an unmerge that should have been artifact-combined. Most likely that user
5133  // has to do more/fewer elements legalization compatible with NarrowTy.
5134 assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5135 assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5136 if (NarrowTy == SrcTy)
5137 return UnableToLegalize;
5138
5139  // This attempts to lower part of an LCMTy merge/unmerge sequence, and is
5140  // intended for old MIR tests. Since the switch to more/fewer elements
5141  // legalization, MIR like this should no longer be reachable from LLVM IR:
5142  // the LCMTy approach was replaced with merge/unmerge to vector elements.
5143 if (TypeIdx == 1) {
5144 assert(SrcTy.isVector() && "Expected vector types");
5145 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5146 if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5147 (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
5148 return UnableToLegalize;
5149 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
5150 //
5151 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
5152 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
5153 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
5154 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
5155 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
5156 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
5157
5158 SmallVector<Register, 8> Elts;
5159 LLT EltTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getScalarType();
5160 for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
5161 auto Unmerge = MIRBuilder.buildUnmerge(Res: EltTy, Op: MI.getOperand(i).getReg());
5162 for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
5163 Elts.push_back(Elt: Unmerge.getReg(Idx: j));
5164 }
5165
5166 SmallVector<Register, 8> NarrowTyElts;
5167 unsigned NumNarrowTyElts = NarrowTy.getNumElements();
5168 unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
5169 for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
5170 ++i, Offset += NumNarrowTyElts) {
5171 ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
5172 NarrowTyElts.push_back(
5173 Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Pieces).getReg(Idx: 0));
5174 }
5175
5176 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
5177 MI.eraseFromParent();
5178 return Legalized;
5179 }
5180
5181 assert(TypeIdx == 0 && "Bad type index");
5182 if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
5183 (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
5184 return UnableToLegalize;
5185
5186  // This is most likely SrcTy (smaller than register size) packed in DstTy
5187  // (larger than register size); since the merge was not combined, it will be
5188  // lowered to bit-sequence packing into a register. Merge SrcTy into NarrowTy
5189  // (register size) pieces first, then merge each NarrowTy piece into DstTy.
5190
5191 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
5192 //
5193 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
5194 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
5195 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
5196 SmallVector<Register, 8> NarrowTyElts;
5197 unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
5198 unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
5199 unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
5200 for (unsigned i = 0; i < NumParts; ++i) {
5201 SmallVector<Register, 8> Sources;
5202 for (unsigned j = 0; j < NumElts; ++j)
5203 Sources.push_back(Elt: MI.getOperand(i: 1 + i * NumElts + j).getReg());
5204 NarrowTyElts.push_back(
5205 Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Sources).getReg(Idx: 0));
5206 }
5207
5208 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
5209 MI.eraseFromParent();
5210 return Legalized;
5211}
5212
5213LegalizerHelper::LegalizeResult
5214LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
5215 unsigned TypeIdx,
5216 LLT NarrowVecTy) {
5217 auto [DstReg, SrcVec] = MI.getFirst2Regs();
5218 Register InsertVal;
5219 bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
5220
5221 assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
5222 if (IsInsert)
5223 InsertVal = MI.getOperand(i: 2).getReg();
5224
5225 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
5226
5227 // TODO: Handle total scalarization case.
5228 if (!NarrowVecTy.isVector())
5229 return UnableToLegalize;
5230
5231 LLT VecTy = MRI.getType(Reg: SrcVec);
5232
5233  // If the index is a constant, we can really break this down as you would
5234  // expect, and index into the target-sized pieces.
5235 int64_t IdxVal;
5236 auto MaybeCst = getIConstantVRegValWithLookThrough(VReg: Idx, MRI);
5237 if (MaybeCst) {
5238 IdxVal = MaybeCst->Value.getSExtValue();
5239 // Avoid out of bounds indexing the pieces.
5240 if (IdxVal >= VecTy.getNumElements()) {
5241 MIRBuilder.buildUndef(Res: DstReg);
5242 MI.eraseFromParent();
5243 return Legalized;
5244 }
5245
5246 SmallVector<Register, 8> VecParts;
5247 LLT GCDTy = extractGCDType(Parts&: VecParts, DstTy: VecTy, NarrowTy: NarrowVecTy, SrcReg: SrcVec);
5248
5249 // Build a sequence of NarrowTy pieces in VecParts for this operand.
5250 LLT LCMTy = buildLCMMergePieces(DstTy: VecTy, NarrowTy: NarrowVecTy, GCDTy, VRegs&: VecParts,
5251 PadStrategy: TargetOpcode::G_ANYEXT);
5252
5253 unsigned NewNumElts = NarrowVecTy.getNumElements();
5254
5255 LLT IdxTy = MRI.getType(Reg: Idx);
5256 int64_t PartIdx = IdxVal / NewNumElts;
5257 auto NewIdx =
5258 MIRBuilder.buildConstant(Res: IdxTy, Val: IdxVal - NewNumElts * PartIdx);
5259
5260 if (IsInsert) {
5261 LLT PartTy = MRI.getType(Reg: VecParts[PartIdx]);
5262
5263 // Use the adjusted index to insert into one of the subvectors.
5264 auto InsertPart = MIRBuilder.buildInsertVectorElement(
5265 Res: PartTy, Val: VecParts[PartIdx], Elt: InsertVal, Idx: NewIdx);
5266 VecParts[PartIdx] = InsertPart.getReg(Idx: 0);
5267
5268 // Recombine the inserted subvector with the others to reform the result
5269 // vector.
5270 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: VecParts);
5271 } else {
5272 MIRBuilder.buildExtractVectorElement(Res: DstReg, Val: VecParts[PartIdx], Idx: NewIdx);
5273 }
5274
5275 MI.eraseFromParent();
5276 return Legalized;
5277 }
5278
5279 // With a variable index, we can't perform the operation in a smaller type, so
5280 // we're forced to expand this.
5281 //
5282 // TODO: We could emit a chain of compare/select to figure out which piece to
5283 // index.
5284 return lowerExtractInsertVectorElt(MI);
5285}
5286
5287LegalizerHelper::LegalizeResult
5288LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
5289 LLT NarrowTy) {
5290 // FIXME: Don't know how to handle secondary types yet.
5291 if (TypeIdx != 0)
5292 return UnableToLegalize;
5293
5294 if (!NarrowTy.isByteSized()) {
5295 LLVM_DEBUG(dbgs() << "Can't narrow load/store to non-byte-sized type\n");
5296 return UnableToLegalize;
5297 }
5298
5299 // This implementation doesn't work for atomics. Give up instead of doing
5300 // something invalid.
5301 if (LdStMI.isAtomic())
5302 return UnableToLegalize;
5303
5304 bool IsLoad = isa<GLoad>(Val: LdStMI);
5305 Register ValReg = LdStMI.getReg(Idx: 0);
5306 Register AddrReg = LdStMI.getPointerReg();
5307 LLT ValTy = MRI.getType(Reg: ValReg);
5308
5309 // FIXME: Do we need a distinct NarrowMemory legalize action?
5310 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
5311 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
5312 return UnableToLegalize;
5313 }
5314
5315 int NumParts = -1;
5316 int NumLeftover = -1;
5317 LLT LeftoverTy;
5318 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
5319 if (IsLoad) {
5320 std::tie(args&: NumParts, args&: NumLeftover) = getNarrowTypeBreakDown(OrigTy: ValTy, NarrowTy, LeftoverTy);
5321 } else {
5322 if (extractParts(Reg: ValReg, RegTy: ValTy, MainTy: NarrowTy, LeftoverTy, VRegs&: NarrowRegs,
5323 LeftoverVRegs&: NarrowLeftoverRegs, MIRBuilder, MRI)) {
5324 NumParts = NarrowRegs.size();
5325 NumLeftover = NarrowLeftoverRegs.size();
5326 }
5327 }
5328
5329 if (NumParts == -1)
5330 return UnableToLegalize;
5331
5332 LLT PtrTy = MRI.getType(Reg: AddrReg);
5333 const LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
5334
5335 unsigned TotalSize = ValTy.getSizeInBits();
5336
5337  // Split the load/store into PartTy-sized pieces starting at Offset. If this
5338  // is a load, return the new registers in ValRegs. For a store, each element
5339  // of ValRegs should have type PartTy. Returns the next offset that needs to
5340  // be handled.
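  // e.g. (illustrative sketch) narrowing an s96 access with NarrowTy = s64 on
  // a little-endian target: an s64 piece at byte offset 0, then an s32
  // leftover piece at byte offset 8. Big-endian targets instead start from
  // the most significant piece.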
5341 bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
5342 auto MMO = LdStMI.getMMO();
5343 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
5344 unsigned NumParts, unsigned Offset) -> unsigned {
5345 MachineFunction &MF = MIRBuilder.getMF();
5346 unsigned PartSize = PartTy.getSizeInBits();
5347 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
5348 ++Idx) {
5349 unsigned ByteOffset = Offset / 8;
5350 Register NewAddrReg;
5351
5352 MIRBuilder.materializePtrAdd(Res&: NewAddrReg, Op0: AddrReg, ValueTy: OffsetTy, Value: ByteOffset);
5353
5354 MachineMemOperand *NewMMO =
5355 MF.getMachineMemOperand(MMO: &MMO, Offset: ByteOffset, Ty: PartTy);
5356
5357 if (IsLoad) {
5358 Register Dst = MRI.createGenericVirtualRegister(Ty: PartTy);
5359 ValRegs.push_back(Elt: Dst);
5360 MIRBuilder.buildLoad(Res: Dst, Addr: NewAddrReg, MMO&: *NewMMO);
5361 } else {
5362 MIRBuilder.buildStore(Val: ValRegs[Idx], Addr: NewAddrReg, MMO&: *NewMMO);
5363 }
5364 Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
5365 }
5366
5367 return Offset;
5368 };
5369
5370 unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
5371 unsigned HandledOffset =
5372 splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
5373
5374 // Handle the rest of the register if this isn't an even type breakdown.
5375 if (LeftoverTy.isValid())
5376 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
5377
5378 if (IsLoad) {
5379 insertParts(DstReg: ValReg, ResultTy: ValTy, PartTy: NarrowTy, PartRegs: NarrowRegs,
5380 LeftoverTy, LeftoverRegs: NarrowLeftoverRegs);
5381 }
5382
5383 LdStMI.eraseFromParent();
5384 return Legalized;
5385}
5386
5387LegalizerHelper::LegalizeResult
5388LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
5389 LLT NarrowTy) {
5390 using namespace TargetOpcode;
5391 GenericMachineInstr &GMI = cast<GenericMachineInstr>(Val&: MI);
5392 unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
5393
5394 switch (MI.getOpcode()) {
5395 case G_IMPLICIT_DEF:
5396 case G_TRUNC:
5397 case G_AND:
5398 case G_OR:
5399 case G_XOR:
5400 case G_ADD:
5401 case G_SUB:
5402 case G_MUL:
5403 case G_PTR_ADD:
5404 case G_SMULH:
5405 case G_UMULH:
5406 case G_FADD:
5407 case G_FMUL:
5408 case G_FSUB:
5409 case G_FNEG:
5410 case G_FABS:
5411 case G_FCANONICALIZE:
5412 case G_FDIV:
5413 case G_FREM:
5414 case G_FMA:
5415 case G_FMAD:
5416 case G_FPOW:
5417 case G_FEXP:
5418 case G_FEXP2:
5419 case G_FEXP10:
5420 case G_FLOG:
5421 case G_FLOG2:
5422 case G_FLOG10:
5423 case G_FLDEXP:
5424 case G_FNEARBYINT:
5425 case G_FCEIL:
5426 case G_FFLOOR:
5427 case G_FRINT:
5428 case G_INTRINSIC_LRINT:
5429 case G_INTRINSIC_LLRINT:
5430 case G_INTRINSIC_ROUND:
5431 case G_INTRINSIC_ROUNDEVEN:
5432 case G_LROUND:
5433 case G_LLROUND:
5434 case G_INTRINSIC_TRUNC:
5435 case G_FCOS:
5436 case G_FSIN:
5437 case G_FTAN:
5438 case G_FACOS:
5439 case G_FASIN:
5440 case G_FATAN:
5441 case G_FATAN2:
5442 case G_FCOSH:
5443 case G_FSINH:
5444 case G_FTANH:
5445 case G_FSQRT:
5446 case G_BSWAP:
5447 case G_BITREVERSE:
5448 case G_SDIV:
5449 case G_UDIV:
5450 case G_SREM:
5451 case G_UREM:
5452 case G_SDIVREM:
5453 case G_UDIVREM:
5454 case G_SMIN:
5455 case G_SMAX:
5456 case G_UMIN:
5457 case G_UMAX:
5458 case G_ABS:
5459 case G_FMINNUM:
5460 case G_FMAXNUM:
5461 case G_FMINNUM_IEEE:
5462 case G_FMAXNUM_IEEE:
5463 case G_FMINIMUM:
5464 case G_FMAXIMUM:
5465 case G_FMINIMUMNUM:
5466 case G_FMAXIMUMNUM:
5467 case G_FSHL:
5468 case G_FSHR:
5469 case G_ROTL:
5470 case G_ROTR:
5471 case G_FREEZE:
5472 case G_SADDSAT:
5473 case G_SSUBSAT:
5474 case G_UADDSAT:
5475 case G_USUBSAT:
5476 case G_UMULO:
5477 case G_SMULO:
5478 case G_SHL:
5479 case G_LSHR:
5480 case G_ASHR:
5481 case G_SSHLSAT:
5482 case G_USHLSAT:
5483 case G_CTLZ:
5484 case G_CTLZ_ZERO_UNDEF:
5485 case G_CTTZ:
5486 case G_CTTZ_ZERO_UNDEF:
5487 case G_CTPOP:
5488 case G_FCOPYSIGN:
5489 case G_ZEXT:
5490 case G_SEXT:
5491 case G_ANYEXT:
5492 case G_FPEXT:
5493 case G_FPTRUNC:
5494 case G_SITOFP:
5495 case G_UITOFP:
5496 case G_FPTOSI:
5497 case G_FPTOUI:
5498 case G_FPTOSI_SAT:
5499 case G_FPTOUI_SAT:
5500 case G_INTTOPTR:
5501 case G_PTRTOINT:
5502 case G_ADDRSPACE_CAST:
5503 case G_UADDO:
5504 case G_USUBO:
5505 case G_UADDE:
5506 case G_USUBE:
5507 case G_SADDO:
5508 case G_SSUBO:
5509 case G_SADDE:
5510 case G_SSUBE:
5511 case G_STRICT_FADD:
5512 case G_STRICT_FSUB:
5513 case G_STRICT_FMUL:
5514 case G_STRICT_FMA:
5515 case G_STRICT_FLDEXP:
5516 case G_FFREXP:
5517 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
5518 case G_ICMP:
5519 case G_FCMP:
5520    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*cmp predicate*/});
5521 case G_IS_FPCLASS:
5522 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2, 3 /*mask,fpsem*/});
5523 case G_SELECT:
5524 if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector())
5525 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
5526 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*scalar cond*/});
5527 case G_PHI:
5528 return fewerElementsVectorPhi(MI&: GMI, NumElts);
5529 case G_UNMERGE_VALUES:
5530 return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
5531 case G_BUILD_VECTOR:
5532 assert(TypeIdx == 0 && "not a vector type index");
5533 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
5534 case G_CONCAT_VECTORS:
5535 if (TypeIdx != 1) // TODO: This probably does work as expected already.
5536 return UnableToLegalize;
5537 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
5538 case G_EXTRACT_VECTOR_ELT:
5539 case G_INSERT_VECTOR_ELT:
5540 return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowVecTy: NarrowTy);
5541 case G_LOAD:
5542 case G_STORE:
5543 return reduceLoadStoreWidth(LdStMI&: cast<GLoadStore>(Val&: MI), TypeIdx, NarrowTy);
5544 case G_SEXT_INREG:
5545 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*imm*/});
5546 GISEL_VECREDUCE_CASES_NONSEQ
5547 return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
5548 case TargetOpcode::G_VECREDUCE_SEQ_FADD:
5549 case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
5550 return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
5551 case G_SHUFFLE_VECTOR:
5552 return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
5553 case G_FPOWI:
5554 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*pow*/});
5555 case G_BITCAST:
5556 return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
5557 case G_INTRINSIC_FPTRUNC_ROUND:
5558 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2});
5559 default:
5560 return UnableToLegalize;
5561 }
5562}
5563
5564LegalizerHelper::LegalizeResult
5565LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
5566 LLT NarrowTy) {
5567 assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
5568 "Not a bitcast operation");
5569
5570 if (TypeIdx != 0)
5571 return UnableToLegalize;
5572
5573 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5574
5575 unsigned NewElemCount =
5576 NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
5577 LLT SrcNarrowTy = LLT::fixed_vector(NumElements: NewElemCount, ScalarTy: SrcTy.getElementType());
5578
5579 // Split the Src and Dst Reg into smaller registers
5580 SmallVector<Register> SrcVRegs, BitcastVRegs;
5581 if (extractGCDType(Parts&: SrcVRegs, DstTy, NarrowTy: SrcNarrowTy, SrcReg) != SrcNarrowTy)
5582 return UnableToLegalize;
5583
5584  // Build new, smaller bitcast instructions.
5585  // Leftover types are not supported for now, but eventually will have to be.
5586 for (Register Reg : SrcVRegs)
5587 BitcastVRegs.push_back(Elt: MIRBuilder.buildBitcast(Dst: NarrowTy, Src: Reg).getReg(Idx: 0));
5588
5589 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: BitcastVRegs);
5590 MI.eraseFromParent();
5591 return Legalized;
5592}
5593
5594LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
5595 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5596 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
5597 if (TypeIdx != 0)
5598 return UnableToLegalize;
5599
5600 auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
5601 MI.getFirst3RegLLTs();
5602 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
5603 // The shuffle should be canonicalized by now.
5604 if (DstTy != Src1Ty)
5605 return UnableToLegalize;
5606 if (DstTy != Src2Ty)
5607 return UnableToLegalize;
5608
5609 if (!isPowerOf2_32(Value: DstTy.getNumElements()))
5610 return UnableToLegalize;
5611
5612 // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
5613  // Further legalization attempts will be needed to split further.
5614 NarrowTy =
5615 DstTy.changeElementCount(EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
5616 unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
5617
5618 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
5619 extractParts(Reg: Src1Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc1Regs, MIRBuilder, MRI);
5620 extractParts(Reg: Src2Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc2Regs, MIRBuilder, MRI);
5621 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
5622 SplitSrc2Regs[1]};
5623
5624 Register Hi, Lo;
5625
5626 // If Lo or Hi uses elements from at most two of the four input vectors, then
5627 // express it as a vector shuffle of those two inputs. Otherwise extract the
5628 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
5629 SmallVector<int, 16> Ops;
5630 for (unsigned High = 0; High < 2; ++High) {
5631 Register &Output = High ? Hi : Lo;
5632
5633 // Build a shuffle mask for the output, discovering on the fly which
5634 // input vectors to use as shuffle operands (recorded in InputUsed).
5635 // If building a suitable shuffle vector proves too hard, then bail
5636    // out with UseBuildVector set.
5637 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
5638 unsigned FirstMaskIdx = High * NewElts;
5639 bool UseBuildVector = false;
5640 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
5641 // The mask element. This indexes into the input.
5642 int Idx = Mask[FirstMaskIdx + MaskOffset];
5643
5644 // The input vector this mask element indexes into.
5645 unsigned Input = (unsigned)Idx / NewElts;
5646
5647 if (Input >= std::size(Inputs)) {
5648 // The mask element does not index into any input vector.
5649 Ops.push_back(Elt: -1);
5650 continue;
5651 }
5652
5653 // Turn the index into an offset from the start of the input vector.
5654 Idx -= Input * NewElts;
5655
5656 // Find or create a shuffle vector operand to hold this input.
5657 unsigned OpNo;
5658 for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
5659 if (InputUsed[OpNo] == Input) {
5660 // This input vector is already an operand.
5661 break;
5662 } else if (InputUsed[OpNo] == -1U) {
5663 // Create a new operand for this input vector.
5664 InputUsed[OpNo] = Input;
5665 break;
5666 }
5667 }
5668
5669 if (OpNo >= std::size(InputUsed)) {
5670 // More than two input vectors used! Give up on trying to create a
5671 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
5672 UseBuildVector = true;
5673 break;
5674 }
5675
5676 // Add the mask index for the new shuffle vector.
5677 Ops.push_back(Elt: Idx + OpNo * NewElts);
5678 }
5679
5680 if (UseBuildVector) {
5681 LLT EltTy = NarrowTy.getElementType();
5682 SmallVector<Register, 16> SVOps;
5683
5684 // Extract the input elements by hand.
5685 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
5686 // The mask element. This indexes into the input.
5687 int Idx = Mask[FirstMaskIdx + MaskOffset];
5688
5689 // The input vector this mask element indexes into.
5690 unsigned Input = (unsigned)Idx / NewElts;
5691
5692 if (Input >= std::size(Inputs)) {
5693 // The mask element is "undef" or indexes off the end of the input.
5694 SVOps.push_back(Elt: MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0));
5695 continue;
5696 }
5697
5698 // Turn the index into an offset from the start of the input vector.
5699 Idx -= Input * NewElts;
5700
5701 // Extract the vector element by hand.
5702 SVOps.push_back(Elt: MIRBuilder
5703 .buildExtractVectorElement(
5704 Res: EltTy, Val: Inputs[Input],
5705 Idx: MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: Idx))
5706 .getReg(Idx: 0));
5707 }
5708
5709 // Construct the Lo/Hi output using a G_BUILD_VECTOR.
5710 Output = MIRBuilder.buildBuildVector(Res: NarrowTy, Ops: SVOps).getReg(Idx: 0);
5711 } else if (InputUsed[0] == -1U) {
5712 // No input vectors were used! The result is undefined.
5713 Output = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
5714 } else {
5715 Register Op0 = Inputs[InputUsed[0]];
5716 // If only one input was used, use an undefined vector for the other.
5717 Register Op1 = InputUsed[1] == -1U
5718 ? MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0)
5719 : Inputs[InputUsed[1]];
5720 // At least one input vector was used. Create a new shuffle vector.
5721 Output = MIRBuilder.buildShuffleVector(Res: NarrowTy, Src1: Op0, Src2: Op1, Mask: Ops).getReg(Idx: 0);
5722 }
5723
5724 Ops.clear();
5725 }
5726
5727 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: {Lo, Hi});
5728 MI.eraseFromParent();
5729 return Legalized;
5730}
5731
5732LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
5733 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5734 auto &RdxMI = cast<GVecReduce>(Val&: MI);
5735
5736 if (TypeIdx != 1)
5737 return UnableToLegalize;
5738
5739 // The semantics of the normal non-sequential reductions allow us to freely
5740 // re-associate the operation.
5741 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
5742
5743 if (NarrowTy.isVector() &&
5744 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
5745 return UnableToLegalize;
5746
5747 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
5748 SmallVector<Register> SplitSrcs;
5749 // If NarrowTy is a scalar then we're being asked to scalarize.
5750 const unsigned NumParts =
5751 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
5752 : SrcTy.getNumElements();
5753
5754 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5755 if (NarrowTy.isScalar()) {
5756 if (DstTy != NarrowTy)
5757 return UnableToLegalize; // FIXME: handle implicit extensions.
5758
5759 if (isPowerOf2_32(Value: NumParts)) {
5760 // Generate a tree of scalar operations to reduce the critical path.
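      // e.g. (illustrative sketch) with four pieces:
      //   (p0 op p1) op (p2 op p3)
      // halving the worklist each round.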
5761 SmallVector<Register> PartialResults;
5762 unsigned NumPartsLeft = NumParts;
5763 while (NumPartsLeft > 1) {
5764 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
5765 PartialResults.emplace_back(
5766 Args: MIRBuilder
5767 .buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy},
5768 SrcOps: {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
5769 .getReg(Idx: 0));
5770 }
5771 SplitSrcs = PartialResults;
5772 PartialResults.clear();
5773 NumPartsLeft = SplitSrcs.size();
5774 }
5775 assert(SplitSrcs.size() == 1);
5776 MIRBuilder.buildCopy(Res: DstReg, Op: SplitSrcs[0]);
5777 MI.eraseFromParent();
5778 return Legalized;
5779 }
5780 // If we can't generate a tree, then just do sequential operations.
5781 Register Acc = SplitSrcs[0];
5782 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
5783 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[Idx]})
5784 .getReg(Idx: 0);
5785 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5786 MI.eraseFromParent();
5787 return Legalized;
5788 }
5789 SmallVector<Register> PartialReductions;
5790 for (unsigned Part = 0; Part < NumParts; ++Part) {
5791 PartialReductions.push_back(
5792 Elt: MIRBuilder.buildInstr(Opc: RdxMI.getOpcode(), DstOps: {DstTy}, SrcOps: {SplitSrcs[Part]})
5793 .getReg(Idx: 0));
5794 }
5795
5796  // If the types involved are powers of 2, we can generate intermediate
5797  // vector ops before generating a final reduction operation.
5798 if (isPowerOf2_32(Value: SrcTy.getNumElements()) &&
5799 isPowerOf2_32(Value: NarrowTy.getNumElements())) {
5800 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
5801 }
5802
5803 Register Acc = PartialReductions[0];
5804 for (unsigned Part = 1; Part < NumParts; ++Part) {
5805 if (Part == NumParts - 1) {
5806 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {DstReg},
5807 SrcOps: {Acc, PartialReductions[Part]});
5808 } else {
5809 Acc = MIRBuilder
5810 .buildInstr(Opc: ScalarOpc, DstOps: {DstTy}, SrcOps: {Acc, PartialReductions[Part]})
5811 .getReg(Idx: 0);
5812 }
5813 }
5814 MI.eraseFromParent();
5815 return Legalized;
5816}
5817
5818LegalizerHelper::LegalizeResult
5819LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
5820 unsigned int TypeIdx,
5821 LLT NarrowTy) {
5822 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
5823 MI.getFirst3RegLLTs();
5824 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
5825 DstTy != NarrowTy)
5826 return UnableToLegalize;
5827
5828 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
5829 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
5830 "Unexpected vecreduce opcode");
5831 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
5832 ? TargetOpcode::G_FADD
5833 : TargetOpcode::G_FMUL;
5834
5835 SmallVector<Register> SplitSrcs;
5836 unsigned NumParts = SrcTy.getNumElements();
5837 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5838 Register Acc = ScalarReg;
5839 for (unsigned i = 0; i < NumParts; i++)
5840 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[i]})
5841 .getReg(Idx: 0);
5842
5843 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5844 MI.eraseFromParent();
5845 return Legalized;
5846}
5847
5848LegalizerHelper::LegalizeResult
5849LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
5850 LLT SrcTy, LLT NarrowTy,
5851 unsigned ScalarOpc) {
5852 SmallVector<Register> SplitSrcs;
5853 // Split the sources into NarrowTy size pieces.
5854 extractParts(Reg: SrcReg, Ty: NarrowTy,
5855 NumParts: SrcTy.getNumElements() / NarrowTy.getNumElements(), VRegs&: SplitSrcs,
5856 MIRBuilder, MRI);
5857  // We're going to do a tree reduction using vector operations until we have
5858  // one NarrowTy-sized value left.
5859 while (SplitSrcs.size() > 1) {
5860 SmallVector<Register> PartialRdxs;
5861 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
5862 Register LHS = SplitSrcs[Idx];
5863 Register RHS = SplitSrcs[Idx + 1];
5864 // Create the intermediate vector op.
5865 Register Res =
5866 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {LHS, RHS}).getReg(Idx: 0);
5867 PartialRdxs.push_back(Elt: Res);
5868 }
5869 SplitSrcs = std::move(PartialRdxs);
5870 }
5871 // Finally generate the requested NarrowTy based reduction.
5872 Observer.changingInstr(MI);
5873 MI.getOperand(i: 1).setReg(SplitSrcs[0]);
5874 Observer.changedInstr(MI);
5875 return Legalized;
5876}
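// Illustrative example (assumed shapes, not from the source): narrowing a
// <8 x s32> G_VECREDUCE_ADD with NarrowTy = <2 x s32> yields four pieces
// P0..P3, which the loop above combines pairwise:
//   round 1: {P0 + P1, P2 + P3}
//   round 2: {(P0 + P1) + (P2 + P3)}
// leaving one <2 x s32> value for the rewritten reduction to consume.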
5877
5878LegalizerHelper::LegalizeResult
5879LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
5880 const LLT HalfTy, const LLT AmtTy) {
5881
5882 Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
5883 Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
5884 MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));
5885
5886 if (Amt.isZero()) {
5887 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {InL, InH});
5888 MI.eraseFromParent();
5889 return Legalized;
5890 }
5891
5892 LLT NVT = HalfTy;
5893 unsigned NVTBits = HalfTy.getSizeInBits();
5894 unsigned VTBits = 2 * NVTBits;
5895
5896 SrcOp Lo(Register(0)), Hi(Register(0));
5897 if (MI.getOpcode() == TargetOpcode::G_SHL) {
5898 if (Amt.ugt(RHS: VTBits)) {
5899 Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
5900 } else if (Amt.ugt(RHS: NVTBits)) {
5901 Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
5902 Hi = MIRBuilder.buildShl(Dst: NVT, Src0: InL,
5903 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
5904 } else if (Amt == NVTBits) {
5905 Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
5906 Hi = InL;
5907 } else {
5908 Lo = MIRBuilder.buildShl(Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
5909 auto OrLHS =
5910 MIRBuilder.buildShl(Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
5911 auto OrRHS = MIRBuilder.buildLShr(
5912 Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
5913 Hi = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
5914 }
5915 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5916 if (Amt.ugt(RHS: VTBits)) {
5917 Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
5918 } else if (Amt.ugt(RHS: NVTBits)) {
5919 Lo = MIRBuilder.buildLShr(Dst: NVT, Src0: InH,
5920 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
5921 Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
5922 } else if (Amt == NVTBits) {
5923 Lo = InH;
5924 Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
5925 } else {
5926 auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);
5927
5928 auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
5929 auto OrRHS = MIRBuilder.buildShl(
5930 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
5931
5932 Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
5933 Hi = MIRBuilder.buildLShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
5934 }
5935 } else {
5936 if (Amt.ugt(RHS: VTBits)) {
5937 Hi = Lo = MIRBuilder.buildAShr(
5938 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
5939 } else if (Amt.ugt(RHS: NVTBits)) {
5940 Lo = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
5941 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
5942 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
5943 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
5944 } else if (Amt == NVTBits) {
5945 Lo = InH;
5946 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
5947 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
5948 } else {
5949 auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);
5950
5951 auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
5952 auto OrRHS = MIRBuilder.buildShl(
5953 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
5954
5955 Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
5956 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
5957 }
5958 }
5959
5960 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {Lo, Hi});
5961 MI.eraseFromParent();
5962
5963 return Legalized;
5964}
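// Illustrative expansion (assumed values, approximate MIR): a 64-bit G_SHL by
// the constant 40 with 32-bit halves (NVTBits = 32) takes the
// Amt.ugt(NVTBits) path above:
//   %lo:_(s32)  = G_CONSTANT i32 0
//   %hi:_(s32)  = G_SHL %inl, 8            ; 40 - 32 == 8
//   %dst:_(s64) = G_MERGE_VALUES %lo, %hi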
5965
5966// TODO: Optimize if constant shift amount.
5967LegalizerHelper::LegalizeResult
5968LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
5969 LLT RequestedTy) {
5970 if (TypeIdx == 1) {
5971 Observer.changingInstr(MI);
5972 narrowScalarSrc(MI, NarrowTy: RequestedTy, OpIdx: 2);
5973 Observer.changedInstr(MI);
5974 return Legalized;
5975 }
5976
5977 Register DstReg = MI.getOperand(i: 0).getReg();
5978 LLT DstTy = MRI.getType(Reg: DstReg);
5979 if (DstTy.isVector())
5980 return UnableToLegalize;
5981
5982 Register Amt = MI.getOperand(i: 2).getReg();
5983 LLT ShiftAmtTy = MRI.getType(Reg: Amt);
5984 const unsigned DstEltSize = DstTy.getScalarSizeInBits();
5985 if (DstEltSize % 2 != 0)
5986 return UnableToLegalize;
5987
5988 // Ignore the input type. We can only go to exactly half the size of the
5989 // input. If that isn't small enough, the resulting pieces will be further
5990 // legalized.
5991 const unsigned NewBitSize = DstEltSize / 2;
5992 const LLT HalfTy = LLT::scalar(SizeInBits: NewBitSize);
5993 const LLT CondTy = LLT::scalar(SizeInBits: 1);
5994
5995 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: Amt, MRI)) {
5996 return narrowScalarShiftByConstant(MI, Amt: VRegAndVal->Value, HalfTy,
5997 AmtTy: ShiftAmtTy);
5998 }
5999
6000 // TODO: Expand with known bits.
6001
6002 // Handle the fully general expansion by an unknown amount.
6003 auto NewBits = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize);
6004
6005 Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
6006 Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
6007 MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));
6008
6009 auto AmtExcess = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: Amt, Src1: NewBits);
6010 auto AmtLack = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: NewBits, Src1: Amt);
6011
6012 auto Zero = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6013 auto IsShort = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: CondTy, Op0: Amt, Op1: NewBits);
6014 auto IsZero = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: CondTy, Op0: Amt, Op1: Zero);
6015
6016 Register ResultRegs[2];
6017 switch (MI.getOpcode()) {
6018 case TargetOpcode::G_SHL: {
6019 // Short: ShAmt < NewBitSize
6020 auto LoS = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: Amt);
6021
6022 auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: AmtLack);
6023 auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: Amt);
6024 auto HiS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);
6025
6026 // Long: ShAmt >= NewBitSize
6027 auto LoL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Lo part is zero.
6028 auto HiL = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: AmtExcess); // Hi from Lo part.
6029
6030 auto Lo = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL);
6031 auto Hi = MIRBuilder.buildSelect(
6032 Res: HalfTy, Tst: IsZero, Op0: InH, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL));
6033
6034 ResultRegs[0] = Lo.getReg(Idx: 0);
6035 ResultRegs[1] = Hi.getReg(Idx: 0);
6036 break;
6037 }
6038 case TargetOpcode::G_LSHR:
6039 case TargetOpcode::G_ASHR: {
6040 // Short: ShAmt < NewBitSize
6041 auto HiS = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy}, SrcOps: {InH, Amt});
6042
6043 auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: Amt);
6044 auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: AmtLack);
6045 auto LoS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);
6046
6047 // Long: ShAmt >= NewBitSize
6048 MachineInstrBuilder HiL;
6049 if (MI.getOpcode() == TargetOpcode::G_LSHR) {
6050 HiL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Hi part is zero.
6051 } else {
6052 auto ShiftAmt = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize - 1);
6053 HiL = MIRBuilder.buildAShr(Dst: HalfTy, Src0: InH, Src1: ShiftAmt); // Sign of Hi part.
6054 }
6055 auto LoL = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy},
6056 SrcOps: {InH, AmtExcess}); // Lo from Hi part.
6057
6058 auto Lo = MIRBuilder.buildSelect(
6059 Res: HalfTy, Tst: IsZero, Op0: InL, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL));
6060
6061 auto Hi = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL);
6062
6063 ResultRegs[0] = Lo.getReg(Idx: 0);
6064 ResultRegs[1] = Hi.getReg(Idx: 0);
6065 break;
6066 }
6067 default:
6068 llvm_unreachable("not a shift");
6069 }
6070
6071 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: ResultRegs);
6072 MI.eraseFromParent();
6073 return Legalized;
6074}
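// A rough sketch of the select-based expansion above for a 64-bit G_LSHR by
// an unknown amount A with 32-bit halves (illustrative, not exact MIR):
//   Lo = (A == 0) ? InL
//                 : (A < 32) ? (InL >> A) | (InH << (32 - A)) : InH >> (A - 32)
//   Hi = (A < 32) ? (InH >> A) : 0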
6075
6076LegalizerHelper::LegalizeResult
6077LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
6078 LLT MoreTy) {
6079 assert(TypeIdx == 0 && "Expecting only Idx 0");
6080
6081 Observer.changingInstr(MI);
6082 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6083 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
6084 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminator());
6085 moreElementsVectorSrc(MI, MoreTy, OpIdx: I);
6086 }
6087
6088 MachineBasicBlock &MBB = *MI.getParent();
6089 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
6090 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6091 Observer.changedInstr(MI);
6092 return Legalized;
6093}
6094
6095MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6096 unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6097 assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6098
6099 switch (Opcode) {
6100 default:
6101 llvm_unreachable(
6102 "getNeutralElementForVecReduce called with invalid opcode!");
6103 case TargetOpcode::G_VECREDUCE_ADD:
6104 case TargetOpcode::G_VECREDUCE_OR:
6105 case TargetOpcode::G_VECREDUCE_XOR:
6106 case TargetOpcode::G_VECREDUCE_UMAX:
6107 return MIRBuilder.buildConstant(Res: Ty, Val: 0);
6108 case TargetOpcode::G_VECREDUCE_MUL:
6109 return MIRBuilder.buildConstant(Res: Ty, Val: 1);
6110 case TargetOpcode::G_VECREDUCE_AND:
6111 case TargetOpcode::G_VECREDUCE_UMIN:
6112 return MIRBuilder.buildConstant(
6113 Res: Ty, Val: APInt::getAllOnes(numBits: Ty.getScalarSizeInBits()));
6114 case TargetOpcode::G_VECREDUCE_SMAX:
6115 return MIRBuilder.buildConstant(
6116 Res: Ty, Val: APInt::getSignedMinValue(numBits: Ty.getSizeInBits()));
6117 case TargetOpcode::G_VECREDUCE_SMIN:
6118 return MIRBuilder.buildConstant(
6119 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getSizeInBits()));
6120 case TargetOpcode::G_VECREDUCE_FADD:
6121 return MIRBuilder.buildFConstant(Res: Ty, Val: -0.0);
6122 case TargetOpcode::G_VECREDUCE_FMUL:
6123 return MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
6124 case TargetOpcode::G_VECREDUCE_FMINIMUM:
6125 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6126 assert(false && "getNeutralElementForVecReduce unimplemented for "
6127 "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6128 }
6129 llvm_unreachable("switch expected to return!");
6130}
6131
6132LegalizerHelper::LegalizeResult
6133LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
6134 LLT MoreTy) {
6135 unsigned Opc = MI.getOpcode();
6136 switch (Opc) {
6137 case TargetOpcode::G_IMPLICIT_DEF:
6138 case TargetOpcode::G_LOAD: {
6139 if (TypeIdx != 0)
6140 return UnableToLegalize;
6141 Observer.changingInstr(MI);
6142 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6143 Observer.changedInstr(MI);
6144 return Legalized;
6145 }
6146 case TargetOpcode::G_STORE:
6147 if (TypeIdx != 0)
6148 return UnableToLegalize;
6149 Observer.changingInstr(MI);
6150 moreElementsVectorSrc(MI, MoreTy, OpIdx: 0);
6151 Observer.changedInstr(MI);
6152 return Legalized;
6153 case TargetOpcode::G_AND:
6154 case TargetOpcode::G_OR:
6155 case TargetOpcode::G_XOR:
6156 case TargetOpcode::G_ADD:
6157 case TargetOpcode::G_SUB:
6158 case TargetOpcode::G_MUL:
6159 case TargetOpcode::G_FADD:
6160 case TargetOpcode::G_FSUB:
6161 case TargetOpcode::G_FMUL:
6162 case TargetOpcode::G_FDIV:
6163 case TargetOpcode::G_FCOPYSIGN:
6164 case TargetOpcode::G_UADDSAT:
6165 case TargetOpcode::G_USUBSAT:
6166 case TargetOpcode::G_SADDSAT:
6167 case TargetOpcode::G_SSUBSAT:
6168 case TargetOpcode::G_SMIN:
6169 case TargetOpcode::G_SMAX:
6170 case TargetOpcode::G_UMIN:
6171 case TargetOpcode::G_UMAX:
6172 case TargetOpcode::G_FMINNUM:
6173 case TargetOpcode::G_FMAXNUM:
6174 case TargetOpcode::G_FMINNUM_IEEE:
6175 case TargetOpcode::G_FMAXNUM_IEEE:
6176 case TargetOpcode::G_FMINIMUM:
6177 case TargetOpcode::G_FMAXIMUM:
6178 case TargetOpcode::G_FMINIMUMNUM:
6179 case TargetOpcode::G_FMAXIMUMNUM:
6180 case TargetOpcode::G_STRICT_FADD:
6181 case TargetOpcode::G_STRICT_FSUB:
6182 case TargetOpcode::G_STRICT_FMUL:
6183 case TargetOpcode::G_SHL:
6184 case TargetOpcode::G_ASHR:
6185 case TargetOpcode::G_LSHR: {
6186 Observer.changingInstr(MI);
6187 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6188 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6189 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6190 Observer.changedInstr(MI);
6191 return Legalized;
6192 }
6193 case TargetOpcode::G_FMA:
6194 case TargetOpcode::G_STRICT_FMA:
6195 case TargetOpcode::G_FSHR:
6196 case TargetOpcode::G_FSHL: {
6197 Observer.changingInstr(MI);
6198 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6199 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6200 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
6201 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6202 Observer.changedInstr(MI);
6203 return Legalized;
6204 }
6205 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
6206 case TargetOpcode::G_EXTRACT:
6207 if (TypeIdx != 1)
6208 return UnableToLegalize;
6209 Observer.changingInstr(MI);
6210 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6211 Observer.changedInstr(MI);
6212 return Legalized;
6213 case TargetOpcode::G_INSERT:
6214 case TargetOpcode::G_INSERT_VECTOR_ELT:
6215 case TargetOpcode::G_FREEZE:
6216 case TargetOpcode::G_FNEG:
6217 case TargetOpcode::G_FABS:
6218 case TargetOpcode::G_FSQRT:
6219 case TargetOpcode::G_FCEIL:
6220 case TargetOpcode::G_FFLOOR:
6221 case TargetOpcode::G_FNEARBYINT:
6222 case TargetOpcode::G_FRINT:
6223 case TargetOpcode::G_INTRINSIC_ROUND:
6224 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
6225 case TargetOpcode::G_INTRINSIC_TRUNC:
6226 case TargetOpcode::G_BITREVERSE:
6227 case TargetOpcode::G_BSWAP:
6228 case TargetOpcode::G_FCANONICALIZE:
6229 case TargetOpcode::G_SEXT_INREG:
6230 case TargetOpcode::G_ABS:
6231 case TargetOpcode::G_CTLZ:
6232 case TargetOpcode::G_CTPOP:
6233 if (TypeIdx != 0)
6234 return UnableToLegalize;
6235 Observer.changingInstr(MI);
6236 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6237 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6238 Observer.changedInstr(MI);
6239 return Legalized;
6240 case TargetOpcode::G_SELECT: {
6241 auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
6242 if (TypeIdx == 1) {
6243 if (!CondTy.isScalar() ||
6244 DstTy.getElementCount() != MoreTy.getElementCount())
6245 return UnableToLegalize;
6246
6247 // This is turning a scalar select of vectors into a vector
6248 // select. Broadcast the select condition.
6249 auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: MoreTy, Src: CondReg);
6250 Observer.changingInstr(MI);
6251 MI.getOperand(i: 1).setReg(ShufSplat.getReg(Idx: 0));
6252 Observer.changedInstr(MI);
6253 return Legalized;
6254 }
6255
6256 if (CondTy.isVector())
6257 return UnableToLegalize;
6258
6259 Observer.changingInstr(MI);
6260 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6261 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
6262 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6263 Observer.changedInstr(MI);
6264 return Legalized;
6265 }
6266 case TargetOpcode::G_UNMERGE_VALUES:
6267 return UnableToLegalize;
6268 case TargetOpcode::G_PHI:
6269 return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
6270 case TargetOpcode::G_SHUFFLE_VECTOR:
6271 return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
6272 case TargetOpcode::G_BUILD_VECTOR: {
6273 SmallVector<SrcOp, 8> Elts;
6274 for (auto Op : MI.uses()) {
6275 Elts.push_back(Elt: Op.getReg());
6276 }
6277
6278 for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
6279 Elts.push_back(Elt: MIRBuilder.buildUndef(Res: MoreTy.getScalarType()));
6280 }
6281
6282 MIRBuilder.buildDeleteTrailingVectorElements(
6283 Res: MI.getOperand(i: 0).getReg(), Op0: MIRBuilder.buildInstr(Opc, DstOps: {MoreTy}, SrcOps: Elts));
6284 MI.eraseFromParent();
6285 return Legalized;
6286 }
6287 case TargetOpcode::G_SEXT:
6288 case TargetOpcode::G_ZEXT:
6289 case TargetOpcode::G_ANYEXT:
6290 case TargetOpcode::G_TRUNC:
6291 case TargetOpcode::G_FPTRUNC:
6292 case TargetOpcode::G_FPEXT:
6293 case TargetOpcode::G_FPTOSI:
6294 case TargetOpcode::G_FPTOUI:
6295 case TargetOpcode::G_FPTOSI_SAT:
6296 case TargetOpcode::G_FPTOUI_SAT:
6297 case TargetOpcode::G_SITOFP:
6298 case TargetOpcode::G_UITOFP: {
6299 Observer.changingInstr(MI);
6300 LLT SrcExtTy;
6301 LLT DstExtTy;
6302 if (TypeIdx == 0) {
6303 DstExtTy = MoreTy;
6304 SrcExtTy = LLT::fixed_vector(
6305 NumElements: MoreTy.getNumElements(),
6306 ScalarTy: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getElementType());
6307 } else {
6308 DstExtTy = LLT::fixed_vector(
6309 NumElements: MoreTy.getNumElements(),
6310 ScalarTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
6311 SrcExtTy = MoreTy;
6312 }
6313 moreElementsVectorSrc(MI, MoreTy: SrcExtTy, OpIdx: 1);
6314 moreElementsVectorDst(MI, WideTy: DstExtTy, OpIdx: 0);
6315 Observer.changedInstr(MI);
6316 return Legalized;
6317 }
6318 case TargetOpcode::G_ICMP:
6319 case TargetOpcode::G_FCMP: {
6320 if (TypeIdx != 1)
6321 return UnableToLegalize;
6322
6323 Observer.changingInstr(MI);
6324 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6325 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
6326 LLT CondTy = LLT::fixed_vector(
6327 NumElements: MoreTy.getNumElements(),
6328 ScalarTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
6329 moreElementsVectorDst(MI, WideTy: CondTy, OpIdx: 0);
6330 Observer.changedInstr(MI);
6331 return Legalized;
6332 }
6333 case TargetOpcode::G_BITCAST: {
6334 if (TypeIdx != 0)
6335 return UnableToLegalize;
6336
6337 LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
6338 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
6339
6340 unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
6341 if (coefficient % DstTy.getNumElements() != 0)
6342 return UnableToLegalize;
6343
6344 coefficient = coefficient / DstTy.getNumElements();
6345
6346 LLT NewTy = SrcTy.changeElementCount(
6347 EC: ElementCount::get(MinVal: coefficient, Scalable: MoreTy.isScalable()));
6348 Observer.changingInstr(MI);
6349 moreElementsVectorSrc(MI, MoreTy: NewTy, OpIdx: 1);
6350 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6351 Observer.changedInstr(MI);
6352 return Legalized;
6353 }
6354 case TargetOpcode::G_VECREDUCE_FADD:
6355 case TargetOpcode::G_VECREDUCE_FMUL:
6356 case TargetOpcode::G_VECREDUCE_ADD:
6357 case TargetOpcode::G_VECREDUCE_MUL:
6358 case TargetOpcode::G_VECREDUCE_AND:
6359 case TargetOpcode::G_VECREDUCE_OR:
6360 case TargetOpcode::G_VECREDUCE_XOR:
6361 case TargetOpcode::G_VECREDUCE_SMAX:
6362 case TargetOpcode::G_VECREDUCE_SMIN:
6363 case TargetOpcode::G_VECREDUCE_UMAX:
6364 case TargetOpcode::G_VECREDUCE_UMIN: {
6365 LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
6366 MachineOperand &MO = MI.getOperand(i: 1);
6367 auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO);
6368 auto NeutralElement = getNeutralElementForVecReduce(
6369 Opcode: MI.getOpcode(), MIRBuilder, Ty: MoreTy.getElementType());
6370
6371 LLT IdxTy(TLI.getVectorIdxLLT(DL: MIRBuilder.getDataLayout()));
6372 for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
6373 i != e; i++) {
6374 auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: i);
6375 NewVec = MIRBuilder.buildInsertVectorElement(Res: MoreTy, Val: NewVec,
6376 Elt: NeutralElement, Idx);
6377 }
6378
6379 Observer.changingInstr(MI);
6380 MO.setReg(NewVec.getReg(Idx: 0));
6381 Observer.changedInstr(MI);
6382 return Legalized;
6383 }
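  // For example (illustrative): widening the input of a <3 x s32>
  // G_VECREDUCE_ADD to <4 x s32> pads the extra lane with the neutral
  // element 0, so the wider reduction computes the same result.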
6384
6385 default:
6386 return UnableToLegalize;
6387 }
6388}
6389
6390LegalizerHelper::LegalizeResult
6391LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
6392 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6393 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
6394 unsigned MaskNumElts = Mask.size();
6395 unsigned SrcNumElts = SrcTy.getNumElements();
6396 LLT DestEltTy = DstTy.getElementType();
6397
6398 if (MaskNumElts == SrcNumElts)
6399 return Legalized;
6400
6401 if (MaskNumElts < SrcNumElts) {
6402    // Extend the mask to the new destination vector size, padding
6403    // with undef (-1) entries.
6404 SmallVector<int, 16> NewMask(SrcNumElts, -1);
6405 llvm::copy(Range&: Mask, Out: NewMask.begin());
6406
6407 moreElementsVectorDst(MI, WideTy: SrcTy, OpIdx: 0);
6408 MIRBuilder.setInstrAndDebugLoc(MI);
6409 MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
6410 Src1: MI.getOperand(i: 1).getReg(),
6411 Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
6412 MI.eraseFromParent();
6413
6414 return Legalized;
6415 }
6416
6417 unsigned PaddedMaskNumElts = alignTo(Value: MaskNumElts, Align: SrcNumElts);
6418 unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
6419 LLT PaddedTy = LLT::fixed_vector(NumElements: PaddedMaskNumElts, ScalarTy: DestEltTy);
6420
6421 // Create new source vectors by concatenating the initial
6422 // source vectors with undefined vectors of the same size.
6423 auto Undef = MIRBuilder.buildUndef(Res: SrcTy);
6424 SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(Idx: 0));
6425 SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(Idx: 0));
6426 MOps1[0] = MI.getOperand(i: 1).getReg();
6427 MOps2[0] = MI.getOperand(i: 2).getReg();
6428
6429 auto Src1 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps1);
6430 auto Src2 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps2);
6431
6432 // Readjust mask for new input vector length.
6433 SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
6434 for (unsigned I = 0; I != MaskNumElts; ++I) {
6435 int Idx = Mask[I];
6436 if (Idx >= static_cast<int>(SrcNumElts))
6437 Idx += PaddedMaskNumElts - SrcNumElts;
6438 MappedOps[I] = Idx;
6439 }
6440
6441  // If we got more elements than required, extract the subvector.
6442 if (MaskNumElts != PaddedMaskNumElts) {
6443 auto Shuffle =
6444 MIRBuilder.buildShuffleVector(Res: PaddedTy, Src1, Src2, Mask: MappedOps);
6445
6446 SmallVector<Register, 16> Elts(MaskNumElts);
6447 for (unsigned I = 0; I < MaskNumElts; ++I) {
6448 Elts[I] =
6449 MIRBuilder.buildExtractVectorElementConstant(Res: DestEltTy, Val: Shuffle, Idx: I)
6450 .getReg(Idx: 0);
6451 }
6452 MIRBuilder.buildBuildVector(Res: DstReg, Ops: Elts);
6453 } else {
6454 MIRBuilder.buildShuffleVector(Res: DstReg, Src1, Src2, Mask: MappedOps);
6455 }
6456
6457 MI.eraseFromParent();
6458 return LegalizerHelper::LegalizeResult::Legalized;
6459}
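// Worked example (illustrative): a shuffle producing <3 x s32> from <2 x s32>
// sources is padded to <4 x s32> (NumConcat = 2), second-source mask indices
// are rebased by PaddedMaskNumElts - SrcNumElts = 2 (index 2 becomes 4), and
// the three requested elements are then rebuilt with G_BUILD_VECTOR.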
6460
6461LegalizerHelper::LegalizeResult
6462LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
6463 unsigned int TypeIdx, LLT MoreTy) {
6464 auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
6465 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
6466 unsigned NumElts = DstTy.getNumElements();
6467 unsigned WidenNumElts = MoreTy.getNumElements();
6468
6469 if (DstTy.isVector() && Src1Ty.isVector() &&
6470 DstTy.getNumElements() != Src1Ty.getNumElements()) {
6471 return equalizeVectorShuffleLengths(MI);
6472 }
6473
6474 if (TypeIdx != 0)
6475 return UnableToLegalize;
6476
6477 // Expect a canonicalized shuffle.
6478 if (DstTy != Src1Ty || DstTy != Src2Ty)
6479 return UnableToLegalize;
6480
6481 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6482 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6483
6484 // Adjust mask based on new input vector length.
6485 SmallVector<int, 16> NewMask(WidenNumElts, -1);
6486 for (unsigned I = 0; I != NumElts; ++I) {
6487 int Idx = Mask[I];
6488 if (Idx < static_cast<int>(NumElts))
6489 NewMask[I] = Idx;
6490 else
6491 NewMask[I] = Idx - NumElts + WidenNumElts;
6492 }
6493 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6494 MIRBuilder.setInstrAndDebugLoc(MI);
6495 MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
6496 Src1: MI.getOperand(i: 1).getReg(),
6497 Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
6498 MI.eraseFromParent();
6499 return Legalized;
6500}
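// Mask-remap example (illustrative): widening a <2 x s32> shuffle with mask
// [1, 2] to <4 x s32> keeps first-source indices as-is (1 stays 1) and
// rebases second-source indices by WidenNumElts - NumElts (2 becomes 4),
// giving the new mask [1, 4, -1, -1].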
6501
6502void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
6503 ArrayRef<Register> Src1Regs,
6504 ArrayRef<Register> Src2Regs,
6505 LLT NarrowTy) {
6506 MachineIRBuilder &B = MIRBuilder;
6507 unsigned SrcParts = Src1Regs.size();
6508 unsigned DstParts = DstRegs.size();
6509
6510 unsigned DstIdx = 0; // Low bits of the result.
6511 Register FactorSum =
6512 B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx], Src1: Src2Regs[DstIdx]).getReg(Idx: 0);
6513 DstRegs[DstIdx] = FactorSum;
6514
6515 Register CarrySumPrevDstIdx;
6516 SmallVector<Register, 4> Factors;
6517
6518 for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
6519 // Collect low parts of muls for DstIdx.
6520 for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
6521 i <= std::min(a: DstIdx, b: SrcParts - 1); ++i) {
6522 MachineInstrBuilder Mul =
6523 B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx - i], Src1: Src2Regs[i]);
6524 Factors.push_back(Elt: Mul.getReg(Idx: 0));
6525 }
6526 // Collect high parts of muls from previous DstIdx.
6527 for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
6528 i <= std::min(a: DstIdx - 1, b: SrcParts - 1); ++i) {
6529 MachineInstrBuilder Umulh =
6530 B.buildUMulH(Dst: NarrowTy, Src0: Src1Regs[DstIdx - 1 - i], Src1: Src2Regs[i]);
6531 Factors.push_back(Elt: Umulh.getReg(Idx: 0));
6532 }
6533 // Add CarrySum from additions calculated for previous DstIdx.
6534 if (DstIdx != 1) {
6535 Factors.push_back(Elt: CarrySumPrevDstIdx);
6536 }
6537
6538 Register CarrySum;
6539 // Add all factors and accumulate all carries into CarrySum.
6540 if (DstIdx != DstParts - 1) {
6541 MachineInstrBuilder Uaddo =
6542 B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: Factors[0], Op1: Factors[1]);
6543 FactorSum = Uaddo.getReg(Idx: 0);
6544 CarrySum = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1)).getReg(Idx: 0);
6545 for (unsigned i = 2; i < Factors.size(); ++i) {
6546 MachineInstrBuilder Uaddo =
6547 B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: FactorSum, Op1: Factors[i]);
6548 FactorSum = Uaddo.getReg(Idx: 0);
6549 MachineInstrBuilder Carry = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1));
6550 CarrySum = B.buildAdd(Dst: NarrowTy, Src0: CarrySum, Src1: Carry).getReg(Idx: 0);
6551 }
6552 } else {
6553      // Since the value for the next index is not calculated, CarrySum is not needed.
6554 FactorSum = B.buildAdd(Dst: NarrowTy, Src0: Factors[0], Src1: Factors[1]).getReg(Idx: 0);
6555 for (unsigned i = 2; i < Factors.size(); ++i)
6556 FactorSum = B.buildAdd(Dst: NarrowTy, Src0: FactorSum, Src1: Factors[i]).getReg(Idx: 0);
6557 }
6558
6559 CarrySumPrevDstIdx = CarrySum;
6560 DstRegs[DstIdx] = FactorSum;
6561 Factors.clear();
6562 }
6563}
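// Schoolbook sketch (illustrative): with two parts per operand, (A1:A0) *
// (B1:B0), the loops above emit
//   Dst0 = mul(A0, B0)
//   Dst1 = umulh(A0, B0) + mul(A1, B0) + mul(A0, B1)
// using G_UADDO/G_ADD carry accumulation only when a higher part exists to
// receive the carries.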
6564
6565LegalizerHelper::LegalizeResult
6566LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
6567 LLT NarrowTy) {
6568 if (TypeIdx != 0)
6569 return UnableToLegalize;
6570
6571 Register DstReg = MI.getOperand(i: 0).getReg();
6572 LLT DstType = MRI.getType(Reg: DstReg);
6573 // FIXME: add support for vector types
6574 if (DstType.isVector())
6575 return UnableToLegalize;
6576
6577 unsigned Opcode = MI.getOpcode();
6578 unsigned OpO, OpE, OpF;
6579 switch (Opcode) {
6580 case TargetOpcode::G_SADDO:
6581 case TargetOpcode::G_SADDE:
6582 case TargetOpcode::G_UADDO:
6583 case TargetOpcode::G_UADDE:
6584 case TargetOpcode::G_ADD:
6585 OpO = TargetOpcode::G_UADDO;
6586 OpE = TargetOpcode::G_UADDE;
6587 OpF = TargetOpcode::G_UADDE;
6588 if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
6589 OpF = TargetOpcode::G_SADDE;
6590 break;
6591 case TargetOpcode::G_SSUBO:
6592 case TargetOpcode::G_SSUBE:
6593 case TargetOpcode::G_USUBO:
6594 case TargetOpcode::G_USUBE:
6595 case TargetOpcode::G_SUB:
6596 OpO = TargetOpcode::G_USUBO;
6597 OpE = TargetOpcode::G_USUBE;
6598 OpF = TargetOpcode::G_USUBE;
6599 if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
6600 OpF = TargetOpcode::G_SSUBE;
6601 break;
6602 default:
6603 llvm_unreachable("Unexpected add/sub opcode!");
6604 }
6605
6606 // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
6607 unsigned NumDefs = MI.getNumExplicitDefs();
6608 Register Src1 = MI.getOperand(i: NumDefs).getReg();
6609 Register Src2 = MI.getOperand(i: NumDefs + 1).getReg();
6610 Register CarryDst, CarryIn;
6611 if (NumDefs == 2)
6612 CarryDst = MI.getOperand(i: 1).getReg();
6613 if (MI.getNumOperands() == NumDefs + 3)
6614 CarryIn = MI.getOperand(i: NumDefs + 2).getReg();
6615
6616 LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
6617 LLT LeftoverTy, DummyTy;
6618 SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
6619 extractParts(Reg: Src1, RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: Src1Regs, LeftoverVRegs&: Src1Left,
6620 MIRBuilder, MRI);
6621 extractParts(Reg: Src2, RegTy, MainTy: NarrowTy, LeftoverTy&: DummyTy, VRegs&: Src2Regs, LeftoverVRegs&: Src2Left, MIRBuilder,
6622 MRI);
6623
6624 int NarrowParts = Src1Regs.size();
6625 Src1Regs.append(RHS: Src1Left);
6626 Src2Regs.append(RHS: Src2Left);
6627 DstRegs.reserve(N: Src1Regs.size());
6628
6629 for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
6630 Register DstReg =
6631 MRI.createGenericVirtualRegister(Ty: MRI.getType(Reg: Src1Regs[i]));
6632 Register CarryOut;
6633 // Forward the final carry-out to the destination register
6634 if (i == e - 1 && CarryDst)
6635 CarryOut = CarryDst;
6636 else
6637 CarryOut = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 1));
6638
6639 if (!CarryIn) {
6640 MIRBuilder.buildInstr(Opc: OpO, DstOps: {DstReg, CarryOut},
6641 SrcOps: {Src1Regs[i], Src2Regs[i]});
6642 } else if (i == e - 1) {
6643 MIRBuilder.buildInstr(Opc: OpF, DstOps: {DstReg, CarryOut},
6644 SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
6645 } else {
6646 MIRBuilder.buildInstr(Opc: OpE, DstOps: {DstReg, CarryOut},
6647 SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
6648 }
6649
6650 DstRegs.push_back(Elt: DstReg);
6651 CarryIn = CarryOut;
6652 }
6653 insertParts(DstReg: MI.getOperand(i: 0).getReg(), ResultTy: RegTy, PartTy: NarrowTy,
6654 PartRegs: ArrayRef(DstRegs).take_front(N: NarrowParts), LeftoverTy,
6655 LeftoverRegs: ArrayRef(DstRegs).drop_front(N: NarrowParts));
6656
6657 MI.eraseFromParent();
6658 return Legalized;
6659}
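// Illustrative expansion (assumed types, approximate MIR): a 64-bit G_ADD
// narrowed to 32 bits becomes a carry chain:
//   %lo:_(s32), %c:_(s1) = G_UADDO %a0, %b0
//   %hi:_(s32), %d:_(s1) = G_UADDE %a1, %b1, %c
//   %sum:_(s64) = G_MERGE_VALUES %lo, %hi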
6660
6661LegalizerHelper::LegalizeResult
6662LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
6663 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
6664
6665 LLT Ty = MRI.getType(Reg: DstReg);
6666 if (Ty.isVector())
6667 return UnableToLegalize;
6668
6669 unsigned Size = Ty.getSizeInBits();
6670 unsigned NarrowSize = NarrowTy.getSizeInBits();
6671 if (Size % NarrowSize != 0)
6672 return UnableToLegalize;
6673
6674 unsigned NumParts = Size / NarrowSize;
6675 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
6676 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
6677
6678 SmallVector<Register, 2> Src1Parts, Src2Parts;
6679 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
6680 extractParts(Reg: Src1, Ty: NarrowTy, NumParts, VRegs&: Src1Parts, MIRBuilder, MRI);
6681 extractParts(Reg: Src2, Ty: NarrowTy, NumParts, VRegs&: Src2Parts, MIRBuilder, MRI);
6682 multiplyRegisters(DstRegs&: DstTmpRegs, Src1Regs: Src1Parts, Src2Regs: Src2Parts, NarrowTy);
6683
6684 // Take only high half of registers if this is high mul.
6685 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
6686 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
6687 MI.eraseFromParent();
6688 return Legalized;
6689}
6690
6691LegalizerHelper::LegalizeResult
6692LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
6693 LLT NarrowTy) {
6694 if (TypeIdx != 0)
6695 return UnableToLegalize;
6696
6697 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
6698
6699 Register Src = MI.getOperand(i: 1).getReg();
6700 LLT SrcTy = MRI.getType(Reg: Src);
6701
6702 // If all finite floats fit into the narrowed integer type, we can just swap
6703 // out the result type. This is practically only useful for conversions from
6704  // half to at least 16 bits, so just handle that one case.
6705 if (SrcTy.getScalarType() != LLT::scalar(SizeInBits: 16) ||
6706 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
6707 return UnableToLegalize;
6708
6709 Observer.changingInstr(MI);
6710 narrowScalarDst(MI, NarrowTy, OpIdx: 0,
6711 ExtOpcode: IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
6712 Observer.changedInstr(MI);
6713 return Legalized;
6714}
6715
6716LegalizerHelper::LegalizeResult
6717LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
6718 LLT NarrowTy) {
6719 if (TypeIdx != 1)
6720 return UnableToLegalize;
6721
6722 uint64_t NarrowSize = NarrowTy.getSizeInBits();
6723
6724 int64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
6725 // FIXME: add support for when SizeOp1 isn't an exact multiple of
6726 // NarrowSize.
6727 if (SizeOp1 % NarrowSize != 0)
6728 return UnableToLegalize;
6729 int NumParts = SizeOp1 / NarrowSize;
6730
6731 SmallVector<Register, 2> SrcRegs, DstRegs;
6732 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
6733 MIRBuilder, MRI);
6734
6735 Register OpReg = MI.getOperand(i: 0).getReg();
6736 uint64_t OpStart = MI.getOperand(i: 2).getImm();
6737 uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
6738 for (int i = 0; i < NumParts; ++i) {
6739 unsigned SrcStart = i * NarrowSize;
6740
6741 if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
6742      // No part of the extract uses this subregister; ignore it.
6743 continue;
6744 } else if (SrcStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
6745      // The entire subregister is extracted; forward the value.
6746 DstRegs.push_back(Elt: SrcRegs[i]);
6747 continue;
6748 }
6749
6750    // Compute the offset within this source part at which the extracted range
6751    // begins (ExtractOffset) and how many bits of the part it covers (SegSize).
6752 int64_t ExtractOffset;
6753 uint64_t SegSize;
6754 if (OpStart < SrcStart) {
6755 ExtractOffset = 0;
6756 SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - SrcStart);
6757 } else {
6758 ExtractOffset = OpStart - SrcStart;
6759 SegSize = std::min(a: SrcStart + NarrowSize - OpStart, b: OpSize);
6760 }
6761
6762 Register SegReg = SrcRegs[i];
6763 if (ExtractOffset != 0 || SegSize != NarrowSize) {
6764 // A genuine extract is needed.
6765 SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
6766 MIRBuilder.buildExtract(Res: SegReg, Src: SrcRegs[i], Index: ExtractOffset);
6767 }
6768
6769 DstRegs.push_back(Elt: SegReg);
6770 }
6771
6772 Register DstReg = MI.getOperand(i: 0).getReg();
6773 if (MRI.getType(Reg: DstReg).isVector())
6774 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
6775 else if (DstRegs.size() > 1)
6776 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
6777 else
6778 MIRBuilder.buildCopy(Res: DstReg, Op: DstRegs[0]);
6779 MI.eraseFromParent();
6780 return Legalized;
6781}
6782
6783LegalizerHelper::LegalizeResult
6784LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
6785 LLT NarrowTy) {
6786 // FIXME: Don't know how to handle secondary types yet.
6787 if (TypeIdx != 0)
6788 return UnableToLegalize;
6789
6790 SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
6791 LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
6792 LLT LeftoverTy;
6793 extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: SrcRegs,
6794 LeftoverVRegs&: LeftoverRegs, MIRBuilder, MRI);
6795
6796 SrcRegs.append(RHS: LeftoverRegs);
6797
6798 uint64_t NarrowSize = NarrowTy.getSizeInBits();
6799 Register OpReg = MI.getOperand(i: 2).getReg();
6800 uint64_t OpStart = MI.getOperand(i: 3).getImm();
6801 uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
6802 for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
6803 unsigned DstStart = I * NarrowSize;
6804
6805 if (DstStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
6806      // The entire subregister is defined by this insert; forward the new
6807      // value.
6808 DstRegs.push_back(Elt: OpReg);
6809 continue;
6810 }
6811
6812 Register SrcReg = SrcRegs[I];
6813 if (MRI.getType(Reg: SrcRegs[I]) == LeftoverTy) {
6814 // The leftover reg is smaller than NarrowTy, so we need to extend it.
6815 SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
6816 MIRBuilder.buildAnyExt(Res: SrcReg, Op: SrcRegs[I]);
6817 }
6818
6819 if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
6820      // No part of the insert affects this subregister; forward the original.
6821 DstRegs.push_back(Elt: SrcReg);
6822 continue;
6823 }
6824
6825    // Compute where to extract from the inserted value (ExtractOffset), where
6826    // to place it in this part (InsertOffset), and how many bits overlap (SegSize).
6827 int64_t ExtractOffset, InsertOffset;
6828 uint64_t SegSize;
6829 if (OpStart < DstStart) {
6830 InsertOffset = 0;
6831 ExtractOffset = DstStart - OpStart;
6832 SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - DstStart);
6833 } else {
6834 InsertOffset = OpStart - DstStart;
6835 ExtractOffset = 0;
6836 SegSize =
6837 std::min(a: NarrowSize - InsertOffset, b: OpStart + OpSize - DstStart);
6838 }
6839
6840 Register SegReg = OpReg;
6841 if (ExtractOffset != 0 || SegSize != OpSize) {
6842 // A genuine extract is needed.
6843 SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
6844 MIRBuilder.buildExtract(Res: SegReg, Src: OpReg, Index: ExtractOffset);
6845 }
6846
6847 Register DstReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
6848 MIRBuilder.buildInsert(Res: DstReg, Src: SrcReg, Op: SegReg, Index: InsertOffset);
6849 DstRegs.push_back(Elt: DstReg);
6850 }
6851
6852 uint64_t WideSize = DstRegs.size() * NarrowSize;
6853 Register DstReg = MI.getOperand(i: 0).getReg();
6854 if (WideSize > RegTy.getSizeInBits()) {
6855 Register MergeReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: WideSize));
6856 MIRBuilder.buildMergeLikeInstr(Res: MergeReg, Ops: DstRegs);
6857 MIRBuilder.buildTrunc(Res: DstReg, Op: MergeReg);
6858 } else
6859 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
6860
6861 MI.eraseFromParent();
6862 return Legalized;
6863}
6864
6865LegalizerHelper::LegalizeResult
6866LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
6867 LLT NarrowTy) {
6868 Register DstReg = MI.getOperand(i: 0).getReg();
6869 LLT DstTy = MRI.getType(Reg: DstReg);
6870
6871 assert(MI.getNumOperands() == 3 && TypeIdx == 0);
6872
6873 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
6874 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
6875 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
6876 LLT LeftoverTy;
6877 if (!extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
6878 VRegs&: Src0Regs, LeftoverVRegs&: Src0LeftoverRegs, MIRBuilder, MRI))
6879 return UnableToLegalize;
6880
6881 LLT Unused;
6882 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
6883 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
6884 llvm_unreachable("inconsistent extractParts result");
6885
6886 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
6887 auto Inst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
6888 SrcOps: {Src0Regs[I], Src1Regs[I]});
6889 DstRegs.push_back(Elt: Inst.getReg(Idx: 0));
6890 }
6891
6892 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
6893 auto Inst = MIRBuilder.buildInstr(
6894 Opc: MI.getOpcode(),
6895 DstOps: {LeftoverTy}, SrcOps: {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
6896 DstLeftoverRegs.push_back(Elt: Inst.getReg(Idx: 0));
6897 }
6898
6899 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
6900 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
6901
6902 MI.eraseFromParent();
6903 return Legalized;
6904}
6905
6906LegalizerHelper::LegalizeResult
6907LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
6908 LLT NarrowTy) {
6909 if (TypeIdx != 0)
6910 return UnableToLegalize;
6911
6912 auto [DstReg, SrcReg] = MI.getFirst2Regs();
6913
6914 LLT DstTy = MRI.getType(Reg: DstReg);
6915 if (DstTy.isVector())
6916 return UnableToLegalize;
6917
6918 SmallVector<Register, 8> Parts;
6919 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
6920 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, VRegs&: Parts, PadStrategy: MI.getOpcode());
6921 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: Parts);
6922
6923 MI.eraseFromParent();
6924 return Legalized;
6925}
6926
6927LegalizerHelper::LegalizeResult
6928LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
6929 LLT NarrowTy) {
6930 if (TypeIdx != 0)
6931 return UnableToLegalize;
6932
6933 Register CondReg = MI.getOperand(i: 1).getReg();
6934 LLT CondTy = MRI.getType(Reg: CondReg);
6935 if (CondTy.isVector()) // TODO: Handle vselect
6936 return UnableToLegalize;
6937
6938 Register DstReg = MI.getOperand(i: 0).getReg();
6939 LLT DstTy = MRI.getType(Reg: DstReg);
6940
6941 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
6942 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
6943 SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
6944 LLT LeftoverTy;
6945 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
6946 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
6947 return UnableToLegalize;
6948
6949 LLT Unused;
6950 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
6951 VRegs&: Src2Regs, LeftoverVRegs&: Src2LeftoverRegs, MIRBuilder, MRI))
6952 llvm_unreachable("inconsistent extractParts result");
6953
6954 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
6955 auto Select = MIRBuilder.buildSelect(Res: NarrowTy,
6956 Tst: CondReg, Op0: Src1Regs[I], Op1: Src2Regs[I]);
6957 DstRegs.push_back(Elt: Select.getReg(Idx: 0));
6958 }
6959
6960 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
6961 auto Select = MIRBuilder.buildSelect(
6962 Res: LeftoverTy, Tst: CondReg, Op0: Src1LeftoverRegs[I], Op1: Src2LeftoverRegs[I]);
6963 DstLeftoverRegs.push_back(Elt: Select.getReg(Idx: 0));
6964 }
6965
6966 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
6967 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
6968
6969 MI.eraseFromParent();
6970 return Legalized;
6971}
6972
6973LegalizerHelper::LegalizeResult
6974LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
6975 LLT NarrowTy) {
6976 if (TypeIdx != 1)
6977 return UnableToLegalize;
6978
6979 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6980 unsigned NarrowSize = NarrowTy.getSizeInBits();
6981
6982 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6983 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
6984
6985 MachineIRBuilder &B = MIRBuilder;
6986 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
6987 // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
6988 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
6989 auto HiIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
6990 Op0: UnmergeSrc.getReg(Idx: 1), Op1: C_0);
6991 auto LoCTLZ = IsUndef ?
6992 B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0)) :
6993 B.buildCTLZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
6994 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
6995 auto HiIsZeroCTLZ = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSize);
6996 auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
6997 B.buildSelect(Res: DstReg, Tst: HiIsZero, Op0: HiIsZeroCTLZ, Op1: HiCTLZ);
6998
6999 MI.eraseFromParent();
7000 return Legalized;
7001 }
7002
7003 return UnableToLegalize;
7004}
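// Note on the select above (illustrative): when Hi == 0 the result is
// NarrowSize + ctlz(Lo), so the plain (or ZERO_UNDEF) CTLZ on Lo is taken;
// on the other path Hi is known non-zero, which is why CTLZ_ZERO_UNDEF on Hi
// is always safe there.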
7005
7006LegalizerHelper::LegalizeResult
7007LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
7008 LLT NarrowTy) {
7009 if (TypeIdx != 1)
7010 return UnableToLegalize;
7011
7012 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7013 unsigned NarrowSize = NarrowTy.getSizeInBits();
7014
7015 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7016 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
7017
7018 MachineIRBuilder &B = MIRBuilder;
7019 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7020 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
7021 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
7022 auto LoIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
7023 Op0: UnmergeSrc.getReg(Idx: 0), Op1: C_0);
7024 auto HiCTTZ = IsUndef ?
7025 B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1)) :
7026 B.buildCTTZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7027 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
7028 auto LoIsZeroCTTZ = B.buildAdd(Dst: DstTy, Src0: HiCTTZ, Src1: C_NarrowSize);
7029 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7030 B.buildSelect(Res: DstReg, Tst: LoIsZero, Op0: LoIsZeroCTTZ, Op1: LoCTTZ);
7031
7032 MI.eraseFromParent();
7033 return Legalized;
7034 }
7035
7036 return UnableToLegalize;
7037}
7038
7039LegalizerHelper::LegalizeResult
7040LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
7041 LLT NarrowTy) {
7042 if (TypeIdx != 1)
7043 return UnableToLegalize;
7044
7045 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7046 unsigned NarrowSize = NarrowTy.getSizeInBits();
7047
7048 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7049 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
7050
7051 auto LoCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7052 auto HiCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7053 MIRBuilder.buildAdd(Dst: DstReg, Src0: HiCTPOP, Src1: LoCTPOP);
7054
7055 MI.eraseFromParent();
7056 return Legalized;
7057 }
7058
7059 return UnableToLegalize;
7060}
7061
7062LegalizerHelper::LegalizeResult
7063LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
7064 LLT NarrowTy) {
7065 if (TypeIdx != 1)
7066 return UnableToLegalize;
7067
7068 MachineIRBuilder &B = MIRBuilder;
7069 Register ExpReg = MI.getOperand(i: 2).getReg();
7070 LLT ExpTy = MRI.getType(Reg: ExpReg);
7071
7072 unsigned ClampSize = NarrowTy.getScalarSizeInBits();
7073
7074 // Clamp the exponent to the range of the target type.
7075 auto MinExp = B.buildConstant(Res: ExpTy, Val: minIntN(N: ClampSize));
7076 auto ClampMin = B.buildSMax(Dst: ExpTy, Src0: ExpReg, Src1: MinExp);
7077 auto MaxExp = B.buildConstant(Res: ExpTy, Val: maxIntN(N: ClampSize));
7078 auto Clamp = B.buildSMin(Dst: ExpTy, Src0: ClampMin, Src1: MaxExp);
7079
7080 auto Trunc = B.buildTrunc(Res: NarrowTy, Op: Clamp);
7081 Observer.changingInstr(MI);
7082 MI.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
7083 Observer.changedInstr(MI);
7084 return Legalized;
7085}
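// Example (assumed types): narrowing the exponent operand of G_FLDEXP from
// s32 to s16 clamps it to [-32768, 32767] before truncating; exponents beyond
// that range already saturate the result to 0 or infinity, so the clamp
// preserves the semantics.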
7086
7087LegalizerHelper::LegalizeResult
7088LegalizerHelper::lowerBitCount(MachineInstr &MI) {
7089 unsigned Opc = MI.getOpcode();
7090 const auto &TII = MIRBuilder.getTII();
7091 auto isSupported = [this](const LegalityQuery &Q) {
7092 auto QAction = LI.getAction(Query: Q).Action;
7093 return QAction == Legal || QAction == Libcall || QAction == Custom;
7094 };
7095 switch (Opc) {
7096 default:
7097 return UnableToLegalize;
7098 case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
7099 // This trivially expands to CTLZ.
7100 Observer.changingInstr(MI);
7101 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTLZ));
7102 Observer.changedInstr(MI);
7103 return Legalized;
7104 }
7105 case TargetOpcode::G_CTLZ: {
7106 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7107 unsigned Len = SrcTy.getSizeInBits();
7108
7109 if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
7110 // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
7111 auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
7112 auto ZeroSrc = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
7113 auto ICmp = MIRBuilder.buildICmp(
7114 Pred: CmpInst::ICMP_EQ, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: ZeroSrc);
7115 auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
7116 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CtlzZU);
7117 MI.eraseFromParent();
7118 return Legalized;
7119 }
7120    // For now, we do this:
7121    // NewLen = NextPowerOf2(Len);
7122    // x = x | (x >> 1);
7123    // x = x | (x >> 2);
7124    // ...
7125    // x = x | (x >> 16);
7126    // x = x | (x >> 32); // for 64-bit input
7127    // with shift amounts up to NewLen/2, then
7128    // return Len - popcount(x);
7129    //
7130    // Ref: "Hacker's Delight" by Henry Warren
7131 Register Op = SrcReg;
7132 unsigned NewLen = PowerOf2Ceil(A: Len);
7133 for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
7134 auto MIBShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << i);
7135 auto MIBOp = MIRBuilder.buildOr(
7136 Dst: SrcTy, Src0: Op, Src1: MIRBuilder.buildLShr(Dst: SrcTy, Src0: Op, Src1: MIBShiftAmt));
7137 Op = MIBOp.getReg(Idx: 0);
7138 }
7139 auto MIBPop = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: Op);
7140 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIRBuilder.buildConstant(Res: DstTy, Val: Len),
7141 Src1: MIBPop);
7142 MI.eraseFromParent();
7143 return Legalized;
7144 }
7145 case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
7146 // This trivially expands to CTTZ.
7147 Observer.changingInstr(MI);
7148 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTTZ));
7149 Observer.changedInstr(MI);
7150 return Legalized;
7151 }
7152 case TargetOpcode::G_CTTZ: {
7153 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7154
7155 unsigned Len = SrcTy.getSizeInBits();
7156 if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
7157 // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
7158 // zero.
7159 auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
7160 auto Zero = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
7161 auto ICmp = MIRBuilder.buildICmp(
7162 Pred: CmpInst::ICMP_EQ, Res: DstTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: Zero);
7163 auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
7164 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CttzZU);
7165 MI.eraseFromParent();
7166 return Legalized;
7167 }
7168    // For now, we use: { return popcount(~x & (x - 1)); }
7169    // unless the target has ctlz but not ctpop, in which case we use:
7170    // { return Len - ctlz(~x & (x - 1)); }
7171    // Ref: "Hacker's Delight" by Henry Warren
7172 auto MIBCstNeg1 = MIRBuilder.buildConstant(Res: SrcTy, Val: -1);
7173 auto MIBNot = MIRBuilder.buildXor(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1);
7174 auto MIBTmp = MIRBuilder.buildAnd(
7175 Dst: SrcTy, Src0: MIBNot, Src1: MIRBuilder.buildAdd(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1));
7176 if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
7177 isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
7178 auto MIBCstLen = MIRBuilder.buildConstant(Res: SrcTy, Val: Len);
7179 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIBCstLen,
7180 Src1: MIRBuilder.buildCTLZ(Dst: SrcTy, Src0: MIBTmp));
7181 MI.eraseFromParent();
7182 return Legalized;
7183 }
7184 Observer.changingInstr(MI);
7185 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTPOP));
7186 MI.getOperand(i: 1).setReg(MIBTmp.getReg(Idx: 0));
7187 Observer.changedInstr(MI);
7188 return Legalized;
7189 }
7190 case TargetOpcode::G_CTPOP: {
7191 Register SrcReg = MI.getOperand(i: 1).getReg();
7192 LLT Ty = MRI.getType(Reg: SrcReg);
7193 unsigned Size = Ty.getSizeInBits();
7194 MachineIRBuilder &B = MIRBuilder;
7195
7196    // Count set bits in blocks of 2 bits. The default approach would be
7197    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
7198    // We use the following formula instead:
7199    // B2Count = val - { (val >> 1) & 0x55555555 }
7200    // since it gives the same result in blocks of 2 with one instruction less.
7201 auto C_1 = B.buildConstant(Res: Ty, Val: 1);
7202 auto B2Set1LoTo1Hi = B.buildLShr(Dst: Ty, Src0: SrcReg, Src1: C_1);
7203 APInt B2Mask1HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x55));
7204 auto C_B2Mask1HiTo0 = B.buildConstant(Res: Ty, Val: B2Mask1HiTo0);
7205 auto B2Count1Hi = B.buildAnd(Dst: Ty, Src0: B2Set1LoTo1Hi, Src1: C_B2Mask1HiTo0);
7206 auto B2Count = B.buildSub(Dst: Ty, Src0: SrcReg, Src1: B2Count1Hi);
7207
7208    // To get the count in blocks of 4, add the values from adjacent blocks of 2.
7209 // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
7210 auto C_2 = B.buildConstant(Res: Ty, Val: 2);
7211 auto B4Set2LoTo2Hi = B.buildLShr(Dst: Ty, Src0: B2Count, Src1: C_2);
7212 APInt B4Mask2HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x33));
7213 auto C_B4Mask2HiTo0 = B.buildConstant(Res: Ty, Val: B4Mask2HiTo0);
7214 auto B4HiB2Count = B.buildAnd(Dst: Ty, Src0: B4Set2LoTo2Hi, Src1: C_B4Mask2HiTo0);
7215 auto B4LoB2Count = B.buildAnd(Dst: Ty, Src0: B2Count, Src1: C_B4Mask2HiTo0);
7216 auto B4Count = B.buildAdd(Dst: Ty, Src0: B4HiB2Count, Src1: B4LoB2Count);
7217
7218    // For the count in blocks of 8 bits we don't have to mask the high 4 bits
7219    // before the addition, since each count sits in the range {0,...,8} and 4
7220    // bits are enough to hold such values. After the addition the high 4 bits
7221    // still hold a per-nibble count; set them to zero to get the 8-bit result.
7222 // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
7223 auto C_4 = B.buildConstant(Res: Ty, Val: 4);
7224 auto B8HiB4Count = B.buildLShr(Dst: Ty, Src0: B4Count, Src1: C_4);
7225 auto B8CountDirty4Hi = B.buildAdd(Dst: Ty, Src0: B8HiB4Count, Src1: B4Count);
7226 APInt B8Mask4HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x0F));
7227 auto C_B8Mask4HiTo0 = B.buildConstant(Res: Ty, Val: B8Mask4HiTo0);
7228 auto B8Count = B.buildAnd(Dst: Ty, Src0: B8CountDirty4Hi, Src1: C_B8Mask4HiTo0);
7229
7230    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
7231    // 8 bits can hold the CTPOP result of a 128-bit int or smaller. Multiplying
7232    // by this bitmask sets the 8 MSBs of ResTmp to the sum of all 8-bit block counts.
7233 auto MulMask = B.buildConstant(Res: Ty, Val: APInt::getSplat(NewLen: Size, V: APInt(8, 0x01)));
7234
7235 // Shift count result from 8 high bits to low bits.
7236 auto C_SizeM8 = B.buildConstant(Res: Ty, Val: Size - 8);
7237
7238 auto IsMulSupported = [this](const LLT Ty) {
7239 auto Action = LI.getAction(Query: {TargetOpcode::G_MUL, {Ty}}).Action;
7240 return Action == Legal || Action == WidenScalar || Action == Custom;
7241 };
7242 if (IsMulSupported(Ty)) {
7243 auto ResTmp = B.buildMul(Dst: Ty, Src0: B8Count, Src1: MulMask);
7244 B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
7245 } else {
7246 auto ResTmp = B8Count;
7247 for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
7248 auto ShiftC = B.buildConstant(Res: Ty, Val: Shift);
7249 auto Shl = B.buildShl(Dst: Ty, Src0: ResTmp, Src1: ShiftC);
7250 ResTmp = B.buildAdd(Dst: Ty, Src0: ResTmp, Src1: Shl);
7251 }
7252 B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
7253 }
7254 MI.eraseFromParent();
7255 return Legalized;
7256 }
7257 }
7258}
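// Worked example of the G_CTPOP lowering above (illustrative): for the 8-bit
// value x = 0b01101101 (five bits set),
//   B2Count = x - ((x >> 1) & 0x55)                      = 0b01011001
//   B4Count = (B2Count & 0x33) + ((B2Count >> 2) & 0x33) = 0b00100011
//   B8Count = (B4Count + (B4Count >> 4)) & 0x0F          = 0b00000101 = 5
// For Size == 8 the final multiply (or shift/add) and shift stage is a no-op.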
7259
7260// Check that (every element of) Reg is undef or not an exact multiple of BW.
7261static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7262 Register Reg, unsigned BW) {
7263 return matchUnaryPredicate(
7264 MRI, Reg,
7265 Match: [=](const Constant *C) {
7266 // Null constant here means an undef.
7267 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Val: C);
7268 return !CI || CI->getValue().urem(RHS: BW) != 0;
7269 },
7270 /*AllowUndefs*/ true);
7271}
7272
7273LegalizerHelper::LegalizeResult
7274LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7275 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7276 LLT Ty = MRI.getType(Reg: Dst);
7277 LLT ShTy = MRI.getType(Reg: Z);
7278
7279 unsigned BW = Ty.getScalarSizeInBits();
7280
7281 if (!isPowerOf2_32(Value: BW))
7282 return UnableToLegalize;
7283
7284 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7285 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7286
7287 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7288 // fshl X, Y, Z -> fshr X, Y, -Z
7289 // fshr X, Y, Z -> fshl X, Y, -Z
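// E.g. with BW == 8: fshl X, Y, 3 == (X << 3) | (Y >> 5) == fshr X, Y, 5,
// and -3 is congruent to 5 (mod 8), so negating the amount suffices.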
7290 auto Zero = MIRBuilder.buildConstant(Res: ShTy, Val: 0);
7291 Z = MIRBuilder.buildSub(Dst: ShTy, Src0: Zero, Src1: Z).getReg(Idx: 0);
7292 } else {
7293 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7294 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
7295 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7296 if (IsFSHL) {
7297 Y = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7298 X = MIRBuilder.buildLShr(Dst: Ty, Src0: X, Src1: One).getReg(Idx: 0);
7299 } else {
7300 X = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7301 Y = MIRBuilder.buildShl(Dst: Ty, Src0: Y, Src1: One).getReg(Idx: 0);
7302 }
7303
7304 Z = MIRBuilder.buildNot(Dst: ShTy, Src0: Z).getReg(Idx: 0);
7305 }
7306
7307 MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Dst}, SrcOps: {X, Y, Z});
7308 MI.eraseFromParent();
7309 return Legalized;
7310}
7311
7312LegalizerHelper::LegalizeResult
7313LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
7314 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7315 LLT Ty = MRI.getType(Reg: Dst);
7316 LLT ShTy = MRI.getType(Reg: Z);
7317
7318 const unsigned BW = Ty.getScalarSizeInBits();
7319 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7320
7321 Register ShX, ShY;
7322 Register ShAmt, InvShAmt;
7323
7324 // FIXME: Emit optimized urem by constant instead of letting it expand later.
7325 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7326 // fshl: X << C | Y >> (BW - C)
7327 // fshr: X << (BW - C) | Y >> C
7328 // where C = Z % BW is not zero
7329 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
7330 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
7331 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: BitWidthC, Src1: ShAmt).getReg(Idx: 0);
7332 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: IsFSHL ? ShAmt : InvShAmt).getReg(Idx: 0);
7333 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: IsFSHL ? InvShAmt : ShAmt).getReg(Idx: 0);
7334 } else {
7335 // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
7336 // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
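// Splitting the second shift in two keeps each individual amount within
// [0, BW - 1] even when Z % BW == 0, where the single-shift form would
// shift by BW and be undefined.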
7337 auto Mask = MIRBuilder.buildConstant(Res: ShTy, Val: BW - 1);
7338 if (isPowerOf2_32(Value: BW)) {
7339 // Z % BW -> Z & (BW - 1)
7340 ShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: Z, Src1: Mask).getReg(Idx: 0);
7341 // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
7342 auto NotZ = MIRBuilder.buildNot(Dst: ShTy, Src0: Z);
7343 InvShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: NotZ, Src1: Mask).getReg(Idx: 0);
7344 } else {
7345 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
7346 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
7347 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: Mask, Src1: ShAmt).getReg(Idx: 0);
7348 }
7349
7350 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7351 if (IsFSHL) {
7352 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: ShAmt).getReg(Idx: 0);
7353 auto ShY1 = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: One);
7354 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: ShY1, Src1: InvShAmt).getReg(Idx: 0);
7355 } else {
7356 auto ShX1 = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: One);
7357 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: ShX1, Src1: InvShAmt).getReg(Idx: 0);
7358 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: ShAmt).getReg(Idx: 0);
7359 }
7360 }
7361
7362 MIRBuilder.buildOr(Dst, Src0: ShX, Src1: ShY, Flags: MachineInstr::Disjoint);
7363 MI.eraseFromParent();
7364 return Legalized;
7365}
7366
7367LegalizerHelper::LegalizeResult
7368LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
7369 // These operations approximately do the following (while avoiding undefined
7370 // shifts by BW):
7371 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
7372 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
7373 Register Dst = MI.getOperand(i: 0).getReg();
7374 LLT Ty = MRI.getType(Reg: Dst);
7375 LLT ShTy = MRI.getType(Reg: MI.getOperand(i: 3).getReg());
7376
7377 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7378 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7379
7380 // TODO: Use smarter heuristic that accounts for vector legalization.
7381 if (LI.getAction(Query: {RevOpcode, {Ty, ShTy}}).Action == Lower)
7382 return lowerFunnelShiftAsShifts(MI);
7383
7384 // This only works for powers of 2; fall back to shifts if it fails.
7385 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
7386 if (Result == UnableToLegalize)
7387 return lowerFunnelShiftAsShifts(MI);
7388 return Result;
7389}
7390
7391LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
7392 auto [Dst, Src] = MI.getFirst2Regs();
7393 LLT DstTy = MRI.getType(Reg: Dst);
7394 LLT SrcTy = MRI.getType(Reg: Src);
7395
7396 uint32_t DstTySize = DstTy.getSizeInBits();
7397 uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
7398 uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
7399
7400 if (!isPowerOf2_32(Value: DstTySize) || !isPowerOf2_32(Value: DstTyScalarSize) ||
7401 !isPowerOf2_32(Value: SrcTyScalarSize))
7402 return UnableToLegalize;
7403
7404 // The step between the extends is too large; split it by creating an
7405 // intermediate extend instruction.
7406 if (SrcTyScalarSize * 2 < DstTyScalarSize) {
7407 LLT MidTy = SrcTy.changeElementSize(NewEltSize: SrcTyScalarSize * 2);
7408 // If the destination type is illegal, split it into multiple statements
7409 // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
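// E.g. G_ZEXT %x(<4 x s8>) to <4 x s32> becomes: zext to <4 x s16>, unmerge
// into two <2 x s16> halves, zext each half to <2 x s32>, and merge the two
// halves back into the <4 x s32> result.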
7410 auto NewExt = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {MidTy}, SrcOps: {Src});
7411 // Unmerge the vector
7412 LLT EltTy = MidTy.changeElementCount(
7413 EC: MidTy.getElementCount().divideCoefficientBy(RHS: 2));
7414 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: NewExt);
7415
7416 // Extend the halves using the original extend opcode
7417 LLT ZExtResTy = DstTy.changeElementCount(
7418 EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
7419 auto ZExtRes1 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
7420 SrcOps: {UnmergeSrc.getReg(Idx: 0)});
7421 auto ZExtRes2 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
7422 SrcOps: {UnmergeSrc.getReg(Idx: 1)});
7423
7424 // Merge the extended halves into the destination vector
7425 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: {ZExtRes1, ZExtRes2});
7426
7427 MI.eraseFromParent();
7428 return Legalized;
7429 }
7430 return UnableToLegalize;
7431}
7432
7433LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
7435 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
7436 // Similar to how operand splitting is done in SelectionDAG, we can handle
7437 // %res(<8 x s8>) = G_TRUNC %in(<8 x s32>) by generating:
7438 // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
7439 // %lo16(<4 x s16>) = G_TRUNC %inlo
7440 // %hi16(<4 x s16>) = G_TRUNC %inhi
7441 // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
7442 // %res(<8 x s8>) = G_TRUNC %in16
7443
7444 assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
7445
7446 Register DstReg = MI.getOperand(i: 0).getReg();
7447 Register SrcReg = MI.getOperand(i: 1).getReg();
7448 LLT DstTy = MRI.getType(Reg: DstReg);
7449 LLT SrcTy = MRI.getType(Reg: SrcReg);
7450
7451 if (DstTy.isVector() && isPowerOf2_32(Value: DstTy.getNumElements()) &&
7452 isPowerOf2_32(Value: DstTy.getScalarSizeInBits()) &&
7453 isPowerOf2_32(Value: SrcTy.getNumElements()) &&
7454 isPowerOf2_32(Value: SrcTy.getScalarSizeInBits())) {
7455 // Split input type.
7456 LLT SplitSrcTy = SrcTy.changeElementCount(
7457 EC: SrcTy.getElementCount().divideCoefficientBy(RHS: 2));
7458
7459 // First, split the source into two smaller vectors.
7460 SmallVector<Register, 2> SplitSrcs;
7461 extractParts(Reg: SrcReg, Ty: SplitSrcTy, NumParts: 2, VRegs&: SplitSrcs, MIRBuilder, MRI);
7462
7463 // Truncate the splits into intermediate narrower elements.
7464 LLT InterTy;
7465 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
7466 InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits() * 2);
7467 else
7468 InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits());
7469 for (Register &Src : SplitSrcs)
7470 Src = MIRBuilder.buildTrunc(Res: InterTy, Op: Src).getReg(Idx: 0);
7471
7472 // Combine the new truncates into one vector
7473 auto Merge = MIRBuilder.buildMergeLikeInstr(
7474 Res: DstTy.changeElementSize(NewEltSize: InterTy.getScalarSizeInBits()), Ops: SplitSrcs);
7475
7476 // Truncate the new vector to the final result type
7477 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
7478 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
7479 else
7480 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
7481
7482 MI.eraseFromParent();
7483
7484 return Legalized;
7485 }
7486 return UnableToLegalize;
7487}
7488
7489LegalizerHelper::LegalizeResult
7490LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
7491 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
7492 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
7493 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
7494 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
7495 auto Neg = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
7496 MIRBuilder.buildInstr(Opc: RevRot, DstOps: {Dst}, SrcOps: {Src, Neg});
7497 MI.eraseFromParent();
7498 return Legalized;
7499}
7500
7501LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
7502 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
7503
7504 unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
7505 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
7506
7507 MIRBuilder.setInstrAndDebugLoc(MI);
7508
7509 // If a rotate in the other direction is supported, use it.
7510 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
7511 if (LI.isLegalOrCustom(Query: {RevRot, {DstTy, SrcTy}}) &&
7512 isPowerOf2_32(Value: EltSizeInBits))
7513 return lowerRotateWithReverseRotate(MI);
7514
7515 // If a funnel shift is supported, use it.
7516 unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
7517 unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
7518 bool IsFShLegal = false;
7519 if ((IsFShLegal = LI.isLegalOrCustom(Query: {FShOpc, {DstTy, AmtTy}})) ||
7520 LI.isLegalOrCustom(Query: {RevFsh, {DstTy, AmtTy}})) {
7521 auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
7522 Register R3) {
7523 MIRBuilder.buildInstr(Opc, DstOps: {R1}, SrcOps: {R2, R2, R3});
7524 MI.eraseFromParent();
7525 return Legalized;
7526 };
7527 // Prefer a funnel shift in the same direction; otherwise negate the amount.
7528 if (IsFShLegal) {
7529 return buildFunnelShift(FShOpc, Dst, Src, Amt);
7530 } else if (isPowerOf2_32(Value: EltSizeInBits)) {
7531 Amt = MIRBuilder.buildNeg(Dst: AmtTy, Src0: Amt).getReg(Idx: 0);
7532 return buildFunnelShift(RevFsh, Dst, Src, Amt);
7533 }
7534 }
7535
7536 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
7537 unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
7538 unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
7539 auto BitWidthMinusOneC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits - 1);
7540 Register ShVal;
7541 Register RevShiftVal;
7542 if (isPowerOf2_32(Value: EltSizeInBits)) {
7543 // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
7544 // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
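// E.g. (rotl i32 x, 5) becomes (x << (5 & 31)) | (x >> (-5 & 31)), i.e.
// (x << 5) | (x >> 27), with both shift amounts reduced modulo 32.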
7545 auto NegAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
7546 auto ShAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: Amt, Src1: BitWidthMinusOneC);
7547 ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
7548 auto RevAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: NegAmt, Src1: BitWidthMinusOneC);
7549 RevShiftVal =
7550 MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, RevAmt}).getReg(Idx: 0);
7551 } else {
7552 // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
7553 // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
7554 auto BitWidthC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits);
7555 auto ShAmt = MIRBuilder.buildURem(Dst: AmtTy, Src0: Amt, Src1: BitWidthC);
7556 ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
7557 auto RevAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: BitWidthMinusOneC, Src1: ShAmt);
7558 auto One = MIRBuilder.buildConstant(Res: AmtTy, Val: 1);
7559 auto Inner = MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, One});
7560 RevShiftVal =
7561 MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Inner, RevAmt}).getReg(Idx: 0);
7562 }
7563 MIRBuilder.buildOr(Dst, Src0: ShVal, Src1: RevShiftVal);
7564 MI.eraseFromParent();
7565 return Legalized;
7566}
7567
7568// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
7569// representation.
7570LegalizerHelper::LegalizeResult
7571LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
7572 auto [Dst, Src] = MI.getFirst2Regs();
7573 const LLT S64 = LLT::scalar(SizeInBits: 64);
7574 const LLT S32 = LLT::scalar(SizeInBits: 32);
7575 const LLT S1 = LLT::scalar(SizeInBits: 1);
7576
7577 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
7578
7579 // unsigned cul2f(ulong u) {
7580 // uint lz = clz(u);
7581 // uint e = (u != 0) ? 127U + 63U - lz : 0;
7582 // u = (u << lz) & 0x7fffffffffffffffUL;
7583 // ulong t = u & 0xffffffffffUL;
7584 // uint v = (e << 23) | (uint)(u >> 40);
7585 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
7586 // return as_float(v + r);
7587 // }
7588
7589 auto Zero32 = MIRBuilder.buildConstant(Res: S32, Val: 0);
7590 auto Zero64 = MIRBuilder.buildConstant(Res: S64, Val: 0);
7591
7592 auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: S32, Src0: Src);
7593
7594 auto K = MIRBuilder.buildConstant(Res: S32, Val: 127U + 63U);
7595 auto Sub = MIRBuilder.buildSub(Dst: S32, Src0: K, Src1: LZ);
7596
7597 auto NotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: Src, Op1: Zero64);
7598 auto E = MIRBuilder.buildSelect(Res: S32, Tst: NotZero, Op0: Sub, Op1: Zero32);
7599
7600 auto Mask0 = MIRBuilder.buildConstant(Res: S64, Val: (-1ULL) >> 1);
7601 auto ShlLZ = MIRBuilder.buildShl(Dst: S64, Src0: Src, Src1: LZ);
7602
7603 auto U = MIRBuilder.buildAnd(Dst: S64, Src0: ShlLZ, Src1: Mask0);
7604
7605 auto Mask1 = MIRBuilder.buildConstant(Res: S64, Val: 0xffffffffffULL);
7606 auto T = MIRBuilder.buildAnd(Dst: S64, Src0: U, Src1: Mask1);
7607
7608 auto UShr = MIRBuilder.buildLShr(Dst: S64, Src0: U, Src1: MIRBuilder.buildConstant(Res: S64, Val: 40));
7609 auto ShlE = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 23));
7610 auto V = MIRBuilder.buildOr(Dst: S32, Src0: ShlE, Src1: MIRBuilder.buildTrunc(Res: S32, Op: UShr));
7611
7612 auto C = MIRBuilder.buildConstant(Res: S64, Val: 0x8000000000ULL);
7613 auto RCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: S1, Op0: T, Op1: C);
7614 auto TCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: T, Op1: C);
7615 auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
7616
7617 auto VTrunc1 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: One);
7618 auto Select0 = MIRBuilder.buildSelect(Res: S32, Tst: TCmp, Op0: VTrunc1, Op1: Zero32);
7619 auto R = MIRBuilder.buildSelect(Res: S32, Tst: RCmp, Op0: One, Op1: Select0);
7620 MIRBuilder.buildAdd(Dst, Src0: V, Src1: R);
7621
7622 MI.eraseFromParent();
7623 return Legalized;
7624}
7625
7626// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
7627// operations and G_SITOFP
7628LegalizerHelper::LegalizeResult
7629LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
7630 auto [Dst, Src] = MI.getFirst2Regs();
7631 const LLT S64 = LLT::scalar(SizeInBits: 64);
7632 const LLT S32 = LLT::scalar(SizeInBits: 32);
7633 const LLT S1 = LLT::scalar(SizeInBits: 1);
7634
7635 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
7636
7637 // For i64 values not exceeding INT64_MAX we simply reuse SITOFP.
7638 // Otherwise, divide the i64 by 2, round the result by ORing in the lowest
7639 // bit saved before the division, convert to float by SITOFP, and multiply
7640 // the result by 2.
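// E.g. for Src = 2^63 + 1: Halved = 2^62, LowerBit = 1, and converting
// RoundedHalved = 2^62 + 1 then doubling yields 2^63, the correctly rounded
// f32 result. ORing in the shifted-out bit ("round to odd") avoids a
// double-rounding error.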
7641 auto One = MIRBuilder.buildConstant(Res: S64, Val: 1);
7642 auto Zero = MIRBuilder.buildConstant(Res: S64, Val: 0);
7643 // Result if Src <= INT64_MAX
7644 auto SmallResult = MIRBuilder.buildSITOFP(Dst: S32, Src0: Src);
7645 // Result if Src > INT64_MAX
7646 auto Halved = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: One);
7647 auto LowerBit = MIRBuilder.buildAnd(Dst: S64, Src0: Src, Src1: One);
7648 auto RoundedHalved = MIRBuilder.buildOr(Dst: S64, Src0: Halved, Src1: LowerBit);
7649 auto HalvedFP = MIRBuilder.buildSITOFP(Dst: S32, Src0: RoundedHalved);
7650 auto LargeResult = MIRBuilder.buildFAdd(Dst: S32, Src0: HalvedFP, Src1: HalvedFP);
7651 // Check whether the original value is larger than INT64_MAX by comparing
7652 // it with zero as a signed value, to pick one of the two conversions.
7653 auto IsLarge =
7654 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_SLT, Res: S1, Op0: Src, Op1: Zero);
7655 MIRBuilder.buildSelect(Res: Dst, Tst: IsLarge, Op0: LargeResult, Op1: SmallResult);
7656
7657 MI.eraseFromParent();
7658 return Legalized;
7659}
7660
7661// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
7662// IEEE double representation.
7663LegalizerHelper::LegalizeResult
7664LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
7665 auto [Dst, Src] = MI.getFirst2Regs();
7666 const LLT S64 = LLT::scalar(SizeInBits: 64);
7667 const LLT S32 = LLT::scalar(SizeInBits: 32);
7668
7669 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
7670
7671 // We create the double value from two 32-bit parts whose exponents differ
7672 // by 32. Note that + and - are float operations that adjust the implicit
7673 // leading one; the bases 2^52 and 2^84 are for illustrative purposes.
7674 //
7675 // X = 2^52 * 1.0...LowBits
7676 // Y = 2^84 * 1.0...HighBits
7677 // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
7678 // = - 2^52 * 1.0...HighBits
7679 // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
7680 auto TwoP52 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4330000000000000));
7681 auto TwoP84 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4530000000000000));
7682 auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
7683 auto TwoP52P84FP = MIRBuilder.buildFConstant(Res: S64, Val: TwoP52P84);
7684 auto HalfWidth = MIRBuilder.buildConstant(Res: S64, Val: 32);
7685
7686 auto LowBits = MIRBuilder.buildTrunc(Res: S32, Op: Src);
7687 LowBits = MIRBuilder.buildZExt(Res: S64, Op: LowBits);
7688 auto LowBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP52, Src1: LowBits);
7689 auto HighBits = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: HalfWidth);
7690 auto HighBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP84, Src1: HighBits);
7691 auto Scratch = MIRBuilder.buildFSub(Dst: S64, Src0: HighBitsFP, Src1: TwoP52P84FP);
7692 MIRBuilder.buildFAdd(Dst, Src0: Scratch, Src1: LowBitsFP);
7693
7694 MI.eraseFromParent();
7695 return Legalized;
7696}
7697
7698LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
7699 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7700
7701 if (SrcTy == LLT::scalar(SizeInBits: 1)) {
7702 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: 1.0);
7703 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
7704 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
7705 MI.eraseFromParent();
7706 return Legalized;
7707 }
7708
7709 if (SrcTy != LLT::scalar(SizeInBits: 64))
7710 return UnableToLegalize;
7711
7712 if (DstTy == LLT::scalar(SizeInBits: 32))
7713 // TODO: SelectionDAG has several alternative expansions to port which may
7714 // be more reasonable depending on the available instructions. We also need
7715 // a more advanced mechanism to choose an optimal version depending on
7716 // target features such as sitofp or CTLZ availability.
7717 return lowerU64ToF32WithSITOFP(MI);
7718
7719 if (DstTy == LLT::scalar(SizeInBits: 64))
7720 return lowerU64ToF64BitFloatOps(MI);
7721
7722 return UnableToLegalize;
7723}
7724
7725LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
7726 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7727
7728 const LLT S64 = LLT::scalar(SizeInBits: 64);
7729 const LLT S32 = LLT::scalar(SizeInBits: 32);
7730 const LLT S1 = LLT::scalar(SizeInBits: 1);
7731
7732 if (SrcTy == S1) {
7733 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: -1.0);
7734 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
7735 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
7736 MI.eraseFromParent();
7737 return Legalized;
7738 }
7739
7740 if (SrcTy != S64)
7741 return UnableToLegalize;
7742
7743 if (DstTy == S32) {
7744 // float cl2f(long l) {
7745 // long s = l >> 63;
7746 // float r = cul2f((l + s) ^ s);
7747 // return s ? -r : r;
7748 // }
7749 Register L = Src;
7750 auto SignBit = MIRBuilder.buildConstant(Res: S64, Val: 63);
7751 auto S = MIRBuilder.buildAShr(Dst: S64, Src0: L, Src1: SignBit);
7752
7753 auto LPlusS = MIRBuilder.buildAdd(Dst: S64, Src0: L, Src1: S);
7754 auto Xor = MIRBuilder.buildXor(Dst: S64, Src0: LPlusS, Src1: S);
7755 auto R = MIRBuilder.buildUITOFP(Dst: S32, Src0: Xor);
7756
7757 auto RNeg = MIRBuilder.buildFNeg(Dst: S32, Src0: R);
7758 auto SignNotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: S,
7759 Op1: MIRBuilder.buildConstant(Res: S64, Val: 0));
7760 MIRBuilder.buildSelect(Res: Dst, Tst: SignNotZero, Op0: RNeg, Op1: R);
7761 MI.eraseFromParent();
7762 return Legalized;
7763 }
7764
7765 return UnableToLegalize;
7766}
7767
7768LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
7769 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7770 const LLT S64 = LLT::scalar(SizeInBits: 64);
7771 const LLT S32 = LLT::scalar(SizeInBits: 32);
7772
7773 if (SrcTy != S64 && SrcTy != S32)
7774 return UnableToLegalize;
7775 if (DstTy != S32 && DstTy != S64)
7776 return UnableToLegalize;
7777
7778 // FPTOSI gives the same result as FPTOUI for positive signed integers.
7779 // FPTOUI additionally has to handle fp values that convert to unsigned
7780 // integers >= 2^(DstSize-1), i.e. 2^31 for i32 or 2^63 for i64. For brevity, 2^Exp.
7781
7782 APInt TwoPExpInt = APInt::getSignMask(BitWidth: DstTy.getSizeInBits());
7783 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
7784 : APFloat::IEEEdouble(),
7785 APInt::getZero(numBits: SrcTy.getSizeInBits()));
7786 TwoPExpFP.convertFromAPInt(Input: TwoPExpInt, IsSigned: false, RM: APFloat::rmNearestTiesToEven);
7787
7788 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src);
7789
7790 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(Res: SrcTy, Val: TwoPExpFP);
7791 // For fp values greater than or equal to Threshold (2^Exp), we use FPTOSI on
7792 // (Value - 2^Exp) and add 2^Exp back by setting the highest bit of the result.
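// E.g. for f64 -> i64 with Src = 2^63 + 4096 (exactly representable): FSub
// gives 4096.0, FPTOSI gives 4096, and XORing in 2^63 produces
// 0x8000000000001000, the exact unsigned result.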
7793 MachineInstrBuilder FSub = MIRBuilder.buildFSub(Dst: SrcTy, Src0: Src, Src1: Threshold);
7794 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: FSub);
7795 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(Res: DstTy, Val: TwoPExpInt);
7796 MachineInstrBuilder Res = MIRBuilder.buildXor(Dst: DstTy, Src0: ResLowBits, Src1: ResHighBit);
7797
7798 const LLT S1 = LLT::scalar(SizeInBits: 1);
7799
7800 MachineInstrBuilder FCMP =
7801 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: S1, Op0: Src, Op1: Threshold);
7802 MIRBuilder.buildSelect(Res: Dst, Tst: FCMP, Op0: FPTOSI, Op1: Res);
7803
7804 MI.eraseFromParent();
7805 return Legalized;
7806}
7807
7808LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
7809 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7810 const LLT S64 = LLT::scalar(SizeInBits: 64);
7811 const LLT S32 = LLT::scalar(SizeInBits: 32);
7812
7813 // FIXME: Only f32 to i64 conversions are supported.
7814 if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
7815 return UnableToLegalize;
7816
7817 // Expand f32 -> i64 conversion
7818 // This algorithm comes from compiler-rt's implementation of fixsfdi:
7819 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
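// Worked example for Src = 1.0f (0x3F800000): ExponentBits = 127, so
// Exponent = 0 and R = mantissa | 0x800000 = 0x800000; the lshr path applies
// (Exponent <= 23) and R >> 23 == 1. Inputs with Exponent < 0 (|Src| < 1)
// are selected to 0 at the end.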
7820
7821 unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
7822
7823 auto ExponentMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x7F800000);
7824 auto ExponentLoBit = MIRBuilder.buildConstant(Res: SrcTy, Val: 23);
7825
7826 auto AndExpMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: ExponentMask);
7827 auto ExponentBits = MIRBuilder.buildLShr(Dst: SrcTy, Src0: AndExpMask, Src1: ExponentLoBit);
7828
7829 auto SignMask = MIRBuilder.buildConstant(Res: SrcTy,
7830 Val: APInt::getSignMask(BitWidth: SrcEltBits));
7831 auto AndSignMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: SignMask);
7832 auto SignLowBit = MIRBuilder.buildConstant(Res: SrcTy, Val: SrcEltBits - 1);
7833 auto Sign = MIRBuilder.buildAShr(Dst: SrcTy, Src0: AndSignMask, Src1: SignLowBit);
7834 Sign = MIRBuilder.buildSExt(Res: DstTy, Op: Sign);
7835
7836 auto MantissaMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x007FFFFF);
7837 auto AndMantissaMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: MantissaMask);
7838 auto K = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x00800000);
7839
7840 auto R = MIRBuilder.buildOr(Dst: SrcTy, Src0: AndMantissaMask, Src1: K);
7841 R = MIRBuilder.buildZExt(Res: DstTy, Op: R);
7842
7843 auto Bias = MIRBuilder.buildConstant(Res: SrcTy, Val: 127);
7844 auto Exponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentBits, Src1: Bias);
7845 auto SubExponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: Exponent, Src1: ExponentLoBit);
7846 auto ExponentSub = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentLoBit, Src1: Exponent);
7847
7848 auto Shl = MIRBuilder.buildShl(Dst: DstTy, Src0: R, Src1: SubExponent);
7849 auto Srl = MIRBuilder.buildLShr(Dst: DstTy, Src0: R, Src1: ExponentSub);
7850
7851 const LLT S1 = LLT::scalar(SizeInBits: 1);
7852 auto CmpGt = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT,
7853 Res: S1, Op0: Exponent, Op1: ExponentLoBit);
7854
7855 R = MIRBuilder.buildSelect(Res: DstTy, Tst: CmpGt, Op0: Shl, Op1: Srl);
7856
7857 auto XorSign = MIRBuilder.buildXor(Dst: DstTy, Src0: R, Src1: Sign);
7858 auto Ret = MIRBuilder.buildSub(Dst: DstTy, Src0: XorSign, Src1: Sign);
7859
7860 auto ZeroSrcTy = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
7861
7862 auto ExponentLt0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT,
7863 Res: S1, Op0: Exponent, Op1: ZeroSrcTy);
7864
7865 auto ZeroDstTy = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
7866 MIRBuilder.buildSelect(Res: Dst, Tst: ExponentLt0, Op0: ZeroDstTy, Op1: Ret);
7867
7868 MI.eraseFromParent();
7869 return Legalized;
7870}
7871
7872LegalizerHelper::LegalizeResult
7873LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
7874 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7875
7876 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
7877 unsigned SatWidth = DstTy.getScalarSizeInBits();
7878
7879 // Determine minimum and maximum integer values and their corresponding
7880 // floating-point values.
7881 APInt MinInt, MaxInt;
7882 if (IsSigned) {
7883 MinInt = APInt::getSignedMinValue(numBits: SatWidth);
7884 MaxInt = APInt::getSignedMaxValue(numBits: SatWidth);
7885 } else {
7886 MinInt = APInt::getMinValue(numBits: SatWidth);
7887 MaxInt = APInt::getMaxValue(numBits: SatWidth);
7888 }
7889
7890 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
7891 APFloat MinFloat(Semantics);
7892 APFloat MaxFloat(Semantics);
7893
7894 APFloat::opStatus MinStatus =
7895 MinFloat.convertFromAPInt(Input: MinInt, IsSigned, RM: APFloat::rmTowardZero);
7896 APFloat::opStatus MaxStatus =
7897 MaxFloat.convertFromAPInt(Input: MaxInt, IsSigned, RM: APFloat::rmTowardZero);
7898 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
7899 !(MaxStatus & APFloat::opStatus::opInexact);
7900
7901 // If the integer bounds are exactly representable as floats, emit a
7902 // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
7903 // and selects.
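// E.g. for f32 -> i8 (signed) the bounds -128 and 127 are exact in f32, so
// the clamp path is used; for f32 -> i64 the bounds round (2^63 - 1 is
// inexact in f32), so the compare-and-select path is taken.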
7904 if (AreExactFloatBounds) {
7905 // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
7906 auto MaxC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat);
7907 auto MaxP = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT,
7908 Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: MaxC);
7909 auto Max = MIRBuilder.buildSelect(Res: SrcTy, Tst: MaxP, Op0: Src, Op1: MaxC);
7910 // Clamp by MaxFloat from above. NaN cannot occur.
7911 auto MinC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat);
7912 auto MinP =
7913 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Max,
7914 Op1: MinC, Flags: MachineInstr::FmNoNans);
7915 auto Min =
7916 MIRBuilder.buildSelect(Res: SrcTy, Tst: MinP, Op0: Max, Op1: MinC, Flags: MachineInstr::FmNoNans);
7917 // Convert clamped value to integer. In the unsigned case we're done,
7918 // because we mapped NaN to MinFloat, which will cast to zero.
7919 if (!IsSigned) {
7920 MIRBuilder.buildFPTOUI(Dst, Src0: Min);
7921 MI.eraseFromParent();
7922 return Legalized;
7923 }
7924
7925 // Otherwise, select 0 if Src is NaN.
7926 auto FpToInt = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Min);
7927 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO,
7928 Res: DstTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: Src);
7929 MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0),
7930 Op1: FpToInt);
7931 MI.eraseFromParent();
7932 return Legalized;
7933 }
7934
7935 // Result of direct conversion. The assumption here is that the operation is
7936 // non-trapping and it's fine to apply it to an out-of-range value if we
7937 // select it away later.
7938 auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src)
7939 : MIRBuilder.buildFPTOUI(Dst: DstTy, Src0: Src);
7940
7941 // If Src ULT MinFloat, select MinInt. In particular, this also selects
7942 // MinInt if Src is NaN.
7943 auto ULT =
7944 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src,
7945 Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat));
7946 auto Max = MIRBuilder.buildSelect(
7947 Res: DstTy, Tst: ULT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MinInt), Op1: FpToInt);
7948 // If Src OGT MaxFloat, select MaxInt.
7949 auto OGT =
7950 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src,
7951 Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat));
7952
7953 // In the unsigned case we are done, because we mapped NaN to MinInt, which
7954 // is already zero.
7955 if (!IsSigned) {
7956 MIRBuilder.buildSelect(Res: Dst, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt),
7957 Op1: Max);
7958 MI.eraseFromParent();
7959 return Legalized;
7960 }
7961
7962 // Otherwise, select 0 if Src is NaN.
7963 auto Min = MIRBuilder.buildSelect(
7964 Res: DstTy, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt), Op1: Max);
7965 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO,
7966 Res: DstTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: Src);
7967 MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0), Op1: Min);
7968 MI.eraseFromParent();
7969 return Legalized;
7970}
7971
7972// f64 -> f16 conversion using round-to-nearest-even rounding mode.
7973LegalizerHelper::LegalizeResult
7974LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
7975 const LLT S1 = LLT::scalar(SizeInBits: 1);
7976 const LLT S32 = LLT::scalar(SizeInBits: 32);
7977
7978 auto [Dst, Src] = MI.getFirst2Regs();
7979 assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
7980 MRI.getType(Src).getScalarType() == LLT::scalar(64));
7981
7982 if (MRI.getType(Reg: Src).isVector()) // TODO: Handle vectors directly.
7983 return UnableToLegalize;
7984
7985 if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
7986 unsigned Flags = MI.getFlags();
7987 auto Src32 = MIRBuilder.buildFPTrunc(Res: S32, Op: Src, Flags);
7988 MIRBuilder.buildFPTrunc(Res: Dst, Op: Src32, Flags);
7989 MI.eraseFromParent();
7990 return Legalized;
7991 }
7992
7993 const unsigned ExpMask = 0x7ff;
7994 const unsigned ExpBiasf64 = 1023;
7995 const unsigned ExpBiasf16 = 15;
7996
7997 auto Unmerge = MIRBuilder.buildUnmerge(Res: S32, Op: Src);
7998 Register U = Unmerge.getReg(Idx: 0);
7999 Register UH = Unmerge.getReg(Idx: 1);
8000
8001 auto E = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 20));
8002 E = MIRBuilder.buildAnd(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: ExpMask));
8003
8004 // Subtract the fp64 exponent bias (1023) to get the real exponent and
8005 // add the f16 bias (15) to get the biased exponent for the f16 format.
8006 E = MIRBuilder.buildAdd(
8007 Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: -ExpBiasf64 + ExpBiasf16));
8008
8009 auto M = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 8));
8010 M = MIRBuilder.buildAnd(Dst: S32, Src0: M, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0xffe));
8011
8012 auto MaskedSig = MIRBuilder.buildAnd(Dst: S32, Src0: UH,
8013 Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1ff));
8014 MaskedSig = MIRBuilder.buildOr(Dst: S32, Src0: MaskedSig, Src1: U);
8015
8016 auto Zero = MIRBuilder.buildConstant(Res: S32, Val: 0);
8017 auto SigCmpNE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: MaskedSig, Op1: Zero);
8018 auto Lo40Set = MIRBuilder.buildZExt(Res: S32, Op: SigCmpNE0);
8019 M = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: Lo40Set);
8020
8021 // (M != 0 ? 0x0200 : 0) | 0x7c00;
8022 auto Bits0x200 = MIRBuilder.buildConstant(Res: S32, Val: 0x0200);
8023 auto CmpM_NE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: M, Op1: Zero);
8024 auto SelectCC = MIRBuilder.buildSelect(Res: S32, Tst: CmpM_NE0, Op0: Bits0x200, Op1: Zero);
8025
8026 auto Bits0x7c00 = MIRBuilder.buildConstant(Res: S32, Val: 0x7c00);
8027 auto I = MIRBuilder.buildOr(Dst: S32, Src0: SelectCC, Src1: Bits0x7c00);
8028
8029 // N = M | (E << 12);
8030 auto EShl12 = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 12));
8031 auto N = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: EShl12);
8032
8033 // B = clamp(1-E, 0, 13);
8034 auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
8035 auto OneSubExp = MIRBuilder.buildSub(Dst: S32, Src0: One, Src1: E);
8036 auto B = MIRBuilder.buildSMax(Dst: S32, Src0: OneSubExp, Src1: Zero);
8037 B = MIRBuilder.buildSMin(Dst: S32, Src0: B, Src1: MIRBuilder.buildConstant(Res: S32, Val: 13));
8038
8039 auto SigSetHigh = MIRBuilder.buildOr(Dst: S32, Src0: M,
8040 Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1000));
8041
8042 auto D = MIRBuilder.buildLShr(Dst: S32, Src0: SigSetHigh, Src1: B);
8043 auto D0 = MIRBuilder.buildShl(Dst: S32, Src0: D, Src1: B);
8044
8045 auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1,
8046 Op0: D0, Op1: SigSetHigh);
8047 auto D1 = MIRBuilder.buildZExt(Res: S32, Op: D0_NE_SigSetHigh);
8048 D = MIRBuilder.buildOr(Dst: S32, Src0: D, Src1: D1);
8049
8050 auto CmpELtOne = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: E, Op1: One);
8051 auto V = MIRBuilder.buildSelect(Res: S32, Tst: CmpELtOne, Op0: D, Op1: N);
8052
8053 auto VLow3 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 7));
8054 V = MIRBuilder.buildLShr(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 2));
8055
8056 auto VLow3Eq3 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: VLow3,
8057 Op1: MIRBuilder.buildConstant(Res: S32, Val: 3));
8058 auto V0 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Eq3);
8059
8060 auto VLow3Gt5 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: VLow3,
8061 Op1: MIRBuilder.buildConstant(Res: S32, Val: 5));
8062 auto V1 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Gt5);
8063
8064 V1 = MIRBuilder.buildOr(Dst: S32, Src0: V0, Src1: V1);
8065 V = MIRBuilder.buildAdd(Dst: S32, Src0: V, Src1: V1);
8066
8067 auto CmpEGt30 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1,
8068 Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 30));
8069 V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt30,
8070 Op0: MIRBuilder.buildConstant(Res: S32, Val: 0x7c00), Op1: V);
8071
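// E == 1039 corresponds to a raw f64 exponent of 0x7ff (2047 - 1023 + 15),
// i.e. Src is Inf or NaN, so select I, the f16 Inf/NaN pattern built above.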
8072 auto CmpEEq1039 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1,
8073 Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 1039));
8074 V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEEq1039, Op0: I, Op1: V);
8075
8076 // Extract the sign bit.
8077 auto Sign = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 16));
8078 Sign = MIRBuilder.buildAnd(Dst: S32, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x8000));
8079
8080 // Insert the sign bit
8081 V = MIRBuilder.buildOr(Dst: S32, Src0: Sign, Src1: V);
8082
8083 MIRBuilder.buildTrunc(Res: Dst, Op: V);
8084 MI.eraseFromParent();
8085 return Legalized;
8086}
8087
8088LegalizerHelper::LegalizeResult
8089LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
8090 auto [DstTy, SrcTy] = MI.getFirst2LLTs();
8091 const LLT S64 = LLT::scalar(SizeInBits: 64);
8092 const LLT S16 = LLT::scalar(SizeInBits: 16);
8093
8094 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
8095 return lowerFPTRUNC_F64_TO_F16(MI);
8096
8097 return UnableToLegalize;
8098}
8099
8100LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
8101 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8102 LLT Ty = MRI.getType(Reg: Dst);
8103
8104 auto CvtSrc1 = MIRBuilder.buildSITOFP(Dst: Ty, Src0: Src1);
8105 MIRBuilder.buildFPow(Dst, Src0, Src1: CvtSrc1, Flags: MI.getFlags());
8106 MI.eraseFromParent();
8107 return Legalized;
8108}
8109
8110static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
8111 switch (Opc) {
8112 case TargetOpcode::G_SMIN:
8113 return CmpInst::ICMP_SLT;
8114 case TargetOpcode::G_SMAX:
8115 return CmpInst::ICMP_SGT;
8116 case TargetOpcode::G_UMIN:
8117 return CmpInst::ICMP_ULT;
8118 case TargetOpcode::G_UMAX:
8119 return CmpInst::ICMP_UGT;
8120 default:
8121 llvm_unreachable("not in integer min/max");
8122 }
8123}
8124
8125LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
8126 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8127
8128 const CmpInst::Predicate Pred = minMaxToCompare(Opc: MI.getOpcode());
8129 LLT CmpType = MRI.getType(Reg: Dst).changeElementSize(NewEltSize: 1);
8130
8131 auto Cmp = MIRBuilder.buildICmp(Pred, Res: CmpType, Op0: Src0, Op1: Src1);
8132 MIRBuilder.buildSelect(Res: Dst, Tst: Cmp, Op0: Src0, Op1: Src1);
8133
8134 MI.eraseFromParent();
8135 return Legalized;
8136}
8137
8138LegalizerHelper::LegalizeResult
8139LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
8140 GSUCmp *Cmp = cast<GSUCmp>(Val: &MI);
8141
8142 Register Dst = Cmp->getReg(Idx: 0);
8143 LLT DstTy = MRI.getType(Reg: Dst);
8144 LLT SrcTy = MRI.getType(Reg: Cmp->getReg(Idx: 1));
8145 LLT CmpTy = DstTy.changeElementSize(NewEltSize: 1);
8146
8147 CmpInst::Predicate LTPredicate = Cmp->isSigned()
8148 ? CmpInst::Predicate::ICMP_SLT
8149 : CmpInst::Predicate::ICMP_ULT;
8150 CmpInst::Predicate GTPredicate = Cmp->isSigned()
8151 ? CmpInst::Predicate::ICMP_SGT
8152 : CmpInst::Predicate::ICMP_UGT;
8153
8154 auto Zero = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
8155 auto IsGT = MIRBuilder.buildICmp(Pred: GTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
8156 Op1: Cmp->getRHSReg());
8157 auto IsLT = MIRBuilder.buildICmp(Pred: LTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
8158 Op1: Cmp->getRHSReg());
8159
8160 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
8161 auto BC = TLI.getBooleanContents(isVec: DstTy.isVector(), /*isFP=*/isFloat: false);
8162 if (TLI.shouldExpandCmpUsingSelects(VT: getApproximateEVTForLLT(Ty: SrcTy, Ctx)) ||
8163 BC == TargetLowering::UndefinedBooleanContent) {
8164 auto One = MIRBuilder.buildConstant(Res: DstTy, Val: 1);
8165 auto SelectZeroOrOne = MIRBuilder.buildSelect(Res: DstTy, Tst: IsGT, Op0: One, Op1: Zero);
8166
8167 auto MinusOne = MIRBuilder.buildConstant(Res: DstTy, Val: -1);
8168 MIRBuilder.buildSelect(Res: Dst, Tst: IsLT, Op0: MinusOne, Op1: SelectZeroOrOne);
8169 } else {
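// With ZeroOrNegativeOne boolean contents the extended booleans are 0/-1,
// so IsGT - IsLT would come out negated; swap the two compares first.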
8170 if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
8171 std::swap(a&: IsGT, b&: IsLT);
8172 // Extend boolean results to DstTy, which is at least i2, before subtracting
8173 // them.
8174 unsigned BoolExtOp =
8175 MIRBuilder.getBoolExtOp(IsVec: DstTy.isVector(), /*isFP=*/IsFP: false);
8176 IsGT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsGT});
8177 IsLT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsLT});
8178 MIRBuilder.buildSub(Dst, Src0: IsGT, Src1: IsLT);
8179 }
8180
8181 MI.eraseFromParent();
8182 return Legalized;
8183}
8184
8185LegalizerHelper::LegalizeResult
8186LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
8187 auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
8188 const int Src0Size = Src0Ty.getScalarSizeInBits();
8189 const int Src1Size = Src1Ty.getScalarSizeInBits();
8190
8191 auto SignBitMask = MIRBuilder.buildConstant(
8192 Res: Src0Ty, Val: APInt::getSignMask(BitWidth: Src0Size));
8193
8194 auto NotSignBitMask = MIRBuilder.buildConstant(
8195 Res: Src0Ty, Val: APInt::getLowBitsSet(numBits: Src0Size, loBitsSet: Src0Size - 1));
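// E.g. for f32 operands: SignBitMask == 0x80000000 and
// NotSignBitMask == 0x7fffffff.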
8196
8197 Register And0 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0, Src1: NotSignBitMask).getReg(Idx: 0);
8198 Register And1;
8199 if (Src0Ty == Src1Ty) {
8200 And1 = MIRBuilder.buildAnd(Dst: Src1Ty, Src0: Src1, Src1: SignBitMask).getReg(Idx: 0);
8201 } else if (Src0Size > Src1Size) {
8202 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src0Ty, Val: Src0Size - Src1Size);
8203 auto Zext = MIRBuilder.buildZExt(Res: Src0Ty, Op: Src1);
8204 auto Shift = MIRBuilder.buildShl(Dst: Src0Ty, Src0: Zext, Src1: ShiftAmt);
8205 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Shift, Src1: SignBitMask).getReg(Idx: 0);
8206 } else {
8207 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src1Ty, Val: Src1Size - Src0Size);
8208 auto Shift = MIRBuilder.buildLShr(Dst: Src1Ty, Src0: Src1, Src1: ShiftAmt);
8209 auto Trunc = MIRBuilder.buildTrunc(Res: Src0Ty, Op: Shift);
8210 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Trunc, Src1: SignBitMask).getReg(Idx: 0);
8211 }
8212
8213 // Be careful about setting nsz/nnan/ninf on every instruction, since the
8214 // constants are a NaN and -0.0, but the final result should preserve
8215 // everything.
8216 unsigned Flags = MI.getFlags();
8217
8218 // We masked the sign bit and the not-sign bit, so these are disjoint.
8219 Flags |= MachineInstr::Disjoint;
8220
8221 MIRBuilder.buildOr(Dst, Src0: And0, Src1: And1, Flags);
8222
8223 MI.eraseFromParent();
8224 return Legalized;
8225}
8226
8227LegalizerHelper::LegalizeResult
8228LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
8229 // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
8230 // identical handling. fminimumnum/fmaximumnum also need a path that does
8231 // not depend on fminnum/fmaxnum.
8232
8233 unsigned NewOp;
8234 switch (MI.getOpcode()) {
8235 case TargetOpcode::G_FMINNUM:
8236 NewOp = TargetOpcode::G_FMINNUM_IEEE;
8237 break;
8238 case TargetOpcode::G_FMINIMUMNUM:
8239 NewOp = TargetOpcode::G_FMINNUM;
8240 break;
8241 case TargetOpcode::G_FMAXNUM:
8242 NewOp = TargetOpcode::G_FMAXNUM_IEEE;
8243 break;
8244 case TargetOpcode::G_FMAXIMUMNUM:
8245 NewOp = TargetOpcode::G_FMAXNUM;
8246 break;
8247 default:
8248 llvm_unreachable("unexpected min/max opcode");
8249 }
8250
8251 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8252 LLT Ty = MRI.getType(Reg: Dst);
8253
8254 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
8255 // Insert canonicalizes if it's possible we need to quiet to get correct
8256 // sNaN behavior.
8257
8258 // Note this must be done here, and not as an optimization combine, since
8259 // in the absence of a dedicated quiet-sNaN instruction we're using the
8260 // omni-purpose G_FCANONICALIZE.
8261 if (!isKnownNeverSNaN(Val: Src0, MRI))
8262 Src0 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0, Flags: MI.getFlags()).getReg(Idx: 0);
8263
8264 if (!isKnownNeverSNaN(Val: Src1, MRI))
8265 Src1 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0: Src1, Flags: MI.getFlags()).getReg(Idx: 0);
8266 }
8267
8268 // If there are no nans, it's safe to simply replace this with the non-IEEE
8269 // version.
8270 MIRBuilder.buildInstr(Opc: NewOp, DstOps: {Dst}, SrcOps: {Src0, Src1}, Flags: MI.getFlags());
8271 MI.eraseFromParent();
8272 return Legalized;
8273}
8274
8275LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
8276 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
8277 Register DstReg = MI.getOperand(i: 0).getReg();
8278 LLT Ty = MRI.getType(Reg: DstReg);
8279 unsigned Flags = MI.getFlags();
8280
8281 auto Mul = MIRBuilder.buildFMul(Dst: Ty, Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2),
8282 Flags);
8283 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Mul, Src1: MI.getOperand(i: 3), Flags);
8284 MI.eraseFromParent();
8285 return Legalized;
8286}
8287
8288LegalizerHelper::LegalizeResult
8289LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
8290 auto [DstReg, X] = MI.getFirst2Regs();
8291 const unsigned Flags = MI.getFlags();
8292 const LLT Ty = MRI.getType(Reg: DstReg);
8293 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
8294
8295 // round(x) =>
8296 // t = trunc(x);
8297 // d = fabs(x - t);
8298 // o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
8299 // return t + o;
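// E.g. round(-2.5): t = -2.0, d = 0.5, o = copysign(1.0, -2.5) = -1.0, so
// the result is -3.0 (halfway cases round away from zero).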
8300
8301 auto T = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: X, Flags);
8302
8303 auto Diff = MIRBuilder.buildFSub(Dst: Ty, Src0: X, Src1: T, Flags);
8304 auto AbsDiff = MIRBuilder.buildFAbs(Dst: Ty, Src0: Diff, Flags);
8305
8306 auto Half = MIRBuilder.buildFConstant(Res: Ty, Val: 0.5);
8307 auto Cmp =
8308 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGE, Res: CondTy, Op0: AbsDiff, Op1: Half, Flags);
8309
8310 // Could emit G_UITOFP instead
8311 auto One = MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
8312 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
8313 auto BoolFP = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: One, Op1: Zero);
8314 auto SignedOffset = MIRBuilder.buildFCopysign(Dst: Ty, Src0: BoolFP, Src1: X);
8315
8316 MIRBuilder.buildFAdd(Dst: DstReg, Src0: T, Src1: SignedOffset, Flags);
8317
8318 MI.eraseFromParent();
8319 return Legalized;
8320}
8321
8322LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
8323 auto [DstReg, SrcReg] = MI.getFirst2Regs();
8324 unsigned Flags = MI.getFlags();
8325 LLT Ty = MRI.getType(Reg: DstReg);
8326 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
8327
8328 // result = trunc(src);
8329 // if (src < 0.0 && src != result)
8330 // result += -1.0.
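// E.g. floor(-1.5): trunc gives -1.0, both conditions hold, and G_SITOFP of
// the i1 true yields exactly the -1.0 adjustment, giving -2.0.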
8331
8332 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: SrcReg, Flags);
8333 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
8334
8335 auto Lt0 = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: CondTy,
8336 Op0: SrcReg, Op1: Zero, Flags);
8337 auto NeTrunc = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: CondTy,
8338 Op0: SrcReg, Op1: Trunc, Flags);
8339 auto And = MIRBuilder.buildAnd(Dst: CondTy, Src0: Lt0, Src1: NeTrunc);
8340 auto AddVal = MIRBuilder.buildSITOFP(Dst: Ty, Src0: And);
8341
8342 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Trunc, Src1: AddVal, Flags);
8343 MI.eraseFromParent();
8344 return Legalized;
8345}
8346
8347LegalizerHelper::LegalizeResult
8348LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
8349 const unsigned NumOps = MI.getNumOperands();
8350 auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
8351 unsigned PartSize = Src0Ty.getSizeInBits();
8352
8353 LLT WideTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
8354 Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src0Reg).getReg(Idx: 0);
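// E.g. merging four s8 parts into an s32 produces
// zext(p0) | (zext(p1) << 8) | (zext(p2) << 16) | (zext(p3) << 24).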
8355
8356 for (unsigned I = 2; I != NumOps; ++I) {
8357 const unsigned Offset = (I - 1) * PartSize;
8358
8359 Register SrcReg = MI.getOperand(i: I).getReg();
8360 auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);
8361
8362 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
8363 MRI.createGenericVirtualRegister(Ty: WideTy);
8364
8365 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
8366 auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
8367 MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
8368 ResultReg = NextResult;
8369 }
8370
8371 if (DstTy.isPointer()) {
8372 if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
8373 AddrSpace: DstTy.getAddressSpace())) {
8374 LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
8375 return UnableToLegalize;
8376 }
8377
8378 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
8379 }
8380
8381 MI.eraseFromParent();
8382 return Legalized;
8383}
8384
8385LegalizerHelper::LegalizeResult
8386LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
8387 const unsigned NumDst = MI.getNumOperands() - 1;
8388 Register SrcReg = MI.getOperand(i: NumDst).getReg();
8389 Register Dst0Reg = MI.getOperand(i: 0).getReg();
8390 LLT DstTy = MRI.getType(Reg: Dst0Reg);
8391 if (DstTy.isPointer())
8392 return UnableToLegalize; // TODO
8393
8394 SrcReg = coerceToScalar(Val: SrcReg);
8395 if (!SrcReg)
8396 return UnableToLegalize;
8397
8398 // Expand scalarizing unmerge as bitcast to integer and shift.
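// E.g. an s32 unmerged into four s8 results produces trunc(src),
// trunc(src >> 8), trunc(src >> 16) and trunc(src >> 24).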
8399 LLT IntTy = MRI.getType(Reg: SrcReg);
8400
8401 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
8402
8403 const unsigned DstSize = DstTy.getSizeInBits();
8404 unsigned Offset = DstSize;
8405 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
8406 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntTy, Val: Offset);
8407 auto Shift = MIRBuilder.buildLShr(Dst: IntTy, Src0: SrcReg, Src1: ShiftAmt);
8408 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shift);
8409 }
8410
8411 MI.eraseFromParent();
8412 return Legalized;
8413}
8414
8415/// Lower a vector extract or insert by writing the vector to a stack temporary
8416/// and reloading the element or vector.
8417///
8418/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
8419/// =>
8420/// %stack_temp = G_FRAME_INDEX
8421/// G_STORE %vec, %stack_temp
8422/// %idx = clamp(%idx, %vec.getNumElements())
8423/// %element_ptr = G_PTR_ADD %stack_temp, %idx
8424/// %dst = G_LOAD %element_ptr
8425LegalizerHelper::LegalizeResult
8426LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
8427 Register DstReg = MI.getOperand(i: 0).getReg();
8428 Register SrcVec = MI.getOperand(i: 1).getReg();
8429 Register InsertVal;
8430 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
8431 InsertVal = MI.getOperand(i: 2).getReg();
8432
8433 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
8434
8435 LLT VecTy = MRI.getType(Reg: SrcVec);
8436 LLT EltTy = VecTy.getElementType();
8437 unsigned NumElts = VecTy.getNumElements();
8438
8439 int64_t IdxVal;
8440 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal)) && IdxVal >= 0 && IdxVal < NumElts) {
8441 SmallVector<Register, 8> SrcRegs;
8442 extractParts(Reg: SrcVec, Ty: EltTy, NumParts: NumElts, VRegs&: SrcRegs, MIRBuilder, MRI);
8443
8444 if (InsertVal) {
8445 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
8446 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcRegs);
8447 } else {
8448 MIRBuilder.buildCopy(Res: DstReg, Op: SrcRegs[IdxVal]);
8449 }
8450
8451 MI.eraseFromParent();
8452 return Legalized;
8453 }
8454
8455 if (!EltTy.isByteSized()) { // Not implemented.
8456 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
8457 return UnableToLegalize;
8458 }
8459
8460 unsigned EltBytes = EltTy.getSizeInBytes();
8461 Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
8462 Align EltAlign;
8463
8464 MachinePointerInfo PtrInfo;
8465 auto StackTemp = createStackTemporary(
8466 Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign, PtrInfo);
8467 MIRBuilder.buildStore(Val: SrcVec, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
8468
8469 // Get the pointer to the element, and be sure not to hit undefined behavior
8470 // if the index is out of bounds.
8471 Register EltPtr = getVectorElementPointer(VecPtr: StackTemp.getReg(Idx: 0), VecTy, Index: Idx);
8472
8473 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal))) {
8474 int64_t Offset = IdxVal * EltBytes;
8475 PtrInfo = PtrInfo.getWithOffset(O: Offset);
8476 EltAlign = commonAlignment(A: VecAlign, Offset);
8477 } else {
8478 // We lose information with a variable offset.
8479 EltAlign = getStackTemporaryAlignment(Ty: EltTy);
8480 PtrInfo = MachinePointerInfo(MRI.getType(Reg: EltPtr).getAddressSpace());
8481 }
8482
8483 if (InsertVal) {
8484 // Write the inserted element
8485 MIRBuilder.buildStore(Val: InsertVal, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
8486
8487 // Reload the whole vector.
8488 MIRBuilder.buildLoad(Res: DstReg, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
8489 } else {
8490 MIRBuilder.buildLoad(Res: DstReg, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
8491 }
8492
8493 MI.eraseFromParent();
8494 return Legalized;
8495}
8496
8497LegalizerHelper::LegalizeResult
8498LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
8499 auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
8500 MI.getFirst3RegLLTs();
8501 LLT IdxTy = LLT::scalar(SizeInBits: 32);
8502
8503 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
8504 Register Undef;
8505 SmallVector<Register, 32> BuildVec;
8506 LLT EltTy = DstTy.getScalarType();
8507
8508 for (int Idx : Mask) {
8509 if (Idx < 0) {
8510 if (!Undef.isValid())
8511 Undef = MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0);
8512 BuildVec.push_back(Elt: Undef);
8513 continue;
8514 }
8515
8516 if (Src0Ty.isScalar()) {
8517 BuildVec.push_back(Elt: Idx == 0 ? Src0Reg : Src1Reg);
8518 } else {
8519 int NumElts = Src0Ty.getNumElements();
8520 Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
8521 int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
8522 auto IdxK = MIRBuilder.buildConstant(Res: IdxTy, Val: ExtractIdx);
8523 auto Extract = MIRBuilder.buildExtractVectorElement(Res: EltTy, Val: SrcVec, Idx: IdxK);
8524 BuildVec.push_back(Elt: Extract.getReg(Idx: 0));
8525 }
8526 }
8527
8528 if (DstTy.isVector())
8529 MIRBuilder.buildBuildVector(Res: DstReg, Ops: BuildVec);
8530 else
8531 MIRBuilder.buildCopy(Res: DstReg, Op: BuildVec[0]);
8532 MI.eraseFromParent();
8533 return Legalized;
8534}
8535
8536LegalizerHelper::LegalizeResult
8537LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
8538 auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
8539 MI.getFirst4RegLLTs();
8540
8541 if (VecTy.isScalableVector())
8542 report_fatal_error(reason: "Cannot expand masked_compress for scalable vectors.");
8543
8544 Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
8545 MachinePointerInfo PtrInfo;
8546 Register StackPtr =
8547 createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign,
8548 PtrInfo)
8549 .getReg(Idx: 0);
8550 MachinePointerInfo ValPtrInfo =
8551 MachinePointerInfo::getUnknownStack(MF&: *MI.getMF());
8552
8553 LLT IdxTy = LLT::scalar(SizeInBits: 32);
8554 LLT ValTy = VecTy.getElementType();
8555 Align ValAlign = getStackTemporaryAlignment(Ty: ValTy);
8556
8557 auto OutPos = MIRBuilder.buildConstant(Res: IdxTy, Val: 0);
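// Store each element to the stack slot at OutPos and advance OutPos by the
// zero-extended mask bit, so selected elements end up packed contiguously at
// the front of the slot.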
8558
8559 bool HasPassthru =
8560 MRI.getVRegDef(Reg: Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;
8561
8562 if (HasPassthru)
8563 MIRBuilder.buildStore(Val: Passthru, Addr: StackPtr, PtrInfo, Alignment: VecAlign);
8564
8565 Register LastWriteVal;
8566 std::optional<APInt> PassthruSplatVal =
8567 isConstantOrConstantSplatVector(MI&: *MRI.getVRegDef(Reg: Passthru), MRI);
8568
8569 if (PassthruSplatVal.has_value()) {
8570 LastWriteVal =
8571 MIRBuilder.buildConstant(Res: ValTy, Val: PassthruSplatVal.value()).getReg(Idx: 0);
8572 } else if (HasPassthru) {
8573 auto Popcount = MIRBuilder.buildZExt(Res: MaskTy.changeElementSize(NewEltSize: 32), Op: Mask);
8574 Popcount = MIRBuilder.buildInstr(Opc: TargetOpcode::G_VECREDUCE_ADD,
8575 DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {Popcount});
8576
8577 Register LastElmtPtr =
8578 getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: Popcount.getReg(Idx: 0));
8579 LastWriteVal =
8580 MIRBuilder.buildLoad(Res: ValTy, Addr: LastElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign)
8581 .getReg(Idx: 0);
8582 }
8583
8584 unsigned NumElmts = VecTy.getNumElements();
8585 for (unsigned I = 0; I < NumElmts; ++I) {
8586 auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
8587 auto Val = MIRBuilder.buildExtractVectorElement(Res: ValTy, Val: Vec, Idx);
8588 Register ElmtPtr =
8589 getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));
8590 MIRBuilder.buildStore(Val, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);
8591
8592 LLT MaskITy = MaskTy.getElementType();
8593 auto MaskI = MIRBuilder.buildExtractVectorElement(Res: MaskITy, Val: Mask, Idx);
8594 if (MaskITy.getSizeInBits() > 1)
8595 MaskI = MIRBuilder.buildTrunc(Res: LLT::scalar(SizeInBits: 1), Op: MaskI);
8596
8597 MaskI = MIRBuilder.buildZExt(Res: IdxTy, Op: MaskI);
8598 OutPos = MIRBuilder.buildAdd(Dst: IdxTy, Src0: OutPos, Src1: MaskI);
8599
8600 if (HasPassthru && I == NumElmts - 1) {
8601 auto EndOfVector =
8602 MIRBuilder.buildConstant(Res: IdxTy, Val: VecTy.getNumElements() - 1);
8603 auto AllLanesSelected = MIRBuilder.buildICmp(
8604 Pred: CmpInst::ICMP_UGT, Res: LLT::scalar(SizeInBits: 1), Op0: OutPos, Op1: EndOfVector);
8605 OutPos = MIRBuilder.buildInstr(Opc: TargetOpcode::G_UMIN, DstOps: {IdxTy},
8606 SrcOps: {OutPos, EndOfVector});
8607 ElmtPtr = getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));
8608
8609 LastWriteVal =
8610 MIRBuilder.buildSelect(Res: ValTy, Tst: AllLanesSelected, Op0: Val, Op1: LastWriteVal)
8611 .getReg(Idx: 0);
8612 MIRBuilder.buildStore(Val: LastWriteVal, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);
8613 }
8614 }
8615
8616 // TODO: Use StackPtr's FrameIndex alignment.
8617 MIRBuilder.buildLoad(Res: Dst, Addr: StackPtr, PtrInfo, Alignment: VecAlign);
8618
8619 MI.eraseFromParent();
8620 return Legalized;
8621}
8622
8623Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
8624 Register AllocSize,
8625 Align Alignment,
8626 LLT PtrTy) {
8627 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
8628
8629 auto SPTmp = MIRBuilder.buildCopy(Res: PtrTy, Op: SPReg);
8630 SPTmp = MIRBuilder.buildCast(Dst: IntPtrTy, Src: SPTmp);
8631
8632 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
8633 // have to generate an extra instruction to negate the alloc and then use
8634 // G_PTR_ADD to add the negative offset.
8635 auto Alloc = MIRBuilder.buildSub(Dst: IntPtrTy, Src0: SPTmp, Src1: AllocSize);
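  // e.g. for Alignment == 16 the mask built below is ~15; AND-ing rounds the
  // new SP value down to a 16-byte boundary, which is correct for the
  // downward-growing stacks this lowering handles.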
8636 if (Alignment > Align(1)) {
8637 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
8638 AlignMask.negate();
8639 auto AlignCst = MIRBuilder.buildConstant(Res: IntPtrTy, Val: AlignMask);
8640 Alloc = MIRBuilder.buildAnd(Dst: IntPtrTy, Src0: Alloc, Src1: AlignCst);
8641 }
8642
8643 return MIRBuilder.buildCast(Dst: PtrTy, Src: Alloc).getReg(Idx: 0);
8644}
8645
8646LegalizerHelper::LegalizeResult
8647LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
8648 const auto &MF = *MI.getMF();
8649 const auto &TFI = *MF.getSubtarget().getFrameLowering();
8650 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
8651 return UnableToLegalize;
8652
8653 Register Dst = MI.getOperand(i: 0).getReg();
8654 Register AllocSize = MI.getOperand(i: 1).getReg();
8655 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
8656
8657 LLT PtrTy = MRI.getType(Reg: Dst);
8658 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
8659 Register SPTmp =
8660 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
8661
8662 MIRBuilder.buildCopy(Res: SPReg, Op: SPTmp);
8663 MIRBuilder.buildCopy(Res: Dst, Op: SPTmp);
8664
8665 MI.eraseFromParent();
8666 return Legalized;
8667}
8668
8669LegalizerHelper::LegalizeResult
8670LegalizerHelper::lowerStackSave(MachineInstr &MI) {
8671 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
8672 if (!StackPtr)
8673 return UnableToLegalize;
8674
8675 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: StackPtr);
8676 MI.eraseFromParent();
8677 return Legalized;
8678}
8679
8680LegalizerHelper::LegalizeResult
8681LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
8682 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
8683 if (!StackPtr)
8684 return UnableToLegalize;
8685
8686 MIRBuilder.buildCopy(Res: StackPtr, Op: MI.getOperand(i: 0));
8687 MI.eraseFromParent();
8688 return Legalized;
8689}
8690
8691LegalizerHelper::LegalizeResult
8692LegalizerHelper::lowerExtract(MachineInstr &MI) {
8693 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
8694 unsigned Offset = MI.getOperand(i: 2).getImm();
8695
8696 // Extract sub-vector or one element
8697 if (SrcTy.isVector()) {
8698 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
8699 unsigned DstSize = DstTy.getSizeInBits();
8700
8701 if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
8702 (Offset + DstSize <= SrcTy.getSizeInBits())) {
8703 // Unmerge and allow access to each Src element for the artifact combiner.
8704 auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcTy.getElementType(), Op: SrcReg);
8705
8706 // Take the element(s) we need to extract and copy or merge them into DstReg.
8707 SmallVector<Register, 8> SubVectorElts;
8708 for (unsigned Idx = Offset / SrcEltSize;
8709 Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
8710 SubVectorElts.push_back(Elt: Unmerge.getReg(Idx));
8711 }
8712 if (SubVectorElts.size() == 1)
8713 MIRBuilder.buildCopy(Res: DstReg, Op: SubVectorElts[0]);
8714 else
8715 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SubVectorElts);
8716
8717 MI.eraseFromParent();
8718 return Legalized;
8719 }
8720 }
8721
8722 if (DstTy.isScalar() &&
8723 (SrcTy.isScalar() ||
8724 (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
8725 LLT SrcIntTy = SrcTy;
8726 if (!SrcTy.isScalar()) {
8727 SrcIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
8728 SrcReg = MIRBuilder.buildBitcast(Dst: SrcIntTy, Src: SrcReg).getReg(Idx: 0);
8729 }
8730
8731 if (Offset == 0)
8732 MIRBuilder.buildTrunc(Res: DstReg, Op: SrcReg);
8733 else {
8734 auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcIntTy, Val: Offset);
8735 auto Shr = MIRBuilder.buildLShr(Dst: SrcIntTy, Src0: SrcReg, Src1: ShiftAmt);
8736 MIRBuilder.buildTrunc(Res: DstReg, Op: Shr);
8737 }
8738
8739 MI.eraseFromParent();
8740 return Legalized;
8741 }
8742
8743 return UnableToLegalize;
8744}
8745
8746LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
8747 auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
8748 uint64_t Offset = MI.getOperand(i: 3).getImm();
8749
8750 LLT DstTy = MRI.getType(Reg: Src);
8751 LLT InsertTy = MRI.getType(Reg: InsertSrc);
8752
8753 // Insert sub-vector or one element
8754 if (DstTy.isVector() && !InsertTy.isPointer()) {
8755 LLT EltTy = DstTy.getElementType();
8756 unsigned EltSize = EltTy.getSizeInBits();
8757 unsigned InsertSize = InsertTy.getSizeInBits();
8758
8759 if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
8760 (Offset + InsertSize <= DstTy.getSizeInBits())) {
8761 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: Src);
8762 SmallVector<Register, 8> DstElts;
8763 unsigned Idx = 0;
8764 // Copy the elements from Src that precede the insert offset.
8765 for (; Idx < Offset / EltSize; ++Idx) {
8766 DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
8767 }
8768
8769 // Replace elements in Src with elements from InsertSrc
8770 if (InsertTy.getSizeInBits() > EltSize) {
8771 auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: InsertSrc);
8772 for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
8773 ++Idx, ++i) {
8774 DstElts.push_back(Elt: UnmergeInsertSrc.getReg(Idx: i));
8775 }
8776 } else {
8777 DstElts.push_back(Elt: InsertSrc);
8778 ++Idx;
8779 }
8780
8781 // Remaining elements from Src after insert
8782 for (; Idx < DstTy.getNumElements(); ++Idx) {
8783 DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
8784 }
8785
8786 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: DstElts);
8787 MI.eraseFromParent();
8788 return Legalized;
8789 }
8790 }
8791
8792 if (InsertTy.isVector() ||
8793 (DstTy.isVector() && DstTy.getElementType() != InsertTy))
8794 return UnableToLegalize;
8795
8796 const DataLayout &DL = MIRBuilder.getDataLayout();
8797 if ((DstTy.isPointer() &&
8798 DL.isNonIntegralAddressSpace(AddrSpace: DstTy.getAddressSpace())) ||
8799 (InsertTy.isPointer() &&
8800 DL.isNonIntegralAddressSpace(AddrSpace: InsertTy.getAddressSpace()))) {
8801 LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
8802 return UnableToLegalize;
8803 }
8804
8805 LLT IntDstTy = DstTy;
8806
8807 if (!DstTy.isScalar()) {
8808 IntDstTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
8809 Src = MIRBuilder.buildCast(Dst: IntDstTy, Src).getReg(Idx: 0);
8810 }
8811
8812 if (!InsertTy.isScalar()) {
8813 const LLT IntInsertTy = LLT::scalar(SizeInBits: InsertTy.getSizeInBits());
8814 InsertSrc = MIRBuilder.buildPtrToInt(Dst: IntInsertTy, Src: InsertSrc).getReg(Idx: 0);
8815 }
8816
8817 Register ExtInsSrc = MIRBuilder.buildZExt(Res: IntDstTy, Op: InsertSrc).getReg(Idx: 0);
8818 if (Offset != 0) {
8819 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntDstTy, Val: Offset);
8820 ExtInsSrc = MIRBuilder.buildShl(Dst: IntDstTy, Src0: ExtInsSrc, Src1: ShiftAmt).getReg(Idx: 0);
8821 }
8822
8823 APInt MaskVal = APInt::getBitsSetWithWrap(
8824 numBits: DstTy.getSizeInBits(), loBit: Offset + InsertTy.getSizeInBits(), hiBit: Offset);
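  // e.g. inserting an s8 at Offset 8 into an s32 gives MaskVal = 0xffff00ff:
  // every destination bit is preserved except the byte being replaced.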
8825
8826 auto Mask = MIRBuilder.buildConstant(Res: IntDstTy, Val: MaskVal);
8827 auto MaskedSrc = MIRBuilder.buildAnd(Dst: IntDstTy, Src0: Src, Src1: Mask);
8828 auto Or = MIRBuilder.buildOr(Dst: IntDstTy, Src0: MaskedSrc, Src1: ExtInsSrc);
8829
8830 MIRBuilder.buildCast(Dst, Src: Or);
8831 MI.eraseFromParent();
8832 return Legalized;
8833}
8834
8835LegalizerHelper::LegalizeResult
8836LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
8837 auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
8838 MI.getFirst4RegLLTs();
8839 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
8840
8841 LLT Ty = Dst0Ty;
8842 LLT BoolTy = Dst1Ty;
8843
8844 Register NewDst0 = MRI.cloneVirtualRegister(VReg: Dst0);
8845
8846 if (IsAdd)
8847 MIRBuilder.buildAdd(Dst: NewDst0, Src0: LHS, Src1: RHS);
8848 else
8849 MIRBuilder.buildSub(Dst: NewDst0, Src0: LHS, Src1: RHS);
8850
8851 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
8852
8853 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
8854
8855 // For an addition, the result should be less than one of the operands (LHS)
8856 // if and only if the other operand (RHS) is negative; when that equivalence
8857 // fails, the addition has overflowed.
8858 // For a subtraction, the result should be less than one of the operands
8859 // (LHS) if and only if the other operand (RHS) is (non-zero) positive; when
8860 // that equivalence fails, the subtraction has overflowed.
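  // e.g. for s8 saddo(100, 100): the sum wraps to -56, so -56 < 100 holds
  // while 100 < 0 does not; the XOR of the two tests is true, signalling
  // overflow.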
8861 auto ResultLowerThanLHS =
8862 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: NewDst0, Op1: LHS);
8863 auto ConditionRHS = MIRBuilder.buildICmp(
8864 Pred: IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, Res: BoolTy, Op0: RHS, Op1: Zero);
8865
8866 MIRBuilder.buildXor(Dst: Dst1, Src0: ConditionRHS, Src1: ResultLowerThanLHS);
8867
8868 MIRBuilder.buildCopy(Res: Dst0, Op: NewDst0);
8869 MI.eraseFromParent();
8870
8871 return Legalized;
8872}
8873
8874LegalizerHelper::LegalizeResult
8875LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
8876 auto [Res, LHS, RHS] = MI.getFirst3Regs();
8877 LLT Ty = MRI.getType(Reg: Res);
8878 bool IsSigned;
8879 bool IsAdd;
8880 unsigned BaseOp;
8881 switch (MI.getOpcode()) {
8882 default:
8883 llvm_unreachable("unexpected addsat/subsat opcode");
8884 case TargetOpcode::G_UADDSAT:
8885 IsSigned = false;
8886 IsAdd = true;
8887 BaseOp = TargetOpcode::G_ADD;
8888 break;
8889 case TargetOpcode::G_SADDSAT:
8890 IsSigned = true;
8891 IsAdd = true;
8892 BaseOp = TargetOpcode::G_ADD;
8893 break;
8894 case TargetOpcode::G_USUBSAT:
8895 IsSigned = false;
8896 IsAdd = false;
8897 BaseOp = TargetOpcode::G_SUB;
8898 break;
8899 case TargetOpcode::G_SSUBSAT:
8900 IsSigned = true;
8901 IsAdd = false;
8902 BaseOp = TargetOpcode::G_SUB;
8903 break;
8904 }
8905
8906 if (IsSigned) {
8907 // sadd.sat(a, b) ->
8908 // hi = 0x7fffffff - smax(a, 0)
8909 // lo = 0x80000000 - smin(a, 0)
8910 // a + smin(smax(lo, b), hi)
8911 // ssub.sat(a, b) ->
8912 // lo = smax(a, -1) - 0x7fffffff
8913 // hi = smin(a, -1) - 0x80000000
8914 // a - smin(smax(lo, b), hi)
8915 // TODO: AMDGPU can use a "median of 3" instruction here:
8916 // a +/- med3(lo, b, hi)
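  // e.g. for s8 sadd.sat(100, 100): hi = 127 - smax(100, 0) = 27 and
  // lo = -128 - smin(100, 0) = -128, so b is clamped to 27 and 100 + 27
  // yields the saturated result 127.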
8917 uint64_t NumBits = Ty.getScalarSizeInBits();
8918 auto MaxVal =
8919 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: NumBits));
8920 auto MinVal =
8921 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
8922 MachineInstrBuilder Hi, Lo;
8923 if (IsAdd) {
8924 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
8925 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MaxVal, Src1: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: Zero));
8926 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MinVal, Src1: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: Zero));
8927 } else {
8928 auto NegOne = MIRBuilder.buildConstant(Res: Ty, Val: -1);
8929 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: NegOne),
8930 Src1: MaxVal);
8931 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: NegOne),
8932 Src1: MinVal);
8933 }
8934 auto RHSClamped =
8935 MIRBuilder.buildSMin(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: Lo, Src1: RHS), Src1: Hi);
8936 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, RHSClamped});
8937 } else {
8938 // uadd.sat(a, b) -> a + umin(~a, b)
8939 // usub.sat(a, b) -> a - umin(a, b)
8940 Register Not = IsAdd ? MIRBuilder.buildNot(Dst: Ty, Src0: LHS).getReg(Idx: 0) : LHS;
8941 auto Min = MIRBuilder.buildUMin(Dst: Ty, Src0: Not, Src1: RHS);
8942 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, Min});
8943 }
8944
8945 MI.eraseFromParent();
8946 return Legalized;
8947}
8948
8949LegalizerHelper::LegalizeResult
8950LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
8951 auto [Res, LHS, RHS] = MI.getFirst3Regs();
8952 LLT Ty = MRI.getType(Reg: Res);
8953 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
8954 bool IsSigned;
8955 bool IsAdd;
8956 unsigned OverflowOp;
8957 switch (MI.getOpcode()) {
8958 default:
8959 llvm_unreachable("unexpected addsat/subsat opcode");
8960 case TargetOpcode::G_UADDSAT:
8961 IsSigned = false;
8962 IsAdd = true;
8963 OverflowOp = TargetOpcode::G_UADDO;
8964 break;
8965 case TargetOpcode::G_SADDSAT:
8966 IsSigned = true;
8967 IsAdd = true;
8968 OverflowOp = TargetOpcode::G_SADDO;
8969 break;
8970 case TargetOpcode::G_USUBSAT:
8971 IsSigned = false;
8972 IsAdd = false;
8973 OverflowOp = TargetOpcode::G_USUBO;
8974 break;
8975 case TargetOpcode::G_SSUBSAT:
8976 IsSigned = true;
8977 IsAdd = false;
8978 OverflowOp = TargetOpcode::G_SSUBO;
8979 break;
8980 }
8981
8982 auto OverflowRes =
8983 MIRBuilder.buildInstr(Opc: OverflowOp, DstOps: {Ty, BoolTy}, SrcOps: {LHS, RHS});
8984 Register Tmp = OverflowRes.getReg(Idx: 0);
8985 Register Ov = OverflowRes.getReg(Idx: 1);
8986 MachineInstrBuilder Clamp;
8987 if (IsSigned) {
8988 // sadd.sat(a, b) ->
8989 // {tmp, ov} = saddo(a, b)
8990 // ov ? (tmp >>s 31) + 0x80000000 : tmp
8991 // ssub.sat(a, b) ->
8992 // {tmp, ov} = ssubo(a, b)
8993 // ov ? (tmp >>s 31) + 0x80000000 : tmp
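  // e.g. for s8 sadd.sat(100, 100): tmp wraps to -56 with ov set;
  // (tmp >>s 7) is -1 (all ones) and -1 + 0x80 wraps to 0x7f, the correct
  // positive saturation value.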
8994 uint64_t NumBits = Ty.getScalarSizeInBits();
8995 auto ShiftAmount = MIRBuilder.buildConstant(Res: Ty, Val: NumBits - 1);
8996 auto Sign = MIRBuilder.buildAShr(Dst: Ty, Src0: Tmp, Src1: ShiftAmount);
8997 auto MinVal =
8998 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
8999 Clamp = MIRBuilder.buildAdd(Dst: Ty, Src0: Sign, Src1: MinVal);
9000 } else {
9001 // uadd.sat(a, b) ->
9002 // {tmp, ov} = uaddo(a, b)
9003 // ov ? 0xffffffff : tmp
9004 // usub.sat(a, b) ->
9005 // {tmp, ov} = usubo(a, b)
9006 // ov ? 0 : tmp
9007 Clamp = MIRBuilder.buildConstant(Res: Ty, Val: IsAdd ? -1 : 0);
9008 }
9009 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: Clamp, Op1: Tmp);
9010
9011 MI.eraseFromParent();
9012 return Legalized;
9013}
9014
9015LegalizerHelper::LegalizeResult
9016LegalizerHelper::lowerShlSat(MachineInstr &MI) {
9017 assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
9018 MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
9019 "Expected shlsat opcode!");
9020 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
9021 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9022 LLT Ty = MRI.getType(Reg: Res);
9023 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
9024
9025 unsigned BW = Ty.getScalarSizeInBits();
9026 auto Result = MIRBuilder.buildShl(Dst: Ty, Src0: LHS, Src1: RHS);
9027 auto Orig = IsSigned ? MIRBuilder.buildAShr(Dst: Ty, Src0: Result, Src1: RHS)
9028 : MIRBuilder.buildLShr(Dst: Ty, Src0: Result, Src1: RHS);
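  // e.g. for s8 sshlsat(0x40, 2): the shift wraps to 0 and shifting back
  // gives 0 != 0x40, so the result saturates; since the LHS is non-negative
  // it becomes SatMax (0x7f).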
9029
9030 MachineInstrBuilder SatVal;
9031 if (IsSigned) {
9032 auto SatMin = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: BW));
9033 auto SatMax = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: BW));
9034 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: LHS,
9035 Op1: MIRBuilder.buildConstant(Res: Ty, Val: 0));
9036 SatVal = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: SatMin, Op1: SatMax);
9037 } else {
9038 SatVal = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getMaxValue(numBits: BW));
9039 }
9040 auto Ov = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: BoolTy, Op0: LHS, Op1: Orig);
9041 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: SatVal, Op1: Result);
9042
9043 MI.eraseFromParent();
9044 return Legalized;
9045}
9046
9047LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
9048 auto [Dst, Src] = MI.getFirst2Regs();
9049 const LLT Ty = MRI.getType(Reg: Src);
9050 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
9051 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
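  // e.g. an s32 bswap of 0xaabbccdd yields 0xddccbbaa: the end bytes are
  // swapped with 24-bit shifts below, the middle bytes with masked 8-bit
  // shifts in the loop.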
9052
9053 // Swap most and least significant byte, set remaining bytes in Res to zero.
9054 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt);
9055 auto LSByteShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9056 auto MSByteShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9057 auto Res = MIRBuilder.buildOr(Dst: Ty, Src0: MSByteShiftedRight, Src1: LSByteShiftedLeft);
9058
9059 // Set i-th high/low byte in Res to i-th low/high byte from Src.
9060 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
9061 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
9062 APInt APMask(SizeInBytes * 8, uint64_t(0xFF) << (i * 8));
9063 auto Mask = MIRBuilder.buildConstant(Res: Ty, Val: APMask);
9064 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt - 16 * i);
9065 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
9066 auto LoByte = MIRBuilder.buildAnd(Dst: Ty, Src0: Src, Src1: Mask);
9067 auto LoShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: LoByte, Src1: ShiftAmt);
9068 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: LoShiftedLeft);
9069 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
9070 auto SrcShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9071 auto HiShiftedRight = MIRBuilder.buildAnd(Dst: Ty, Src0: SrcShiftedRight, Src1: Mask);
9072 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: HiShiftedRight);
9073 }
9074 Res.getInstr()->getOperand(i: 0).setReg(Dst);
9075
9076 MI.eraseFromParent();
9077 return Legalized;
9078}
9079
9080// { (Src & Mask) >> N } | { (Src << N) & Mask }
9081static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
9082 MachineInstrBuilder Src, const APInt &Mask) {
9083 const LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
9084 MachineInstrBuilder C_N = B.buildConstant(Res: Ty, Val: N);
9085 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Res: Ty, Val: Mask);
9086 auto LHS = B.buildLShr(Dst: Ty, Src0: B.buildAnd(Dst: Ty, Src0: Src, Src1: MaskLoNTo0), Src1: C_N);
9087 auto RHS = B.buildAnd(Dst: Ty, Src0: B.buildShl(Dst: Ty, Src0: Src, Src1: C_N), Src1: MaskLoNTo0);
9088 return B.buildOr(Dst, Src0: LHS, Src1: RHS);
9089}
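// e.g. with N = 4 and a 0xf0 byte mask, SwapN swaps the two nibbles of every
// byte: an s8 value 0x6c becomes 0xc6. The subsequent N = 2 and N = 1 steps
// in lowerBitreverse then give 0x39 and 0x36, the bit reversal of 0x6c.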
9090
9091LegalizerHelper::LegalizeResult
9092LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
9093 auto [Dst, Src] = MI.getFirst2Regs();
9094 const LLT SrcTy = MRI.getType(Reg: Src);
9095 unsigned Size = SrcTy.getScalarSizeInBits();
9096 unsigned VSize = SrcTy.getSizeInBits();
9097
9098 if (Size >= 8) {
9099 if (SrcTy.isVector() && (VSize % 8 == 0) &&
9100 (LI.isLegal(Query: {TargetOpcode::G_BITREVERSE,
9101 {LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8),
9102 LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8)}}))) {
9103 // If bitreverse is legal for i8 vector of the same size, then cast
9104 // to i8 vector type.
9105 // e.g. v4s32 -> v16s8
9106 LLT VTy = LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8);
9107 auto BSWAP = MIRBuilder.buildBSwap(Dst: SrcTy, Src0: Src);
9108 auto Cast = MIRBuilder.buildBitcast(Dst: VTy, Src: BSWAP);
9109 auto RBIT = MIRBuilder.buildBitReverse(Dst: VTy, Src: Cast);
9110 MIRBuilder.buildBitcast(Dst, Src: RBIT);
9111 } else {
9112 MachineInstrBuilder BSWAP =
9113 MIRBuilder.buildInstr(Opc: TargetOpcode::G_BSWAP, DstOps: {SrcTy}, SrcOps: {Src});
9114
9115 // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
9116 // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
9117 // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
9118 MachineInstrBuilder Swap4 = SwapN(N: 4, Dst: SrcTy, B&: MIRBuilder, Src: BSWAP,
9119 Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xF0)));
9120
9121 // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
9122 // [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
9123 // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
9124 MachineInstrBuilder Swap2 = SwapN(N: 2, Dst: SrcTy, B&: MIRBuilder, Src: Swap4,
9125 Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xCC)));
9126
9127 // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
9128 // 6|7
9129 // [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
9130 // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
9131 SwapN(N: 1, Dst, B&: MIRBuilder, Src: Swap2, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xAA)));
9132 }
9133 } else {
9134 // Expand bitreverse for types smaller than 8 bits.
9135 MachineInstrBuilder Tmp;
9136 for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
9137 MachineInstrBuilder Tmp2;
9138 if (I < J) {
9139 auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: J - I);
9140 Tmp2 = MIRBuilder.buildShl(Dst: SrcTy, Src0: Src, Src1: ShAmt);
9141 } else {
9142 auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: I - J);
9143 Tmp2 = MIRBuilder.buildLShr(Dst: SrcTy, Src0: Src, Src1: ShAmt);
9144 }
9145
9146 auto Mask = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << J);
9147 Tmp2 = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Tmp2, Src1: Mask);
9148 if (I == 0)
9149 Tmp = Tmp2;
9150 else
9151 Tmp = MIRBuilder.buildOr(Dst: SrcTy, Src0: Tmp, Src1: Tmp2);
9152 }
9153 MIRBuilder.buildCopy(Res: Dst, Op: Tmp);
9154 }
9155
9156 MI.eraseFromParent();
9157 return Legalized;
9158}
9159
9160LegalizerHelper::LegalizeResult
9161LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
9162 MachineFunction &MF = MIRBuilder.getMF();
9163
9164 bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
9165 int NameOpIdx = IsRead ? 1 : 0;
9166 int ValRegIndex = IsRead ? 0 : 1;
9167
9168 Register ValReg = MI.getOperand(i: ValRegIndex).getReg();
9169 const LLT Ty = MRI.getType(Reg: ValReg);
9170 const MDString *RegStr = cast<MDString>(
9171 Val: cast<MDNode>(Val: MI.getOperand(i: NameOpIdx).getMetadata())->getOperand(I: 0));
9172
9173 Register PhysReg = TLI.getRegisterByName(RegName: RegStr->getString().data(), Ty, MF);
9174 if (!PhysReg) {
9175 const Function &Fn = MF.getFunction();
9176 Fn.getContext().diagnose(DI: DiagnosticInfoGenericWithLoc(
9177 "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
9178 (IsRead ? "llvm.read_register" : "llvm.write_register"),
9179 Fn, MI.getDebugLoc()));
9180 if (IsRead)
9181 MIRBuilder.buildUndef(Res: ValReg);
9182
9183 MI.eraseFromParent();
9184 return Legalized;
9185 }
9186
9187 if (IsRead)
9188 MIRBuilder.buildCopy(Res: ValReg, Op: PhysReg);
9189 else
9190 MIRBuilder.buildCopy(Res: PhysReg, Op: ValReg);
9191
9192 MI.eraseFromParent();
9193 return Legalized;
9194}
9195
9196LegalizerHelper::LegalizeResult
9197LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
9198 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
9199 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
9200 Register Result = MI.getOperand(i: 0).getReg();
9201 LLT OrigTy = MRI.getType(Reg: Result);
9202 auto SizeInBits = OrigTy.getScalarSizeInBits();
9203 LLT WideTy = OrigTy.changeElementSize(NewEltSize: SizeInBits * 2);
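  // e.g. for s32 G_SMULH this sign-extends (for G_UMULH: zero-extends) both
  // operands to s64, does a 64-bit multiply, shifts the product right by 32
  // and truncates, leaving the high half of the full product.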
9204
9205 auto LHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 1)});
9206 auto RHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
9207 auto Mul = MIRBuilder.buildMul(Dst: WideTy, Src0: LHS, Src1: RHS);
9208 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
9209
9210 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: SizeInBits);
9211 auto Shifted = MIRBuilder.buildInstr(Opc: ShiftOp, DstOps: {WideTy}, SrcOps: {Mul, ShiftAmt});
9212 MIRBuilder.buildTrunc(Res: Result, Op: Shifted);
9213
9214 MI.eraseFromParent();
9215 return Legalized;
9216}
9217
9218LegalizerHelper::LegalizeResult
9219LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
9220 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
9221 FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(i: 2).getImm());
9222
9223 if (Mask == fcNone) {
9224 MIRBuilder.buildConstant(Res: DstReg, Val: 0);
9225 MI.eraseFromParent();
9226 return Legalized;
9227 }
9228 if (Mask == fcAllFlags) {
9229 MIRBuilder.buildConstant(Res: DstReg, Val: 1);
9230 MI.eraseFromParent();
9231 return Legalized;
9232 }
9233
9234 // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
9235 // version
9236
9237 unsigned BitSize = SrcTy.getScalarSizeInBits();
9238 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
9239
9240 LLT IntTy = LLT::scalar(SizeInBits: BitSize);
9241 if (SrcTy.isVector())
9242 IntTy = LLT::vector(EC: SrcTy.getElementCount(), ScalarTy: IntTy);
9243 auto AsInt = MIRBuilder.buildCopy(Res: IntTy, Op: SrcReg);
9244
9245 // Various masks.
9246 APInt SignBit = APInt::getSignMask(BitWidth: BitSize);
9247 APInt ValueMask = APInt::getSignedMaxValue(numBits: BitSize); // All bits but sign.
9248 APInt Inf = APFloat::getInf(Sem: Semantics).bitcastToAPInt(); // Exp and int bit.
9249 APInt ExpMask = Inf;
9250 APInt AllOneMantissa = APFloat::getLargest(Sem: Semantics).bitcastToAPInt() & ~Inf;
9251 APInt QNaNBitMask =
9252 APInt::getOneBitSet(numBits: BitSize, BitNo: AllOneMantissa.getActiveBits() - 1);
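  // e.g. for f32: SignBit = 0x80000000, ValueMask = 0x7fffffff,
  // Inf/ExpMask = 0x7f800000, AllOneMantissa = 0x007fffff and
  // QNaNBitMask = 0x00400000.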
9253 APInt InvertionMask = APInt::getAllOnes(numBits: DstTy.getScalarSizeInBits());
9254
9255 auto SignBitC = MIRBuilder.buildConstant(Res: IntTy, Val: SignBit);
9256 auto ValueMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ValueMask);
9257 auto InfC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf);
9258 auto ExpMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ExpMask);
9259 auto ZeroC = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
9260
9261 auto Abs = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ValueMaskC);
9262 auto Sign =
9263 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: DstTy, Op0: AsInt, Op1: Abs);
9264
9265 auto Res = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
9266 // Clang doesn't support capture of structured bindings:
9267 LLT DstTyCopy = DstTy;
9268 const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
9269 Res = MIRBuilder.buildOr(Dst: DstTyCopy, Src0: Res, Src1: ToAppend);
9270 };
9271
9272 // Tests that involve more than one class should be processed first.
9273 if ((Mask & fcFinite) == fcFinite) {
9274 // finite(V) ==> abs(V) u< exp_mask
9275 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
9276 Op1: ExpMaskC));
9277 Mask &= ~fcFinite;
9278 } else if ((Mask & fcFinite) == fcPosFinite) {
9279 // finite(V) && V > 0 ==> V u< exp_mask
9280 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: AsInt,
9281 Op1: ExpMaskC));
9282 Mask &= ~fcPosFinite;
9283 } else if ((Mask & fcFinite) == fcNegFinite) {
9284 // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
9285 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
9286 Op1: ExpMaskC);
9287 auto And = MIRBuilder.buildAnd(Dst: DstTy, Src0: Cmp, Src1: Sign);
9288 appendToRes(And);
9289 Mask &= ~fcNegFinite;
9290 }
9291
9292 if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
9293 // fcZero | fcSubnormal => test all exponent bits are 0
9294 // TODO: Handle sign bit specific cases
9295 // TODO: Handle inverted case
9296 if (PartialCheck == (fcZero | fcSubnormal)) {
9297 auto ExpBits = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ExpMaskC);
9298 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
9299 Op0: ExpBits, Op1: ZeroC));
9300 Mask &= ~PartialCheck;
9301 }
9302 }
9303
9304 // Check for individual classes.
9305 if (FPClassTest PartialCheck = Mask & fcZero) {
9306 if (PartialCheck == fcPosZero)
9307 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
9308 Op0: AsInt, Op1: ZeroC));
9309 else if (PartialCheck == fcZero)
9310 appendToRes(
9311 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: ZeroC));
9312 else // fcNegZero
9313 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
9314 Op0: AsInt, Op1: SignBitC));
9315 }
9316
9317 if (FPClassTest PartialCheck = Mask & fcSubnormal) {
9318 // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
9319 // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
9320 auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
9321 auto OneC = MIRBuilder.buildConstant(Res: IntTy, Val: 1);
9322 auto VMinusOne = MIRBuilder.buildSub(Dst: IntTy, Src0: V, Src1: OneC);
9323 auto SubnormalRes =
9324 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: VMinusOne,
9325 Op1: MIRBuilder.buildConstant(Res: IntTy, Val: AllOneMantissa));
9326 if (PartialCheck == fcNegSubnormal)
9327 SubnormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: SubnormalRes, Src1: Sign);
9328 appendToRes(SubnormalRes);
9329 }
9330
9331 if (FPClassTest PartialCheck = Mask & fcInf) {
9332 if (PartialCheck == fcPosInf)
9333 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
9334 Op0: AsInt, Op1: InfC));
9335 else if (PartialCheck == fcInf)
9336 appendToRes(
9337 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: InfC));
9338 else { // fcNegInf
9339 APInt NegInf = APFloat::getInf(Sem: Semantics, Negative: true).bitcastToAPInt();
9340 auto NegInfC = MIRBuilder.buildConstant(Res: IntTy, Val: NegInf);
9341 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
9342 Op0: AsInt, Op1: NegInfC));
9343 }
9344 }
9345
9346 if (FPClassTest PartialCheck = Mask & fcNan) {
9347 auto InfWithQnanBitC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf | QNaNBitMask);
9348 if (PartialCheck == fcNan) {
9349 // isnan(V) ==> abs(V) u> int(inf)
9350 appendToRes(
9351 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC));
9352 } else if (PartialCheck == fcQNan) {
9353 // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
9354 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGE, Res: DstTy, Op0: Abs,
9355 Op1: InfWithQnanBitC));
9356 } else { // fcSNan
9357 // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
9358 // abs(V) u< (unsigned(Inf) | quiet_bit)
9359 auto IsNan =
9360 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC);
9361 auto IsNotQnan = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy,
9362 Op0: Abs, Op1: InfWithQnanBitC);
9363 appendToRes(MIRBuilder.buildAnd(Dst: DstTy, Src0: IsNan, Src1: IsNotQnan));
9364 }
9365 }
9366
9367 if (FPClassTest PartialCheck = Mask & fcNormal) {
9368 // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
9369 // (max_exp-1))
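    // For f32 this gives ExpLSB = 0x00800000 and MaxExpMinusOne = 0x7f000000,
    // so the unsigned comparison passes exactly when the biased exponent is
    // in [1, 254].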
9370 APInt ExpLSB = ExpMask & ~(ExpMask.shl(shiftAmt: 1));
9371 auto ExpMinusOne = MIRBuilder.buildSub(
9372 Dst: IntTy, Src0: Abs, Src1: MIRBuilder.buildConstant(Res: IntTy, Val: ExpLSB));
9373 APInt MaxExpMinusOne = ExpMask - ExpLSB;
9374 auto NormalRes =
9375 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: ExpMinusOne,
9376 Op1: MIRBuilder.buildConstant(Res: IntTy, Val: MaxExpMinusOne));
9377 if (PartialCheck == fcNegNormal)
9378 NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: Sign);
9379 else if (PartialCheck == fcPosNormal) {
9380 auto PosSign = MIRBuilder.buildXor(
9381 Dst: DstTy, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: DstTy, Val: InvertionMask));
9382 NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: PosSign);
9383 }
9384 appendToRes(NormalRes);
9385 }
9386
9387 MIRBuilder.buildCopy(Res: DstReg, Op: Res);
9388 MI.eraseFromParent();
9389 return Legalized;
9390}
9391
9392LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
9393 // Implement G_SELECT in terms of XOR, AND, OR.
9394 auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
9395 MI.getFirst4RegLLTs();
9396
9397 bool IsEltPtr = DstTy.isPointerOrPointerVector();
9398 if (IsEltPtr) {
9399 LLT ScalarPtrTy = LLT::scalar(SizeInBits: DstTy.getScalarSizeInBits());
9400 LLT NewTy = DstTy.changeElementType(NewEltTy: ScalarPtrTy);
9401 Op1Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op1Reg).getReg(Idx: 0);
9402 Op2Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op2Reg).getReg(Idx: 0);
9403 DstTy = NewTy;
9404 }
9405
9406 if (MaskTy.isScalar()) {
9407 // Turn the scalar condition into a vector condition mask if needed.
9408
9409 Register MaskElt = MaskReg;
9410
9411 // The condition was potentially zero extended before, but we want a sign
9412 // extended boolean.
9413 if (MaskTy != LLT::scalar(SizeInBits: 1))
9414 MaskElt = MIRBuilder.buildSExtInReg(Res: MaskTy, Op: MaskElt, ImmOp: 1).getReg(Idx: 0);
9415
9416 // Continue the sign extension (or truncate) to match the data type.
9417 MaskElt =
9418 MIRBuilder.buildSExtOrTrunc(Res: DstTy.getScalarType(), Op: MaskElt).getReg(Idx: 0);
9419
9420 if (DstTy.isVector()) {
9421 // Generate a vector splat idiom.
9422 auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: DstTy, Src: MaskElt);
9423 MaskReg = ShufSplat.getReg(Idx: 0);
9424 } else {
9425 MaskReg = MaskElt;
9426 }
9427 MaskTy = DstTy;
9428 } else if (!DstTy.isVector()) {
9429 // Cannot handle the case where the mask is a vector and the dst is a scalar.
9430 return UnableToLegalize;
9431 }
9432
9433 if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
9434 return UnableToLegalize;
9435 }
9436
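  // Each mask lane is expected to be all ones or all zeros at this point (the
  // scalar path above arranges this by sign-extension), so a bitwise blend
  // selects between the operands: a true lane computes
  // (Op1 & ~0) | (Op2 & 0) == Op1.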
9437 auto NotMask = MIRBuilder.buildNot(Dst: MaskTy, Src0: MaskReg);
9438 auto NewOp1 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op1Reg, Src1: MaskReg);
9439 auto NewOp2 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op2Reg, Src1: NotMask);
9440 if (IsEltPtr) {
9441 auto Or = MIRBuilder.buildOr(Dst: DstTy, Src0: NewOp1, Src1: NewOp2);
9442 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
9443 } else {
9444 MIRBuilder.buildOr(Dst: DstReg, Src0: NewOp1, Src1: NewOp2);
9445 }
9446 MI.eraseFromParent();
9447 return Legalized;
9448}
9449
9450LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
9451 // Split DIVREM into individual instructions.
9452 unsigned Opcode = MI.getOpcode();
9453
9454 MIRBuilder.buildInstr(
9455 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
9456 : TargetOpcode::G_UDIV,
9457 DstOps: {MI.getOperand(i: 0).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
9458 MIRBuilder.buildInstr(
9459 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
9460 : TargetOpcode::G_UREM,
9461 DstOps: {MI.getOperand(i: 1).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
9462 MI.eraseFromParent();
9463 return Legalized;
9464}
9465
9466LegalizerHelper::LegalizeResult
9467LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
9468 // Expand %res = G_ABS %a into:
9469 // %v1 = G_ASHR %a, scalar_size-1
9470 // %v2 = G_ADD %a, %v1
9471 // %res = G_XOR %v2, %v1
9472 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
9473 Register OpReg = MI.getOperand(i: 1).getReg();
9474 auto ShiftAmt =
9475 MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - 1);
9476 auto Shift = MIRBuilder.buildAShr(Dst: DstTy, Src0: OpReg, Src1: ShiftAmt);
9477 auto Add = MIRBuilder.buildAdd(Dst: DstTy, Src0: OpReg, Src1: Shift);
9478 MIRBuilder.buildXor(Dst: MI.getOperand(i: 0).getReg(), Src0: Add, Src1: Shift);
9479 MI.eraseFromParent();
9480 return Legalized;
9481}
9482
9483LegalizerHelper::LegalizeResult
9484LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
9485 // Expand %res = G_ABS %a into:
9486 // %v1 = G_CONSTANT 0
9487 // %v2 = G_SUB %v1, %a
9488 // %res = G_SMAX %a, %v2
9489 Register SrcReg = MI.getOperand(i: 1).getReg();
9490 LLT Ty = MRI.getType(Reg: SrcReg);
9491 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9492 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg);
9493 MIRBuilder.buildSMax(Dst: MI.getOperand(i: 0), Src0: SrcReg, Src1: Sub);
9494 MI.eraseFromParent();
9495 return Legalized;
9496}
9497
9498LegalizerHelper::LegalizeResult
9499LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
9500 Register SrcReg = MI.getOperand(i: 1).getReg();
9501 Register DestReg = MI.getOperand(i: 0).getReg();
9502 LLT Ty = MRI.getType(Reg: SrcReg), IType = LLT::scalar(SizeInBits: 1);
9503 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
9504 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg).getReg(Idx: 0);
9505 auto ICmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: IType, Op0: SrcReg, Op1: Zero);
9506 MIRBuilder.buildSelect(Res: DestReg, Tst: ICmp, Op0: SrcReg, Op1: Sub);
9507 MI.eraseFromParent();
9508 return Legalized;
9509}
9510
9511LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
9512 Register SrcReg = MI.getOperand(i: 1).getReg();
9513 Register DstReg = MI.getOperand(i: 0).getReg();
9514
9515 LLT Ty = MRI.getType(Reg: DstReg);
9516
9517 // Reset sign bit
9518 MIRBuilder.buildAnd(
9519 Dst: DstReg, Src0: SrcReg,
9520 Src1: MIRBuilder.buildConstant(
9521 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getScalarSizeInBits())));
9522
9523 MI.eraseFromParent();
9524 return Legalized;
9525}
9526
9527LegalizerHelper::LegalizeResult
9528LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
9529 Register SrcReg = MI.getOperand(i: 1).getReg();
9530 LLT SrcTy = MRI.getType(Reg: SrcReg);
9531 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
9532
9533 // The source could be a scalar if the IR type was <1 x sN>.
9534 if (SrcTy.isScalar()) {
9535 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
9536 return UnableToLegalize; // FIXME: handle extension.
9537 // This can be just a plain copy.
9538 Observer.changingInstr(MI);
9539 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::COPY));
9540 Observer.changedInstr(MI);
9541 return Legalized;
9542 }
9543 return UnableToLegalize;
9544}
9545
9546LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
9547 MachineFunction &MF = *MI.getMF();
9548 const DataLayout &DL = MIRBuilder.getDataLayout();
9549 LLVMContext &Ctx = MF.getFunction().getContext();
9550 Register ListPtr = MI.getOperand(i: 1).getReg();
9551 LLT PtrTy = MRI.getType(Reg: ListPtr);
9552
9553 // ListPtr is a pointer to the head of the list. Load the address of the
9554 // current head of the list.
9555 Align PtrAlignment = DL.getABITypeAlign(Ty: getTypeForLLT(Ty: PtrTy, C&: Ctx));
9556 MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
9557 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: PtrTy, base_alignment: PtrAlignment);
9558 auto VAList = MIRBuilder.buildLoad(Res: PtrTy, Addr: ListPtr, MMO&: *PtrLoadMMO).getReg(Idx: 0);
9559
9560 const Align A(MI.getOperand(i: 2).getImm());
9561 LLT PtrTyAsScalarTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
9562 if (A > TLI.getMinStackArgumentAlignment()) {
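    // Round the list pointer up to the requested alignment: e.g. for
    // A == Align(16) this adds 15 and then clears the low Log2(16) == 4 bits.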
9563 Register AlignAmt =
9564 MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: A.value() - 1).getReg(Idx: 0);
9565 auto AddDst = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: AlignAmt);
9566 auto AndDst = MIRBuilder.buildMaskLowPtrBits(Res: PtrTy, Op0: AddDst, NumBits: Log2(A));
9567 VAList = AndDst.getReg(Idx: 0);
9568 }
9569
9570 // Increment the pointer, VAList, to the next vaarg.
9571 // The list should be bumped by the size of the element at the current
9572 // head of the list.
9573 Register Dst = MI.getOperand(i: 0).getReg();
9574 LLT LLTTy = MRI.getType(Reg: Dst);
9575 Type *Ty = getTypeForLLT(Ty: LLTTy, C&: Ctx);
9576 auto IncAmt =
9577 MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: DL.getTypeAllocSize(Ty));
9578 auto Succ = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: IncAmt);
9579
9580 // Store the incremented VAList back to the legalized pointer
9581 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9582 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOStore, MemTy: PtrTy, base_alignment: PtrAlignment);
9583 MIRBuilder.buildStore(Val: Succ, Addr: ListPtr, MMO&: *StoreMMO);
9584 // Load the actual argument out of the pointer VAList
9585 Align EltAlignment = DL.getABITypeAlign(Ty);
9586 MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
9587 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: LLTTy, base_alignment: EltAlignment);
9588 MIRBuilder.buildLoad(Res: Dst, Addr: VAList, MMO&: *EltLoadMMO);
9589
9590 MI.eraseFromParent();
9591 return Legalized;
9592}
9593
9594static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
9595 // On Darwin, -Os means optimize for size without hurting performance, so
9596 // only really optimize for size when -Oz (MinSize) is used.
9597 if (MF.getTarget().getTargetTriple().isOSDarwin())
9598 return MF.getFunction().hasMinSize();
9599 return MF.getFunction().hasOptSize();
9600}
9601
9602// Returns a list of types to use for memory op lowering in MemOps. A partial
9603// port of findOptimalMemOpLowering in TargetLowering.
9604static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
9605 unsigned Limit, const MemOp &Op,
9606 unsigned DstAS, unsigned SrcAS,
9607 const AttributeList &FuncAttributes,
9608 const TargetLowering &TLI) {
9609 if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
9610 return false;
9611
9612 LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
9613
9614 if (Ty == LLT()) {
9615 // Use the largest scalar type whose alignment constraints are satisfied.
9616 // We only need to check DstAlign here as SrcAlign is always greater than
9617 // or equal to DstAlign (or zero).
9618 Ty = LLT::scalar(SizeInBits: 64);
9619 if (Op.isFixedDstAlign())
9620 while (Op.getDstAlign() < Ty.getSizeInBytes() &&
9621 !TLI.allowsMisalignedMemoryAccesses(Ty, AddrSpace: DstAS, Alignment: Op.getDstAlign()))
9622 Ty = LLT::scalar(SizeInBits: Ty.getSizeInBytes());
9623 assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
9624 // FIXME: check for the largest legal type we can load/store to.
9625 }
9626
9627 unsigned NumMemOps = 0;
9628 uint64_t Size = Op.size();
9629 while (Size) {
9630 unsigned TySize = Ty.getSizeInBytes();
9631 while (TySize > Size) {
9632 // For now, only use non-vector loads / stores for the left-over pieces.
9633 LLT NewTy = Ty;
9634 // FIXME: check for mem op safety and legality of the types. Not all of
9635 // SDAGisms map cleanly to GISel concepts.
9636 if (NewTy.isVector())
9637 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
9638 NewTy = LLT::scalar(SizeInBits: llvm::bit_floor(Value: NewTy.getSizeInBits() - 1));
9639 unsigned NewTySize = NewTy.getSizeInBytes();
9640 assert(NewTySize > 0 && "Could not find appropriate type");
9641
9642 // If the new LLT cannot cover all of the remaining bits, then consider
9643 // issuing one (or a pair of) unaligned and overlapping loads / stores.
9644 unsigned Fast;
9645 // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
9646 MVT VT = getMVTForLLT(Ty);
9647 if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
9648 TLI.allowsMisalignedMemoryAccesses(
9649 VT, AddrSpace: DstAS, Alignment: Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
9650 Flags: MachineMemOperand::MONone, &Fast) &&
9651 Fast)
9652 TySize = Size;
9653 else {
9654 Ty = NewTy;
9655 TySize = NewTySize;
9656 }
9657 }
9658
9659 if (++NumMemOps > Limit)
9660 return false;
9661
9662 MemOps.push_back(x: Ty);
9663 Size -= TySize;
9664 }
9665
9666 return true;
9667}
9668
9669// Get a vectorized representation of the memset value operand, GISel edition.
9670static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
9671 MachineRegisterInfo &MRI = *MIB.getMRI();
9672 unsigned NumBits = Ty.getScalarSizeInBits();
9673 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
9674 if (!Ty.isVector() && ValVRegAndVal) {
9675 APInt Scalar = ValVRegAndVal->Value.trunc(width: 8);
9676 APInt SplatVal = APInt::getSplat(NewLen: NumBits, V: Scalar);
9677 return MIB.buildConstant(Res: Ty, Val: SplatVal).getReg(Idx: 0);
9678 }
9679
9680 // Extend the byte value to the larger type, and then multiply by a magic
9681 // value 0x010101... in order to replicate it across every byte.
9682 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
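  // e.g. a byte value 0xab zero-extended to 32 bits and multiplied by
  // 0x01010101 produces the splat 0xabababab.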
9683 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
9684 return MIB.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
9685 }
9686
9687 LLT ExtType = Ty.getScalarType();
9688 auto ZExt = MIB.buildZExtOrTrunc(Res: ExtType, Op: Val);
9689 if (NumBits > 8) {
9690 APInt Magic = APInt::getSplat(NewLen: NumBits, V: APInt(8, 0x01));
9691 auto MagicMI = MIB.buildConstant(Res: ExtType, Val: Magic);
9692 Val = MIB.buildMul(Dst: ExtType, Src0: ZExt, Src1: MagicMI).getReg(Idx: 0);
9693 }
9694
9695 // For vector types create a G_BUILD_VECTOR.
9696 if (Ty.isVector())
9697 Val = MIB.buildSplatBuildVector(Res: Ty, Src: Val).getReg(Idx: 0);
9698
9699 return Val;
9700}
9701
9702LegalizerHelper::LegalizeResult
9703LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
9704 uint64_t KnownLen, Align Alignment,
9705 bool IsVolatile) {
9706 auto &MF = *MI.getParent()->getParent();
9707 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9708 auto &DL = MF.getDataLayout();
9709 LLVMContext &C = MF.getFunction().getContext();
9710
9711 assert(KnownLen != 0 && "Have a zero length memset length!");
9712
9713 bool DstAlignCanChange = false;
9714 MachineFrameInfo &MFI = MF.getFrameInfo();
9715 bool OptSize = shouldLowerMemFuncForSize(MF);
9716
9717 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
9718 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
9719 DstAlignCanChange = true;
9720
9721 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
9722 std::vector<LLT> MemOps;
9723
9724 const auto &DstMMO = **MI.memoperands_begin();
9725 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
9726
9727 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
9728 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
9729
9730 if (!findGISelOptimalMemOpLowering(MemOps, Limit,
9731 Op: MemOp::Set(Size: KnownLen, DstAlignCanChange,
9732 DstAlign: Alignment,
9733 /*IsZeroMemset=*/IsZeroVal,
9734 /*IsVolatile=*/IsVolatile),
9735 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: ~0u,
9736 FuncAttributes: MF.getFunction().getAttributes(), TLI))
9737 return UnableToLegalize;
9738
9739 if (DstAlignCanChange) {
9740 // Get an estimate of the type from the LLT.
9741 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
9742 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
9743 if (NewAlign > Alignment) {
9744 Alignment = NewAlign;
9745 unsigned FI = FIDef->getOperand(i: 1).getIndex();
9746 // Give the stack frame object a larger alignment if needed.
9747 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
9748 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
9749 }
9750 }
9751
9752 MachineIRBuilder MIB(MI);
9753 // Find the largest store and generate the bit pattern for it.
9754 LLT LargestTy = MemOps[0];
9755 for (unsigned i = 1; i < MemOps.size(); i++)
9756 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
9757 LargestTy = MemOps[i];
9758
9759 // The memset stored value is always defined as an s8, so in order to make it
9760 // work with larger store types we need to repeat the bit pattern across the
9761 // wider type.
9762 Register MemSetValue = getMemsetValue(Val, Ty: LargestTy, MIB);
9763
9764 if (!MemSetValue)
9765 return UnableToLegalize;
9766
9767 // Generate the stores. For each store type in the list, we generate the
9768 // matching store of that type to the destination address.
9769 LLT PtrTy = MRI.getType(Reg: Dst);
9770 unsigned DstOff = 0;
9771 unsigned Size = KnownLen;
9772 for (unsigned I = 0; I < MemOps.size(); I++) {
9773 LLT Ty = MemOps[I];
9774 unsigned TySize = Ty.getSizeInBytes();
9775 if (TySize > Size) {
9776 // Issuing an unaligned load / store pair that overlaps with the previous
9777 // pair. Adjust the offset accordingly.
9778 assert(I == MemOps.size() - 1 && I != 0);
9779 DstOff -= TySize - Size;
9780 }
9781
9782 // If this store is smaller than the largest store, see whether we can get
9783 // the smaller value for free with a truncate.
9784 Register Value = MemSetValue;
9785 if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
9786 MVT VT = getMVTForLLT(Ty);
9787 MVT LargestVT = getMVTForLLT(Ty: LargestTy);
9788 if (!LargestTy.isVector() && !Ty.isVector() &&
9789 TLI.isTruncateFree(FromVT: LargestVT, ToVT: VT))
9790 Value = MIB.buildTrunc(Res: Ty, Op: MemSetValue).getReg(Idx: 0);
9791 else
9792 Value = getMemsetValue(Val, Ty, MIB);
9793 if (!Value)
9794 return UnableToLegalize;
9795 }
9796
9797 auto *StoreMMO = MF.getMachineMemOperand(MMO: &DstMMO, Offset: DstOff, Ty);
9798
9799 Register Ptr = Dst;
9800 if (DstOff != 0) {
9801 auto Offset =
9802 MIB.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: DstOff);
9803 Ptr = MIB.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
9804 }
9805
9806 MIB.buildStore(Val: Value, Addr: Ptr, MMO&: *StoreMMO);
9807 DstOff += Ty.getSizeInBytes();
9808 Size -= TySize;
9809 }
9810
9811 MI.eraseFromParent();
9812 return Legalized;
9813}
9814
9815LegalizerHelper::LegalizeResult
9816LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
9817 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9818
9819 auto [Dst, Src, Len] = MI.getFirst3Regs();
9820
9821 const auto *MMOIt = MI.memoperands_begin();
9822 const MachineMemOperand *MemOp = *MMOIt;
9823 bool IsVolatile = MemOp->isVolatile();
9824
9825 // See if this is a constant length copy
9826 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
9827 // FIXME: support dynamically sized G_MEMCPY_INLINE
9828 assert(LenVRegAndVal &&
9829 "inline memcpy with dynamic size is not yet supported");
9830 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
9831 if (KnownLen == 0) {
9832 MI.eraseFromParent();
9833 return Legalized;
9834 }
9835
9836 const auto &DstMMO = **MI.memoperands_begin();
9837 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
9838 Align DstAlign = DstMMO.getBaseAlign();
9839 Align SrcAlign = SrcMMO.getBaseAlign();
9840
9841 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
9842 IsVolatile);
9843}
9844
9845LegalizerHelper::LegalizeResult
9846LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
9847 uint64_t KnownLen, Align DstAlign,
9848 Align SrcAlign, bool IsVolatile) {
9849 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9850 return lowerMemcpy(MI, Dst, Src, KnownLen,
9851 Limit: std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
9852 IsVolatile);
9853}
9854
9855LegalizerHelper::LegalizeResult
9856LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
9857 uint64_t KnownLen, uint64_t Limit, Align DstAlign,
9858 Align SrcAlign, bool IsVolatile) {
9859 auto &MF = *MI.getParent()->getParent();
9860 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9861 auto &DL = MF.getDataLayout();
9862 LLVMContext &C = MF.getFunction().getContext();
9863
9864 assert(KnownLen != 0 && "Have a zero length memcpy length!");
9865
9866 bool DstAlignCanChange = false;
9867 MachineFrameInfo &MFI = MF.getFrameInfo();
9868 Align Alignment = std::min(a: DstAlign, b: SrcAlign);
9869
9870 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
9871 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
9872 DstAlignCanChange = true;
9873
9874 // FIXME: infer better src pointer alignment like SelectionDAG does here.
9875 // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
9876 // if the memcpy is in a tail call position.
9877
9878 std::vector<LLT> MemOps;
9879
9880 const auto &DstMMO = **MI.memoperands_begin();
9881 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
9882 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
9883 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
9884
9885 if (!findGISelOptimalMemOpLowering(
9886 MemOps, Limit,
9887 Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
9888 IsVolatile),
9889 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
9890 FuncAttributes: MF.getFunction().getAttributes(), TLI))
9891 return UnableToLegalize;
9892
9893 if (DstAlignCanChange) {
9894 // Get an estimate of the type from the LLT.
9895 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
9896 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
9897
9898 // Don't promote to an alignment that would require dynamic stack
9899 // realignment.
9900 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
9901 if (!TRI->hasStackRealignment(MF))
9902 if (MaybeAlign StackAlign = DL.getStackAlignment())
9903 NewAlign = std::min(a: NewAlign, b: *StackAlign);
9904
9905 if (NewAlign > Alignment) {
9906 Alignment = NewAlign;
9907 unsigned FI = FIDef->getOperand(i: 1).getIndex();
9908 // Give the stack frame object a larger alignment if needed.
9909 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
9910 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
9911 }
9912 }
9913
9914 LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
9915
9916 MachineIRBuilder MIB(MI);
9917 // Now we need to emit a pair of load and store for each of the types we've
9918 // collected. I.e. for each type, generate a load from the source pointer of
9919 // that type width, and then generate a corresponding store of the loaded
9920 // value to the dest buffer. This can result in a sequence of loads and stores
9921 // of mixed types, depending on what the target specifies as good types to use.
9922 unsigned CurrOffset = 0;
9923 unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // If this piece is wider than what remains, issue an unaligned load/store
    // pair that overlaps with the previous pair; back the offset up
    // accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
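    // The offset constant is declared outside the branch so the store below
    // can reuse the same register for its own pointer arithmetic.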
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero-length memmove!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(NewAlign, *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that all of the loads are emitted before any of the
  // stores, since the source and destination may overlap. Apart from that,
  // this loop does much the same thing as the memcpy codegen above.
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
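  // First pass: emit every load and remember the loaded values.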
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      auto Offset =
          MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
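  // Second pass: store the loaded values, walking the same offsets again.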
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      auto Offset =
          MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  const unsigned Opc = MI.getOpcode();
  // This combine is fairly complex, so it's not written with a separate
  // matcher function.
  assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
          Opc == TargetOpcode::G_MEMSET) &&
         "Expected memcpy-like instruction");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;

  Align DstAlign = MemOp->getBaseAlign();
  Align SrcAlign;
  auto [Dst, Src, Len] = MI.getFirst3Regs();
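  // Note that for G_MEMSET the second operand is the value to store, not a
  // source pointer; it is still bound to Src here and forwarded below.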

  if (Opc != TargetOpcode::G_MEMSET) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlign();
  }

  // See if this is a constant-length copy.
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return UnableToLegalize;
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  bool IsVolatile = MemOp->isVolatile();
  // Don't try to optimize volatile.
  if (IsVolatile)
    return UnableToLegalize;

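  // If the caller capped the inlinable length, leave longer copies alone.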
  if (MaxLen && KnownLen > MaxLen)
    return UnableToLegalize;

  if (Opc == TargetOpcode::G_MEMCPY) {
    auto &MF = *MI.getParent()->getParent();
    const auto &TLI = *MF.getSubtarget().getTargetLowering();
    bool OptSize = shouldLowerMemFuncForSize(MF);
    uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
    return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
                       IsVolatile);
  }
  if (Opc == TargetOpcode::G_MEMMOVE)
    return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (Opc == TargetOpcode::G_MEMSET)
    return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return UnableToLegalize;
}
