1//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file implements the LegalizerHelper class to legalize
10/// individual instructions and the LegalizeMachineIR wrapper pass for the
11/// primary legalization.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16#include "llvm/CodeGen/GlobalISel/CallLowering.h"
17#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
19#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24#include "llvm/CodeGen/GlobalISel/Utils.h"
25#include "llvm/CodeGen/LowLevelTypeUtils.h"
26#include "llvm/CodeGen/MachineConstantPool.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/CodeGen/MachineRegisterInfo.h"
29#include "llvm/CodeGen/RuntimeLibcallUtil.h"
30#include "llvm/CodeGen/TargetFrameLowering.h"
31#include "llvm/CodeGen/TargetInstrInfo.h"
32#include "llvm/CodeGen/TargetLowering.h"
33#include "llvm/CodeGen/TargetOpcodes.h"
34#include "llvm/CodeGen/TargetSubtargetInfo.h"
35#include "llvm/IR/Instructions.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/MathExtras.h"
38#include "llvm/Support/raw_ostream.h"
39#include "llvm/Target/TargetMachine.h"
40#include <numeric>
41#include <optional>
42
43#define DEBUG_TYPE "legalizer"
44
45using namespace llvm;
46using namespace LegalizeActions;
47using namespace MIPatternMatch;
48
49/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
50///
51/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
52/// with any leftover piece as type \p LeftoverTy
53///
54/// Returns -1 in the first element of the pair if the breakdown is not
55/// satisfiable.
56static std::pair<int, int>
57getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
58 assert(!LeftoverTy.isValid() && "this is an out argument");
59
60 unsigned Size = OrigTy.getSizeInBits();
61 unsigned NarrowSize = NarrowTy.getSizeInBits();
62 unsigned NumParts = Size / NarrowSize;
63 unsigned LeftoverSize = Size - NumParts * NarrowSize;
64 assert(Size > NarrowSize);
65
66 if (LeftoverSize == 0)
67 return {NumParts, 0};
68
69 if (NarrowTy.isVector()) {
70 unsigned EltSize = OrigTy.getScalarSizeInBits();
71 if (LeftoverSize % EltSize != 0)
72 return {-1, -1};
73 LeftoverTy = OrigTy.changeElementCount(
74 EC: ElementCount::getFixed(MinVal: LeftoverSize / EltSize));
75 } else {
76 LeftoverTy = LLT::scalar(SizeInBits: LeftoverSize);
77 }
78
79 int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
80 return std::make_pair(x&: NumParts, y&: NumLeftover);
81}
82
83static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
84
85 if (!Ty.isScalar())
86 return nullptr;
87
88 switch (Ty.getSizeInBits()) {
89 case 16:
90 return Type::getHalfTy(C&: Ctx);
91 case 32:
92 return Type::getFloatTy(C&: Ctx);
93 case 64:
94 return Type::getDoubleTy(C&: Ctx);
95 case 80:
96 return Type::getX86_FP80Ty(C&: Ctx);
97 case 128:
98 return Type::getFP128Ty(C&: Ctx);
99 default:
100 return nullptr;
101 }
102}
103
/// Construct a LegalizerHelper that obtains the LegalizerInfo from the
/// function's subtarget.
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder,
                                 const LibcallLoweringInfo *Libcalls)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), Libcalls(Libcalls) {}
111
/// Construct a LegalizerHelper with an explicitly provided LegalizerInfo and
/// optional value-tracking analysis.
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B,
                                 const LibcallLoweringInfo *Libcalls,
                                 GISelValueTracking *VT)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), Libcalls(Libcalls), VT(VT) {}
119
/// Perform a single legalization step on \p MI: query the LegalizerInfo for
/// the action to take and dispatch to the corresponding transformation.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics are handled by a target hook rather than the action table.
  if (isa<GIntrinsic>(Val: MI))
    return LI.legalizeIntrinsic(Helper&: *this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, TypeIdx: Step.TypeIdx, NarrowTy: Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, TypeIdx: Step.TypeIdx, WideTy: Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, TypeIdx: Step.TypeIdx, Ty: Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, TypeIdx: Step.TypeIdx, Ty: Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, TypeIdx: Step.TypeIdx, NarrowTy: Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, TypeIdx: Step.TypeIdx, MoreTy: Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(Helper&: *this, MI, LocObserver) ? Legalized
                                                           : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
164
/// Define \p DstReg (of type \p ResultTy) from \p PartRegs of type \p PartTy,
/// plus \p LeftoverRegs of type \p LeftoverTy covering any remainder.
/// \p LeftoverTy being invalid means there is no leftover piece.
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  // No leftover: the parts cover the result type exactly.
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: PartRegs);
      return;
    }

    // Vector result: concatenate sub-vectors, or build from scalar elements.
    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(Res: DstReg, Ops: PartRegs);
    else
      MIRBuilder.buildBuildVector(Res: DstReg, Ops: PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs);
    AllRegs.append(in_start: LeftoverRegs.begin(), in_end: LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, PartRegs: AllRegs);
  }

  // Scalar result with a leftover: break all pieces down to the common GCD
  // type, re-merge them to the LCM type, then extract the result bits.
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(OrigTy: getGCDType(OrigTy: ResultTy, TargetTy: LeftoverTy), TargetTy: PartTy);
  for (auto PartReg : concat<const Register>(Ranges&: PartRegs, Ranges&: LeftoverRegs))
    extractGCDType(Parts&: GCDRegs, GCDTy, SrcReg: PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(DstTy: ResultTy, NarrowTy: LeftoverTy, GCDTy, VRegs&: GCDRegs);
  buildWidenedRemergeToDst(DstReg, LCMTy: ResultLCMTy, RemergeRegs: GCDRegs);
}
200
201void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
202 Register Reg) {
203 LLT Ty = MRI.getType(Reg);
204 SmallVector<Register, 8> RegElts;
205 extractParts(Reg, Ty: Ty.getScalarType(), NumParts: Ty.getNumElements(), VRegs&: RegElts,
206 MIRBuilder, MRI);
207 Elts.append(RHS: RegElts);
208}
209
210/// Merge \p PartRegs with different types into \p DstReg.
211void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
212 ArrayRef<Register> PartRegs) {
213 SmallVector<Register, 8> AllElts;
214 for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
215 appendVectorElts(Elts&: AllElts, Reg: PartRegs[i]);
216
217 Register Leftover = PartRegs[PartRegs.size() - 1];
218 if (!MRI.getType(Reg: Leftover).isVector())
219 AllElts.push_back(Elt: Leftover);
220 else
221 appendVectorElts(Elts&: AllElts, Reg: Leftover);
222
223 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: AllElts);
224}
225
226/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
227static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
228 const MachineInstr &MI) {
229 assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
230
231 const int StartIdx = Regs.size();
232 const int NumResults = MI.getNumOperands() - 1;
233 Regs.resize(N: Regs.size() + NumResults);
234 for (int I = 0; I != NumResults; ++I)
235 Regs[StartIdx + I] = MI.getOperand(i: I).getReg();
236}
237
238void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
239 LLT GCDTy, Register SrcReg) {
240 LLT SrcTy = MRI.getType(Reg: SrcReg);
241 if (SrcTy == GCDTy) {
242 // If the source already evenly divides the result type, we don't need to do
243 // anything.
244 Parts.push_back(Elt: SrcReg);
245 } else {
246 // Need to split into common type sized pieces.
247 auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
248 getUnmergeResults(Regs&: Parts, MI: *Unmerge);
249 }
250}
251
252LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
253 LLT NarrowTy, Register SrcReg) {
254 LLT SrcTy = MRI.getType(Reg: SrcReg);
255 LLT GCDTy = getGCDType(OrigTy: getGCDType(OrigTy: SrcTy, TargetTy: NarrowTy), TargetTy: DstTy);
256 extractGCDType(Parts, GCDTy, SrcReg);
257 return GCDTy;
258}
259
/// Merge the \p GCDTy-typed pieces in \p VRegs into NarrowTy-typed registers
/// covering the least common multiple type of \p DstTy and \p NarrowTy,
/// padding with \p PadStrategy (G_ANYEXT, G_ZEXT or G_SEXT) when the sources
/// do not cover the LCM type. On return \p VRegs holds the NarrowTy pieces,
/// and the LCM type is returned.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(OrigTy: DstTy, TargetTy: NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(Res: GCDTy, Val: 0).getReg(Idx: 0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(Dst: GCDTy, Src0: VRegs.back(), Src1: ShiftAmt).getReg(Idx: 0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        // Ran out of source pieces; fill with the pad value.
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0).getReg(Idx: 0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: SubMerge).getReg(Idx: 0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
350
351void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
352 ArrayRef<Register> RemergeRegs) {
353 LLT DstTy = MRI.getType(Reg: DstReg);
354
355 // Create the merge to the widened source, and extract the relevant bits into
356 // the result.
357
358 if (DstTy == LCMTy) {
359 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: RemergeRegs);
360 return;
361 }
362
363 auto Remerge = MIRBuilder.buildMergeLikeInstr(Res: LCMTy, Ops: RemergeRegs);
364 if (DstTy.isScalar() && LCMTy.isScalar()) {
365 MIRBuilder.buildTrunc(Res: DstReg, Op: Remerge);
366 return;
367 }
368
369 if (LCMTy.isVector()) {
370 unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
371 SmallVector<Register, 8> UnmergeDefs(NumDefs);
372 UnmergeDefs[0] = DstReg;
373 for (unsigned I = 1; I != NumDefs; ++I)
374 UnmergeDefs[I] = MRI.createGenericVirtualRegister(Ty: DstTy);
375
376 MIRBuilder.buildUnmerge(Res: UnmergeDefs,
377 Op: MIRBuilder.buildMergeLikeInstr(Res: LCMTy, Ops: RemergeRegs));
378 return;
379 }
380
381 llvm_unreachable("unhandled case");
382}
383
/// Map a generic opcode plus a scalar bit width \p Size to the corresponding
/// runtime library call enumerator. Unreachable on unsupported sizes or
/// opcodes.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
// Dispatch on Size for integer libcall variants (32/64/128-bit).
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

// Dispatch on Size for floating-point libcall variants (adds 80-bit).
#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_LROUND:
    RTLIBCASE(LROUND_F);
  case TargetOpcode::G_LLROUND:
    RTLIBCASE(LLROUND_F);
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FSINCOS:
    RTLIBCASE(SINCOS_F);
  case TargetOpcode::G_FMODF:
    RTLIBCASE(MODF_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FMINIMUMNUM:
    RTLIBCASE(FMINIMUM_NUM_F);
  case TargetOpcode::G_FMAXIMUMNUM:
    RTLIBCASE(FMAXIMUM_NUM_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}
519
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Val: Attribute::NoAlias)
          .removeAttribute(Val: Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Kind: Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Kind: Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(It: MI.getIterator(), End: MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    // bzero does not return its destination argument, so it can never be a
    // `thisreturn` callee.
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(i: 0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(i: 1).getReg())
      return false;

    Register PReg = Next->getOperand(i: 0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(It: Next, End: MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(i: 0).isReg() || PReg != Ret->getOperand(i: 0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(Inst: *Next) || !Next->isReturn())
    return false;

  return true;
}
585
/// Emit a call to the external function \p Name using calling convention
/// \p CC. If \p MI is non-null and sits in tail position, the call is lowered
/// as a tail call and the now-redundant trailing return (and any COPY) is
/// erased.
LegalizerHelper::LegalizeResult LegalizerHelper::createLibcall(
    const char *Name, const CallLowering::ArgInfo &Result,
    ArrayRef<CallLowering::ArgInfo> Args, const CallingConv::ID CC,
    LostDebugLocObserver &LocObserver, MachineInstr *MI) const {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(SymName: Name);
  Info.OrigRet = Result;
  // A tail call is only possible when the libcall's return type matches the
  // caller's return type (or is void) and MI is in tail position.
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, MI&: *MI, TII: MIRBuilder.getTII(),
                                MRI&: *MIRBuilder.getMRI());

  llvm::append_range(C&: Info.OrigArgs, R&: Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(CheckDebugLocs: true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(CheckDebugLocs: false);
  }
  return LegalizerHelper::Legalized;
}
630
631LegalizerHelper::LegalizeResult LegalizerHelper::createLibcall(
632 RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result,
633 ArrayRef<CallLowering::ArgInfo> Args, LostDebugLocObserver &LocObserver,
634 MachineInstr *MI) const {
635 if (!Libcalls)
636 return LegalizerHelper::UnableToLegalize;
637
638 RTLIB::LibcallImpl LibcallImpl = Libcalls->getLibcallImpl(Call: Libcall);
639 if (LibcallImpl == RTLIB::Unsupported)
640 return LegalizerHelper::UnableToLegalize;
641
642 StringRef Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: LibcallImpl);
643 const CallingConv::ID CC = Libcalls->getLibcallImplCallingConv(Call: LibcallImpl);
644 return createLibcall(Name: Name.data(), Result, Args, CC, LocObserver, MI);
645}
646
647// Useful for libcalls where all operands have the same type.
648LegalizerHelper::LegalizeResult
649LegalizerHelper::simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
650 unsigned Size, Type *OpType,
651 LostDebugLocObserver &LocObserver) const {
652 auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
653
654 // FIXME: What does the original arg index mean here?
655 SmallVector<CallLowering::ArgInfo, 3> Args;
656 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands()))
657 Args.push_back(Elt: {MO.getReg(), OpType, 0});
658 return createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), OpType, 0}, Args,
659 LocObserver, MI: &MI);
660}
661
/// Lower a sincos-style instruction to a libcall that writes both results
/// through stack out-parameters, then load the sin and cos values back into
/// the instruction's destination registers.
LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
    MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType,
    LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = *MI.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstSin = MI.getOperand(i: 0).getReg();
  Register DstCos = MI.getOperand(i: 1).getReg();
  Register Src = MI.getOperand(i: 2).getReg();
  LLT DstTy = MRI.getType(Reg: DstSin);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(Type: DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  // Stack slots the libcall writes its two results into.
  Register StackPtrSin =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: MemSize), Alignment, PtrInfo)
          .getReg(Idx: 0);
  Register StackPtrCos =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: MemSize), Alignment, PtrInfo)
          .getReg(Idx: 0);

  // Call signature: void(fp value, fp *sin_out, fp *cos_out).
  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult = createLibcall(
      Libcall: getRTLibDesc(Opcode: MI.getOpcode(), Size), Result: {{0}, Type::getVoidTy(C&: Ctx), 0},
      Args: {{Src, OpType, 0},
       {StackPtrSin, PointerType::get(C&: Ctx, AddressSpace: AddrSpace), 1},
       {StackPtrCos, PointerType::get(C&: Ctx, AddressSpace: AddrSpace), 2}},
      LocObserver, MI: &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  // Read the results back out of the stack temporaries.
  MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: MemSize, BaseAlignment: Alignment);
  MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: MemSize, BaseAlignment: Alignment);

  MIRBuilder.buildLoad(Res: DstSin, Addr: StackPtrSin, MMO&: *LoadMMOSin);
  MIRBuilder.buildLoad(Res: DstCos, Addr: StackPtrCos, MMO&: *LoadMMOCos);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}
708
/// Lower a modf-style instruction to a libcall that returns the fractional
/// part directly and writes the integral part through a stack out-parameter,
/// which is then loaded into the second destination register.
LegalizerHelper::LegalizeResult
LegalizerHelper::emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
                                 unsigned Size, Type *OpType,
                                 LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstFrac = MI.getOperand(i: 0).getReg();
  Register DstInt = MI.getOperand(i: 1).getReg();
  Register Src = MI.getOperand(i: 2).getReg();
  LLT DstTy = MRI.getType(Reg: DstFrac);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(Type: DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  // Stack slot for the integral-part out-parameter.
  Register StackPtrInt =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: MemSize), Alignment, PtrInfo)
          .getReg(Idx: 0);

  // Call signature: fp frac = modf(fp value, fp *int_out).
  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult = createLibcall(
      Libcall: getRTLibDesc(Opcode: MI.getOpcode(), Size), Result: {DstFrac, OpType, 0},
      Args: {{Src, OpType, 0}, {StackPtrInt, PointerType::get(C&: Ctx, AddressSpace: AddrSpace), 1}},
      LocObserver, MI: &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  // Read the integral part back out of the stack temporary.
  MachineMemOperand *LoadMMOInt = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: MemSize, BaseAlignment: Alignment);

  MIRBuilder.buildLoad(Res: DstInt, Addr: StackPtrInt, MMO&: *LoadMMOInt);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}
748
749static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
750 Type *FromType) {
751 auto ToMVT = MVT::getVT(Ty: ToType);
752 auto FromMVT = MVT::getVT(Ty: FromType);
753
754 switch (Opcode) {
755 case TargetOpcode::G_FPEXT:
756 return RTLIB::getFPEXT(OpVT: FromMVT, RetVT: ToMVT);
757 case TargetOpcode::G_FPTRUNC:
758 return RTLIB::getFPROUND(OpVT: FromMVT, RetVT: ToMVT);
759 case TargetOpcode::G_FPTOSI:
760 return RTLIB::getFPTOSINT(OpVT: FromMVT, RetVT: ToMVT);
761 case TargetOpcode::G_FPTOUI:
762 return RTLIB::getFPTOUINT(OpVT: FromMVT, RetVT: ToMVT);
763 case TargetOpcode::G_SITOFP:
764 return RTLIB::getSINTTOFP(OpVT: FromMVT, RetVT: ToMVT);
765 case TargetOpcode::G_UITOFP:
766 return RTLIB::getUINTTOFP(OpVT: FromMVT, RetVT: ToMVT);
767 }
768 llvm_unreachable("Unsupported libcall function");
769}
770
771LegalizerHelper::LegalizeResult LegalizerHelper::conversionLibcall(
772 MachineInstr &MI, Type *ToType, Type *FromType,
773 LostDebugLocObserver &LocObserver, bool IsSigned) const {
774 CallLowering::ArgInfo Arg = {MI.getOperand(i: 1).getReg(), FromType, 0};
775 if (FromType->isIntegerTy()) {
776 if (TLI.shouldSignExtendTypeInLibCall(Ty: FromType, IsSigned))
777 Arg.Flags[0].setSExt();
778 else
779 Arg.Flags[0].setZExt();
780 }
781
782 RTLIB::Libcall Libcall = getConvRTLibDesc(Opcode: MI.getOpcode(), ToType, FromType);
783 return createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), ToType, 0}, Args: Arg,
784 LocObserver, MI: &MI);
785}
786
/// Lower a G_MEMCPY/G_MEMMOVE/G_MEMSET/G_BZERO instruction to the matching
/// runtime library call, possibly as a tail call (erasing the trailing
/// return) when the instruction's tail-call immediate is set and the call is
/// in tail position.
LegalizerHelper::LegalizeResult
LegalizerHelper::createMemLibcall(MachineRegisterInfo &MRI, MachineInstr &MI,
                                  LostDebugLocObserver &LocObserver) const {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(C&: Ctx, AddressSpace: OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(C&: Ctx, NumBits: OpLLT.getSizeInBits());
    Args.push_back(Elt: {Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    // memcpy returns its destination argument.
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }

  if (!Libcalls) // FIXME: Should be mandatory
    return LegalizerHelper::UnableToLegalize;

  RTLIB::LibcallImpl RTLibcallImpl = Libcalls->getLibcallImpl(Call: RTLibcall);

  // Unsupported libcall on the target.
  if (RTLibcallImpl == RTLIB::Unsupported) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = Libcalls->getLibcallImplCallingConv(Call: RTLibcallImpl);

  StringRef LibcallName =
      RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: RTLibcallImpl);
  Info.Callee = MachineOperand::CreateES(SymName: LibcallName.data());
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0);
  // Tail call only when the instruction's trailing 'tail' immediate is set
  // and the instruction is actually in tail position.
  Info.IsTailCall =
      MI.getOperand(i: MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Result: Info.OrigRet, MI, TII: MIRBuilder.getTII(), MRI);

  llvm::append_range(C&: Info.OrigArgs, R&: Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(CheckDebugLocs: true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(CheckDebugLocs: false);
  }

  return LegalizerHelper::Legalized;
}
881
/// Map an atomic instruction \p MI to the matching "outline atomic" runtime
/// libcall, selected by both the memory access size and the merged atomic
/// ordering of the instruction's MMO.
///
/// Returns RTLIB::UNKNOWN_LIBCALL when no outline helper exists for the
/// opcode, or when the memory type is a vector.
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(Val&: MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  // Outline atomic helpers only exist for scalar memory types.
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

// LCALLS expands to the four ordering variants (relaxed, acquire, release,
// acq_rel) of one access size; LCALL5 expands that over the five supported
// sizes (1, 2, 4, 8 and 16 bytes), producing the [5][4] tables below that
// getOutlineAtomicHelper indexes by size and ordering.
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A) \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    // SUB shares the LDADD helper; createAtomicLibcall negates the operand.
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    // AND is implemented via LDCLR (and-with-complement); createAtomicLibcall
    // inverts the operand before the call.
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
928
/// Lower an atomic instruction to a call to one of the outline atomic
/// helpers chosen by getOutlineAtomicLibcall.
///
/// Builds the return register list / return type and the argument list for
/// the opcode, then emits the call through CallLowering. Does not erase
/// \p MI; the caller (libcall()) is responsible for that.
LegalizerHelper::LegalizeResult
LegalizerHelper::createAtomicLibcall(MachineInstr &MI) const {
  auto &Ctx = MIRBuilder.getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    // Decode as the plain 4-register form first; re-tie below if the
    // instruction also produces a success flag (5-register form).
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Elt: Ret);
    RetTy = IntegerType::get(C&: Ctx, NumBits: RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(args&: Ret, args&: RetLLT, args&: Success, args&: SuccessLLT, args&: Mem, args&: MemLLT, args&: Cmp, args&: CmpLLT, args&: New,
               args&: NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Elt: Success);
      // The call returns {old value, success} as a two-element struct.
      RetTy = StructType::get(
          Context&: Ctx, Elements: {RetTy, IntegerType::get(C&: Ctx, NumBits: SuccessLLT.getSizeInBits())});
    }
    // Outline CAS argument order: expected value, new value, pointer.
    Args.push_back(Elt: {Cmp, IntegerType::get(C&: Ctx, NumBits: CmpLLT.getSizeInBits()), 0});
    Args.push_back(Elt: {New, IntegerType::get(C&: Ctx, NumBits: NewLLT.getSizeInBits()), 0});
    Args.push_back(Elt: {Mem, PointerType::get(C&: Ctx, AddressSpace: MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Elt: Ret);
    RetTy = IntegerType::get(C&: Ctx, NumBits: RetLLT.getSizeInBits());
    // AND maps to the LDCLR (and-with-complement) helper, so pass ~Val;
    // SUB maps to LDADD, so pass -Val (see getOutlineAtomicLibcall).
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(Dst: ValLLT, Src0: MIRBuilder.buildConstant(Res: ValLLT, Val: -1), Src1: Val)
              .getReg(Idx: 0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(Dst: ValLLT, Src0: MIRBuilder.buildConstant(Res: ValLLT, Val: 0), Src1: Val)
              .getReg(Idx: 0);
    Args.push_back(Elt: {Val, IntegerType::get(C&: Ctx, NumBits: ValLLT.getSizeInBits()), 0});
    Args.push_back(Elt: {Mem, PointerType::get(C&: Ctx, AddressSpace: MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  if (!Libcalls) // FIXME: Should be mandatory
    return LegalizerHelper::UnableToLegalize;

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  RTLIB::LibcallImpl RTLibcallImpl = Libcalls->getLibcallImpl(Call: RTLibcall);

  // Unsupported libcall on the target.
  if (RTLibcallImpl == RTLIB::Unsupported) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = Libcalls->getLibcallImplCallingConv(Call: RTLibcallImpl);

  StringRef LibcallName =
      RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: RTLibcallImpl);
  Info.Callee = MachineOperand::CreateES(SymName: LibcallName.data());
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  llvm::append_range(C&: Info.OrigArgs, R&: Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}
1011
1012static RTLIB::Libcall
1013getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
1014 RTLIB::Libcall RTLibcall;
1015 switch (MI.getOpcode()) {
1016 case TargetOpcode::G_GET_FPENV:
1017 RTLibcall = RTLIB::FEGETENV;
1018 break;
1019 case TargetOpcode::G_SET_FPENV:
1020 case TargetOpcode::G_RESET_FPENV:
1021 RTLibcall = RTLIB::FESETENV;
1022 break;
1023 case TargetOpcode::G_GET_FPMODE:
1024 RTLibcall = RTLIB::FEGETMODE;
1025 break;
1026 case TargetOpcode::G_SET_FPMODE:
1027 case TargetOpcode::G_RESET_FPMODE:
1028 RTLibcall = RTLIB::FESETMODE;
1029 break;
1030 default:
1031 llvm_unreachable("Unexpected opcode");
1032 }
1033 return RTLibcall;
1034}
1035
1036// Some library functions that read FP state (fegetmode, fegetenv) write the
1037// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as an integer value. To implement
// these intrinsics via the library functions, we need to use a temporary
// variable, for example:
1040// for example:
1041//
1042// %0:_(s32) = G_GET_FPMODE
1043//
1044// is transformed to:
1045//
1046// %1:_(p0) = G_FRAME_INDEX %stack.0
1047// BL &fegetmode
//   %0:_(s32) = G_LOAD %1
1049//
/// Lower a G_GET_FPENV / G_GET_FPMODE instruction to a library call
/// (fegetenv / fegetmode) that writes the state into a stack temporary,
/// followed by a load of that temporary into the instruction's destination
/// register (see the scheme described in the comment above).
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will put the read state.
  // Size and alignment are derived from the destination register's type.
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT StateTy = MRI.getType(Reg: Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(Type: StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(Bytes: StateSize, Alignment: TempAlign, PtrInfo&: TempPtrInfo);

  // Create a call to library function, with the temporary as an argument.
  // The call returns void; its only argument is the pointer to the temporary
  // in the alloca address space.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res = createLibcall(
      Libcall: RTLibcall, Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
      Args: CallLowering::ArgInfo({Temp.getReg(Idx: 0), StatePtrTy, 0}), LocObserver,
      MI: nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary to materialize the state value in Dst.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: TempPtrInfo, f: MachineMemOperand::MOLoad, MemTy: StateTy, base_alignment: TempAlign);
  MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: Dst, Addr: Temp, MMO&: *MMO);

  return LegalizerHelper::Legalized;
}
1084
1085// Similar to `createGetStateLibcall` the function calls a library function
// using transient space on the stack. In this case the library function reads
1087// content of memory region.
/// Lower a G_SET_FPENV / G_SET_FPMODE instruction: store the new state value
/// into a stack temporary, then call the library function (fesetenv /
/// fesetmode) with a pointer to that temporary.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will get the new state.
  // Size and alignment are derived from the source register's type.
  Register Src = MI.getOperand(i: 0).getReg();
  LLT StateTy = MRI.getType(Reg: Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(Type: StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(Bytes: StateSize, Alignment: TempAlign, PtrInfo&: TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: TempPtrInfo, f: MachineMemOperand::MOStore, MemTy: StateTy, base_alignment: TempAlign);
  MIRBuilder.buildStore(Val: Src, Addr: Temp, MMO&: *MMO);

  // Create a call to library function, with the temporary as an argument.
  // The call returns void and takes the pointer in the alloca address space.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(Libcall: RTLibcall,
                       Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
                       Args: CallLowering::ArgInfo({Temp.getReg(Idx: 0), StatePtrTy, 0}),
                       LocObserver, MI: nullptr);
}
1118
1119/// Returns the corresponding libcall for the given Pred and
1120/// the ICMP predicate that should be generated to compare with #0
1121/// after the libcall.
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
// Expands to {size-selected soft-float comparison libcall, integer predicate
// to apply against 0 on the libcall's i32 result}. Only 32/64/128-bit
// operands have comparison libcalls.
#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred)                                 \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return {RTLIB::LibcallPrefix##32, ICmpPred};                             \
    case 64:                                                                   \
      return {RTLIB::LibcallPrefix##64, ICmpPred};                             \
    case 128:                                                                  \
      return {RTLIB::LibcallPrefix##128, ICmpPred};                            \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Pred) {
  case CmpInst::FCMP_OEQ:
    RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
  case CmpInst::FCMP_UNE:
    RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
  case CmpInst::FCMP_OGE:
    RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
  case CmpInst::FCMP_OLT:
    RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
  case CmpInst::FCMP_OLE:
    RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
  case CmpInst::FCMP_OGT:
    RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
  case CmpInst::FCMP_UNO:
    // "Unordered" libcall returns nonzero when either operand is NaN.
    RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
  default:
    // No single libcall for this predicate; createFCMPLibcall composes it
    // from multiple libcalls (or from the inverse predicate).
    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
  }
}
1157
/// Legalize a G_FCMP by emitting soft-float comparison libcall(s), each
/// followed by an integer compare of the i32 libcall result against zero
/// (see getFCMPLibcallDesc). Predicates without a direct libcall are
/// composed: UEQ = OEQ || UNO, ONE = !OEQ && !UNO, and ULT/UGE/UGT/ULE/ORD
/// via the inverse predicate. Does not erase \p MI; the caller does.
LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(Val: &MI);

  // Libcalls exist only for 32/64/128-bit operands of matching type.
  LLT OpLLT = MRI.getType(Reg: Cmp->getLHSReg());
  unsigned Size = OpLLT.getSizeInBits();
  if ((Size != 32 && Size != 64 && Size != 128) ||
      OpLLT != MRI.getType(Reg: Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, Ty: OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(Idx: 0);
  LLT DstTy = MRI.getType(Reg: DstReg);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP. Returns the register holding the
  // ICMP result (written to Res), or an invalid register on failure.
  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                const CmpInst::Predicate ICmpPred,
                                const DstOp &Res) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    LLT TempLLT = LLT::integer(SizeInBits: 32);
    Register Temp = MRI.createGenericVirtualRegister(Ty: TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        Libcall, Result: {Temp, Type::getInt32Ty(C&: Ctx), 0},
        Args: {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, MI: &MI);
    if (!Status)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(Pred: ICmpPred, Res, Op0: Temp, Op1: MIRBuilder.buildConstant(Res: TempLLT, Val: 0))
        .getReg(Idx: 0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Pred: Cond, Size);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_OEQ, Size);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_UNO, Size);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
    if (Oeq && Uno)
      MIRBuilder.buildOr(Dst: DstReg, Src0: Oeq, Src1: Uno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We inverse the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64 isel can even select two cmp into a single ccmp.
    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_OEQ, Size);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(pred: OeqPred), DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_UNO, Size);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(pred: UnoPred), DstTy);

    if (NotOeq && NotUno)
      MIRBuilder.buildAnd(Dst: DstReg, Src0: NotOeq, Src1: NotUno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Pred))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to the following, but saves some instructions.
    //   MIRBuilder.buildNot(
    //       PredTy,
    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
    //                            Op1, Op2));
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(Pred: CmpInst::getInversePredicate(pred: Cond), Size);
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(pred: InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}
1281
1282// The function is used to legalize operations that set default environment
// state. In the C library a call like `fesetmode(FE_DFL_MODE)` is used for
// that. On most targets supported in glibc, FE_DFL_MODE is defined as
// `((const femode_t *) -1)`. That assumption is used here. If for some target
1286// it is not true, the target must provide custom lowering.
1287LegalizerHelper::LegalizeResult
1288LegalizerHelper::createResetStateLibcall(MachineInstr &MI,
1289 LostDebugLocObserver &LocObserver) {
1290 const DataLayout &DL = MIRBuilder.getDataLayout();
1291 auto &MF = MIRBuilder.getMF();
1292 auto &Ctx = MF.getFunction().getContext();
1293
1294 // Create an argument for the library function.
1295 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
1296 Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: AddrSpace);
1297 unsigned PtrSize = DL.getPointerSizeInBits(AS: AddrSpace);
1298 LLT MemTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: PtrSize);
1299 auto DefValue = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrSize), Val: -1LL);
1300 DstOp Dest(MRI.createGenericVirtualRegister(Ty: MemTy));
1301 MIRBuilder.buildIntToPtr(Dst: Dest, Src: DefValue);
1302
1303 RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
1304 return createLibcall(
1305 Libcall: RTLibcall, Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
1306 Args: CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}), LocObserver, MI: &MI);
1307}
1308
/// Attempt to legalize \p MI by replacing it with a runtime library call.
///
/// Dispatches on the opcode to the appropriate emission helper. Cases that
/// `break` fall through to the common MI erasure at the bottom; cases that
/// `return` either erase MI themselves or delegate erasure to their helper.
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // Integer ops: the libcall operates on an integer type of the same width
    // as the result.
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(C&: Ctx, NumBits: Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    // FP ops: only sizes with a matching IR float type have libcalls.
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FSINCOS: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    // Returns directly; does not fall through to the common erasure below.
    return emitSincosLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
  }
  case TargetOpcode::G_FMODF: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    // Returns directly; does not fall through to the common erasure below.
    return emitModfLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
  }
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND:
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    // FP -> integer rounding: libcall selected by the FP operand size,
    // result converted to the integer type of the destination.
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    Type *ITy = IntegerType::get(
        C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
    LegalizeResult Status =
        createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), ITy, 0},
                      Args: {{MI.getOperand(i: 1).getReg(), HLTy, 0}}, LocObserver, MI: &MI);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP: {
    // Mixed FP/integer operands: FP value plus an integer power/exponent.
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    Type *ITy = IntegerType::get(
        C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
    SmallVector<CallLowering::ArgInfo, 2> Args = {
        {MI.getOperand(i: 1).getReg(), HLTy, 0},
        {MI.getOperand(i: 2).getReg(), ITy, 1}};
    // The integer operand is passed sign-extended.
    Args[1].Flags[0].setSExt();
    LegalizeResult Status = createLibcall(
        Libcall, Result: {MI.getOperand(i: 0).getReg(), HLTy, 0}, Args, LocObserver, MI: &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, ToType: ToTy, FromType: FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FCMP: {
    LegalizeResult Status = createFCMPLibcall(MI, LocObserver);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Status;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    Type *FromTy =
        getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
    unsigned ToSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, ToType: Type::getIntNTy(C&: Ctx, N: ToSize),
                                              FromType: FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    unsigned FromSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
    Type *ToTy =
        getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
      return UnableToLegalize;
    bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
    LegalizeResult Status = conversionLibcall(
        MI, ToType: ToTy, FromType: Type::getIntNTy(C&: Ctx, N: FromSize), LocObserver, IsSigned);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MRI&: *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result = createResetStateLibcall(MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  // Common exit for the `break` cases above: the libcall was emitted
  // successfully, so remove the original instruction.
  MI.eraseFromParent();
  return Legalized;
}
1537
1538LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1539 unsigned TypeIdx,
1540 LLT NarrowTy) {
1541 uint64_t SizeOp0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1542 uint64_t NarrowSize = NarrowTy.getSizeInBits();
1543
1544 switch (MI.getOpcode()) {
1545 default:
1546 return UnableToLegalize;
1547 case TargetOpcode::G_IMPLICIT_DEF: {
1548 Register DstReg = MI.getOperand(i: 0).getReg();
1549 LLT DstTy = MRI.getType(Reg: DstReg);
1550
1551 // If SizeOp0 is not an exact multiple of NarrowSize, emit
1552 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1553 // FIXME: Although this would also be legal for the general case, it causes
1554 // a lot of regressions in the emitted code (superfluous COPYs, artifact
1555 // combines not being hit). This seems to be a problem related to the
1556 // artifact combiner.
1557 if (SizeOp0 % NarrowSize != 0) {
1558 LLT ImplicitTy = DstTy.changeElementType(NewEltTy: NarrowTy);
1559 Register ImplicitReg = MIRBuilder.buildUndef(Res: ImplicitTy).getReg(Idx: 0);
1560 MIRBuilder.buildAnyExt(Res: DstReg, Op: ImplicitReg);
1561
1562 MI.eraseFromParent();
1563 return Legalized;
1564 }
1565
1566 int NumParts = SizeOp0 / NarrowSize;
1567
1568 SmallVector<Register, 2> DstRegs;
1569 for (int i = 0; i < NumParts; ++i)
1570 DstRegs.push_back(Elt: MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0));
1571
1572 if (DstTy.isVector())
1573 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
1574 else
1575 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
1576 MI.eraseFromParent();
1577 return Legalized;
1578 }
1579 case TargetOpcode::G_CONSTANT: {
1580 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1581 const APInt &Val = MI.getOperand(i: 1).getCImm()->getValue();
1582 unsigned TotalSize = Ty.getSizeInBits();
1583 unsigned NarrowSize = NarrowTy.getSizeInBits();
1584 int NumParts = TotalSize / NarrowSize;
1585
1586 SmallVector<Register, 4> PartRegs;
1587 for (int I = 0; I != NumParts; ++I) {
1588 unsigned Offset = I * NarrowSize;
1589 auto K = MIRBuilder.buildConstant(Res: NarrowTy,
1590 Val: Val.lshr(shiftAmt: Offset).trunc(width: NarrowSize));
1591 PartRegs.push_back(Elt: K.getReg(Idx: 0));
1592 }
1593
1594 LLT LeftoverTy;
1595 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1596 SmallVector<Register, 1> LeftoverRegs;
1597 if (LeftoverBits != 0) {
1598 LeftoverTy = LLT::scalar(SizeInBits: LeftoverBits);
1599 auto K = MIRBuilder.buildConstant(
1600 Res: LeftoverTy,
1601 Val: Val.lshr(shiftAmt: NumParts * NarrowSize).trunc(width: LeftoverBits));
1602 LeftoverRegs.push_back(Elt: K.getReg(Idx: 0));
1603 }
1604
1605 insertParts(DstReg: MI.getOperand(i: 0).getReg(),
1606 ResultTy: Ty, PartTy: NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1607
1608 MI.eraseFromParent();
1609 return Legalized;
1610 }
1611 case TargetOpcode::G_SEXT:
1612 case TargetOpcode::G_ZEXT:
1613 case TargetOpcode::G_ANYEXT:
1614 return narrowScalarExt(MI, TypeIdx, Ty: NarrowTy);
1615 case TargetOpcode::G_TRUNC: {
1616 if (TypeIdx != 1)
1617 return UnableToLegalize;
1618
1619 uint64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1620 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1621 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1622 return UnableToLegalize;
1623 }
1624
1625 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
1626 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: Unmerge.getReg(Idx: 0));
1627 MI.eraseFromParent();
1628 return Legalized;
1629 }
1630 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1631 case TargetOpcode::G_FREEZE: {
1632 if (TypeIdx != 0)
1633 return UnableToLegalize;
1634
1635 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1636 // Should widen scalar first
1637 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1638 return UnableToLegalize;
1639
1640 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1).getReg());
1641 SmallVector<Register, 8> Parts;
1642 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1643 Parts.push_back(
1644 Elt: MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy}, SrcOps: {Unmerge.getReg(Idx: i)})
1645 .getReg(Idx: 0));
1646 }
1647
1648 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: Parts);
1649 MI.eraseFromParent();
1650 return Legalized;
1651 }
1652 case TargetOpcode::G_ADD:
1653 case TargetOpcode::G_SUB:
1654 case TargetOpcode::G_SADDO:
1655 case TargetOpcode::G_SSUBO:
1656 case TargetOpcode::G_SADDE:
1657 case TargetOpcode::G_SSUBE:
1658 case TargetOpcode::G_UADDO:
1659 case TargetOpcode::G_USUBO:
1660 case TargetOpcode::G_UADDE:
1661 case TargetOpcode::G_USUBE:
1662 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1663 case TargetOpcode::G_MUL:
1664 case TargetOpcode::G_UMULH:
1665 return narrowScalarMul(MI, Ty: NarrowTy);
1666 case TargetOpcode::G_EXTRACT:
1667 return narrowScalarExtract(MI, TypeIdx, Ty: NarrowTy);
1668 case TargetOpcode::G_INSERT:
1669 return narrowScalarInsert(MI, TypeIdx, Ty: NarrowTy);
1670 case TargetOpcode::G_LOAD: {
1671 auto &LoadMI = cast<GLoad>(Val&: MI);
1672 Register DstReg = LoadMI.getDstReg();
1673 LLT DstTy = MRI.getType(Reg: DstReg);
1674 if (DstTy.isVector())
1675 return UnableToLegalize;
1676
1677 if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1678 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1679 MIRBuilder.buildLoad(Res: TmpReg, Addr: LoadMI.getPointerReg(), MMO&: LoadMI.getMMO());
1680 MIRBuilder.buildAnyExt(Res: DstReg, Op: TmpReg);
1681 LoadMI.eraseFromParent();
1682 return Legalized;
1683 }
1684
1685 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx, NarrowTy);
1686 }
1687 case TargetOpcode::G_ZEXTLOAD:
1688 case TargetOpcode::G_SEXTLOAD: {
1689 auto &LoadMI = cast<GExtLoad>(Val&: MI);
1690 Register DstReg = LoadMI.getDstReg();
1691 Register PtrReg = LoadMI.getPointerReg();
1692
1693 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1694 auto &MMO = LoadMI.getMMO();
1695 unsigned MemSize = MMO.getSizeInBits().getValue();
1696
1697 if (MemSize == NarrowSize) {
1698 MIRBuilder.buildLoad(Res: TmpReg, Addr: PtrReg, MMO);
1699 } else if (MemSize < NarrowSize) {
1700 MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: TmpReg, Addr: PtrReg, MMO);
1701 } else if (MemSize > NarrowSize) {
1702 // FIXME: Need to split the load.
1703 return UnableToLegalize;
1704 }
1705
1706 if (isa<GZExtLoad>(Val: LoadMI))
1707 MIRBuilder.buildZExt(Res: DstReg, Op: TmpReg);
1708 else
1709 MIRBuilder.buildSExt(Res: DstReg, Op: TmpReg);
1710
1711 LoadMI.eraseFromParent();
1712 return Legalized;
1713 }
1714 case TargetOpcode::G_STORE: {
1715 auto &StoreMI = cast<GStore>(Val&: MI);
1716
1717 Register SrcReg = StoreMI.getValueReg();
1718 LLT SrcTy = MRI.getType(Reg: SrcReg);
1719 if (SrcTy.isVector())
1720 return UnableToLegalize;
1721
1722 int NumParts = SizeOp0 / NarrowSize;
1723 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1724 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1725 if (SrcTy.isVector() && LeftoverBits != 0)
1726 return UnableToLegalize;
1727
1728 if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1729 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1730 MIRBuilder.buildTrunc(Res: TmpReg, Op: SrcReg);
1731 MIRBuilder.buildStore(Val: TmpReg, Addr: StoreMI.getPointerReg(), MMO&: StoreMI.getMMO());
1732 StoreMI.eraseFromParent();
1733 return Legalized;
1734 }
1735
1736 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy);
1737 }
1738 case TargetOpcode::G_SELECT:
1739 return narrowScalarSelect(MI, TypeIdx, Ty: NarrowTy);
1740 case TargetOpcode::G_AND:
1741 case TargetOpcode::G_OR:
1742 case TargetOpcode::G_XOR: {
1743 // Legalize bitwise operation:
1744 // A = BinOp<Ty> B, C
1745 // into:
1746 // B1, ..., BN = G_UNMERGE_VALUES B
1747 // C1, ..., CN = G_UNMERGE_VALUES C
1748 // A1 = BinOp<Ty/N> B1, C2
1749 // ...
1750 // AN = BinOp<Ty/N> BN, CN
1751 // A = G_MERGE_VALUES A1, ..., AN
1752 return narrowScalarBasic(MI, TypeIdx, Ty: NarrowTy);
1753 }
1754 case TargetOpcode::G_SHL:
1755 case TargetOpcode::G_LSHR:
1756 case TargetOpcode::G_ASHR:
1757 return narrowScalarShift(MI, TypeIdx, Ty: NarrowTy);
1758 case TargetOpcode::G_CTLZ:
1759 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1760 case TargetOpcode::G_CTTZ:
1761 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1762 case TargetOpcode::G_CTLS:
1763 case TargetOpcode::G_CTPOP:
1764 if (TypeIdx == 1)
1765 switch (MI.getOpcode()) {
1766 case TargetOpcode::G_CTLZ:
1767 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1768 return narrowScalarCTLZ(MI, TypeIdx, Ty: NarrowTy);
1769 case TargetOpcode::G_CTTZ:
1770 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1771 return narrowScalarCTTZ(MI, TypeIdx, Ty: NarrowTy);
1772 case TargetOpcode::G_CTPOP:
1773 return narrowScalarCTPOP(MI, TypeIdx, Ty: NarrowTy);
1774 case TargetOpcode::G_CTLS:
1775 return narrowScalarCTLS(MI, TypeIdx, Ty: NarrowTy);
1776 default:
1777 return UnableToLegalize;
1778 }
1779
1780 Observer.changingInstr(MI);
1781 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1782 Observer.changedInstr(MI);
1783 return Legalized;
1784 case TargetOpcode::G_INTTOPTR:
1785 if (TypeIdx != 1)
1786 return UnableToLegalize;
1787
1788 Observer.changingInstr(MI);
1789 narrowScalarSrc(MI, NarrowTy, OpIdx: 1);
1790 Observer.changedInstr(MI);
1791 return Legalized;
1792 case TargetOpcode::G_PTRTOINT:
1793 if (TypeIdx != 0)
1794 return UnableToLegalize;
1795
1796 Observer.changingInstr(MI);
1797 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1798 Observer.changedInstr(MI);
1799 return Legalized;
1800 case TargetOpcode::G_PHI: {
1801 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1802 // NarrowSize.
1803 if (SizeOp0 % NarrowSize != 0)
1804 return UnableToLegalize;
1805
1806 unsigned NumParts = SizeOp0 / NarrowSize;
1807 SmallVector<Register, 2> DstRegs(NumParts);
1808 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1809 Observer.changingInstr(MI);
1810 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1811 MachineBasicBlock &OpMBB = *MI.getOperand(i: i + 1).getMBB();
1812 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
1813 extractParts(Reg: MI.getOperand(i).getReg(), Ty: NarrowTy, NumParts,
1814 VRegs&: SrcRegs[i / 2], MIRBuilder, MRI);
1815 }
1816 MachineBasicBlock &MBB = *MI.getParent();
1817 MIRBuilder.setInsertPt(MBB, II: MI);
1818 for (unsigned i = 0; i < NumParts; ++i) {
1819 DstRegs[i] = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1820 MachineInstrBuilder MIB =
1821 MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI).addDef(RegNo: DstRegs[i]);
1822 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1823 MIB.addUse(RegNo: SrcRegs[j / 2][i]).add(MO: MI.getOperand(i: j + 1));
1824 }
1825 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
1826 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
1827 Observer.changedInstr(MI);
1828 MI.eraseFromParent();
1829 return Legalized;
1830 }
1831 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1832 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1833 if (TypeIdx != 2)
1834 return UnableToLegalize;
1835
1836 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1837 Observer.changingInstr(MI);
1838 narrowScalarSrc(MI, NarrowTy, OpIdx);
1839 Observer.changedInstr(MI);
1840 return Legalized;
1841 }
1842 case TargetOpcode::G_ICMP: {
1843 Register LHS = MI.getOperand(i: 2).getReg();
1844 LLT SrcTy = MRI.getType(Reg: LHS);
1845 CmpInst::Predicate Pred =
1846 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
1847
1848 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1849 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1850 if (!extractParts(Reg: LHS, RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy, VRegs&: LHSPartRegs,
1851 LeftoverVRegs&: LHSLeftoverRegs, MIRBuilder, MRI))
1852 return UnableToLegalize;
1853
1854 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1855 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1856 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy&: Unused,
1857 VRegs&: RHSPartRegs, LeftoverVRegs&: RHSLeftoverRegs, MIRBuilder, MRI))
1858 return UnableToLegalize;
1859
1860 // We now have the LHS and RHS of the compare split into narrow-type
1861 // registers, plus potentially some leftover type.
1862 Register Dst = MI.getOperand(i: 0).getReg();
1863 LLT ResTy = MRI.getType(Reg: Dst);
1864 if (ICmpInst::isEquality(P: Pred)) {
1865 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1866 // them together. For each equal part, the result should be all 0s. For
1867 // each non-equal part, we'll get at least one 1.
1868 auto Zero = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0);
1869 SmallVector<Register, 4> Xors;
1870 for (auto LHSAndRHS : zip(t&: LHSPartRegs, u&: RHSPartRegs)) {
1871 auto LHS = std::get<0>(t&: LHSAndRHS);
1872 auto RHS = std::get<1>(t&: LHSAndRHS);
1873 auto Xor = MIRBuilder.buildXor(Dst: NarrowTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1874 Xors.push_back(Elt: Xor);
1875 }
1876
1877 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1878 // to the desired narrow type so that we can OR them together later.
1879 SmallVector<Register, 4> WidenedXors;
1880 for (auto LHSAndRHS : zip(t&: LHSLeftoverRegs, u&: RHSLeftoverRegs)) {
1881 auto LHS = std::get<0>(t&: LHSAndRHS);
1882 auto RHS = std::get<1>(t&: LHSAndRHS);
1883 auto Xor = MIRBuilder.buildXor(Dst: LeftoverTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1884 LLT GCDTy = extractGCDType(Parts&: WidenedXors, DstTy: NarrowTy, NarrowTy: LeftoverTy, SrcReg: Xor);
1885 buildLCMMergePieces(DstTy: LeftoverTy, NarrowTy, GCDTy, VRegs&: WidenedXors,
1886 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1887 llvm::append_range(C&: Xors, R&: WidenedXors);
1888 }
1889
1890 // Now, for each part we broke up, we know if they are equal/not equal
1891 // based off the G_XOR. We can OR these all together and compare against
1892 // 0 to get the result.
1893 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1894 auto Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Xors[0], Src1: Xors[1]);
1895 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1896 Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Or, Src1: Xors[I]);
1897 MIRBuilder.buildICmp(Pred, Res: Dst, Op0: Or, Op1: Zero);
1898 } else {
1899 Register CmpIn;
1900 for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1901 Register CmpOut;
1902 CmpInst::Predicate PartPred;
1903
1904 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1905 PartPred = Pred;
1906 CmpOut = Dst;
1907 } else {
1908 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1909 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1910 }
1911
1912 if (!CmpIn) {
1913 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSPartRegs[I],
1914 Op1: RHSPartRegs[I]);
1915 } else {
1916 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSPartRegs[I],
1917 Op1: RHSPartRegs[I]);
1918 auto CmpEq = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1919 Op0: LHSPartRegs[I], Op1: RHSPartRegs[I]);
1920 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1921 }
1922
1923 CmpIn = CmpOut;
1924 }
1925
1926 for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1927 Register CmpOut;
1928 CmpInst::Predicate PartPred;
1929
1930 if (I == E - 1) {
1931 PartPred = Pred;
1932 CmpOut = Dst;
1933 } else {
1934 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1935 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1936 }
1937
1938 if (!CmpIn) {
1939 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSLeftoverRegs[I],
1940 Op1: RHSLeftoverRegs[I]);
1941 } else {
1942 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSLeftoverRegs[I],
1943 Op1: RHSLeftoverRegs[I]);
1944 auto CmpEq =
1945 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1946 Op0: LHSLeftoverRegs[I], Op1: RHSLeftoverRegs[I]);
1947 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1948 }
1949
1950 CmpIn = CmpOut;
1951 }
1952 }
1953 MI.eraseFromParent();
1954 return Legalized;
1955 }
1956 case TargetOpcode::G_FCMP:
1957 if (TypeIdx != 0)
1958 return UnableToLegalize;
1959
1960 Observer.changingInstr(MI);
1961 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1962 Observer.changedInstr(MI);
1963 return Legalized;
1964
1965 case TargetOpcode::G_SEXT_INREG: {
1966 if (TypeIdx != 0)
1967 return UnableToLegalize;
1968
1969 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
1970
1971 // So long as the new type has more bits than the bits we're extending we
1972 // don't need to break it apart.
1973 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1974 Observer.changingInstr(MI);
1975 // We don't lose any non-extension bits by truncating the src and
1976 // sign-extending the dst.
1977 MachineOperand &MO1 = MI.getOperand(i: 1);
1978 auto TruncMIB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO1);
1979 MO1.setReg(TruncMIB.getReg(Idx: 0));
1980
1981 MachineOperand &MO2 = MI.getOperand(i: 0);
1982 Register DstExt = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1983 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1984 MIRBuilder.buildSExt(Res: MO2, Op: DstExt);
1985 MO2.setReg(DstExt);
1986 Observer.changedInstr(MI);
1987 return Legalized;
1988 }
1989
1990 // Break it apart. Components below the extension point are unmodified. The
1991 // component containing the extension point becomes a narrower SEXT_INREG.
1992 // Components above it are ashr'd from the component containing the
1993 // extension point.
1994 if (SizeOp0 % NarrowSize != 0)
1995 return UnableToLegalize;
1996 int NumParts = SizeOp0 / NarrowSize;
1997
1998 // List the registers where the destination will be scattered.
1999 SmallVector<Register, 2> DstRegs;
2000 // List the registers where the source will be split.
2001 SmallVector<Register, 2> SrcRegs;
2002
2003 // Create all the temporary registers.
2004 for (int i = 0; i < NumParts; ++i) {
2005 Register SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2006
2007 SrcRegs.push_back(Elt: SrcReg);
2008 }
2009
2010 // Explode the big arguments into smaller chunks.
2011 MIRBuilder.buildUnmerge(Res: SrcRegs, Op: MI.getOperand(i: 1));
2012
2013 Register AshrCstReg =
2014 MIRBuilder.buildConstant(Res: NarrowTy, Val: NarrowTy.getScalarSizeInBits() - 1)
2015 .getReg(Idx: 0);
2016 Register FullExtensionReg;
2017 Register PartialExtensionReg;
2018
2019 // Do the operation on each small part.
2020 for (int i = 0; i < NumParts; ++i) {
2021 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
2022 DstRegs.push_back(Elt: SrcRegs[i]);
2023 PartialExtensionReg = DstRegs.back();
2024 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
2025 assert(PartialExtensionReg &&
2026 "Expected to visit partial extension before full");
2027 if (FullExtensionReg) {
2028 DstRegs.push_back(Elt: FullExtensionReg);
2029 continue;
2030 }
2031 DstRegs.push_back(
2032 Elt: MIRBuilder.buildAShr(Dst: NarrowTy, Src0: PartialExtensionReg, Src1: AshrCstReg)
2033 .getReg(Idx: 0));
2034 FullExtensionReg = DstRegs.back();
2035 } else {
2036 DstRegs.push_back(
2037 Elt: MIRBuilder
2038 .buildInstr(
2039 Opc: TargetOpcode::G_SEXT_INREG, DstOps: {NarrowTy},
2040 SrcOps: {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
2041 .getReg(Idx: 0));
2042 PartialExtensionReg = DstRegs.back();
2043 }
2044 }
2045
2046 // Gather the destination registers into the final destination.
2047 Register DstReg = MI.getOperand(i: 0).getReg();
2048 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
2049 MI.eraseFromParent();
2050 return Legalized;
2051 }
2052 case TargetOpcode::G_BSWAP:
2053 case TargetOpcode::G_BITREVERSE: {
2054 if (SizeOp0 % NarrowSize != 0)
2055 return UnableToLegalize;
2056
2057 Observer.changingInstr(MI);
2058 SmallVector<Register, 2> SrcRegs, DstRegs;
2059 unsigned NumParts = SizeOp0 / NarrowSize;
2060 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
2061 MIRBuilder, MRI);
2062
2063 for (unsigned i = 0; i < NumParts; ++i) {
2064 auto DstPart = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
2065 SrcOps: {SrcRegs[NumParts - 1 - i]});
2066 DstRegs.push_back(Elt: DstPart.getReg(Idx: 0));
2067 }
2068
2069 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
2070
2071 Observer.changedInstr(MI);
2072 MI.eraseFromParent();
2073 return Legalized;
2074 }
2075 case TargetOpcode::G_PTR_ADD:
2076 case TargetOpcode::G_PTRMASK: {
2077 if (TypeIdx != 1)
2078 return UnableToLegalize;
2079 Observer.changingInstr(MI);
2080 narrowScalarSrc(MI, NarrowTy, OpIdx: 2);
2081 Observer.changedInstr(MI);
2082 return Legalized;
2083 }
2084 case TargetOpcode::G_FPTOUI:
2085 case TargetOpcode::G_FPTOSI:
2086 case TargetOpcode::G_FPTOUI_SAT:
2087 case TargetOpcode::G_FPTOSI_SAT:
2088 return narrowScalarFPTOI(MI, TypeIdx, Ty: NarrowTy);
2089 case TargetOpcode::G_FPEXT:
2090 if (TypeIdx != 0)
2091 return UnableToLegalize;
2092 Observer.changingInstr(MI);
2093 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_FPEXT);
2094 Observer.changedInstr(MI);
2095 return Legalized;
2096 case TargetOpcode::G_FLDEXP:
2097 case TargetOpcode::G_STRICT_FLDEXP:
2098 return narrowScalarFLDEXP(MI, TypeIdx, Ty: NarrowTy);
2099 case TargetOpcode::G_VSCALE: {
2100 Register Dst = MI.getOperand(i: 0).getReg();
2101 LLT Ty = MRI.getType(Reg: Dst);
2102
2103 // Assume VSCALE(1) fits into a legal integer
2104 const APInt One(NarrowTy.getSizeInBits(), 1);
2105 auto VScaleBase = MIRBuilder.buildVScale(Res: NarrowTy, MinElts: One);
2106 auto ZExt = MIRBuilder.buildZExt(Res: Ty, Op: VScaleBase);
2107 auto C = MIRBuilder.buildConstant(Res: Ty, Val: *MI.getOperand(i: 1).getCImm());
2108 MIRBuilder.buildMul(Dst, Src0: ZExt, Src1: C);
2109
2110 MI.eraseFromParent();
2111 return Legalized;
2112 }
2113 }
2114}
2115
2116Register LegalizerHelper::coerceToScalar(Register Val) {
2117 LLT Ty = MRI.getType(Reg: Val);
2118 if (Ty.isScalar())
2119 return Val;
2120
2121 const DataLayout &DL = MIRBuilder.getDataLayout();
2122 LLT NewTy = LLT::scalar(SizeInBits: Ty.getSizeInBits());
2123 if (Ty.isPointer()) {
2124 if (DL.isNonIntegralAddressSpace(AddrSpace: Ty.getAddressSpace()))
2125 return Register();
2126 return MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Val).getReg(Idx: 0);
2127 }
2128
2129 Register NewVal = Val;
2130
2131 assert(Ty.isVector());
2132 if (Ty.isPointerVector())
2133 NewVal = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2134 return MIRBuilder.buildBitcast(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2135}
2136
2137void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2138 unsigned OpIdx, unsigned ExtOpcode) {
2139 MachineOperand &MO = MI.getOperand(i: OpIdx);
2140 auto ExtB = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MO});
2141 MO.setReg(ExtB.getReg(Idx: 0));
2142}
2143
2144void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2145 unsigned OpIdx) {
2146 MachineOperand &MO = MI.getOperand(i: OpIdx);
2147 auto ExtB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO);
2148 MO.setReg(ExtB.getReg(Idx: 0));
2149}
2150
2151void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2152 unsigned OpIdx, unsigned TruncOpcode) {
2153 MachineOperand &MO = MI.getOperand(i: OpIdx);
2154 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2155 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2156 MIRBuilder.buildInstr(Opc: TruncOpcode, DstOps: {MO}, SrcOps: {DstExt});
2157 MO.setReg(DstExt);
2158}
2159
2160void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2161 unsigned OpIdx, unsigned ExtOpcode) {
2162 MachineOperand &MO = MI.getOperand(i: OpIdx);
2163 Register DstTrunc = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2164 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2165 MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {MO}, SrcOps: {DstTrunc});
2166 MO.setReg(DstTrunc);
2167}
2168
2169void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2170 unsigned OpIdx) {
2171 MachineOperand &MO = MI.getOperand(i: OpIdx);
2172 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2173 Register Dst = MO.getReg();
2174 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2175 MO.setReg(DstExt);
2176 MIRBuilder.buildDeleteTrailingVectorElements(Res: Dst, Op0: DstExt);
2177}
2178
2179void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2180 unsigned OpIdx) {
2181 MachineOperand &MO = MI.getOperand(i: OpIdx);
2182 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO).getReg(Idx: 0));
2183}
2184
2185void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2186 MachineOperand &Op = MI.getOperand(i: OpIdx);
2187 Op.setReg(MIRBuilder.buildBitcast(Dst: CastTy, Src: Op).getReg(Idx: 0));
2188}
2189
2190void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2191 MachineOperand &MO = MI.getOperand(i: OpIdx);
2192 Register CastDst = MRI.createGenericVirtualRegister(Ty: CastTy);
2193 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2194 MIRBuilder.buildBitcast(Dst: MO, Src: CastDst);
2195 MO.setReg(CastDst);
2196}
2197
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  // Widen the source-operand type (TypeIdx 1) of a G_MERGE_VALUES to WideTy.
  // Only the source type can be widened here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
  if (DstTy.isVector())
    return UnableToLegalize;

  LLT SrcTy = MRI.getType(Reg: Src1Reg);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  // Number of WideTy pieces needed to cover the whole destination, rounded up.
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    // The whole result fits in a single WideTy value: zero-extend each source
    // piece, shift it into place, and OR the pieces together.
    Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src1Reg).getReg(Idx: 0);

    for (unsigned I = 2; I != NumOps; ++I) {
      // Bit offset of source piece I-1 within the packed result.
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(i: I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);

      // The last OR can write straight to DstReg when no final trunc/cast is
      // needed; otherwise accumulate in a fresh WideTy register.
      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(Ty: WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
      auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
      MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
      ResultReg = NextResult;
    }

    // Narrow back down, or cast back to a pointer, if the accumulator type
    // does not match the destination type exactly.
    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(Res: DstReg, Op: ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = std::gcd(m: SrcSize, n: WideSize);
  LLT GCDTy = LLT::scalar(SizeInBits: GCD);

  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(SizeInBits: NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
    Register SrcReg = MO.getReg();
    if (GCD == SrcSize) {
      // Source already is the GCD type; use it directly.
      Unmerges.push_back(Elt: SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Elt: Unmerge.getReg(Idx: J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  // NOTE(review): this compares a count of GCD-sized parts against
  // NumMerge * WideSize, which is a bit count; the required part count would
  // be NumMerge * (WideSize / GCD). Extra undef entries are harmless since
  // only the leading parts are consumed below, but this looks like it
  // over-pads (and can build an unneeded G_IMPLICIT_DEF) — verify.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(Elt: UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(N: PartsPerGCD)) {
    auto Merge =
        MIRBuilder.buildMergeLikeInstr(Res: WideTy, Ops: Slicer.take_front(N: PartsPerGCD));
    NewMergeRegs.push_back(Elt: Merge.getReg(Idx: 0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMergeLikeInstr(Res: WideDstTy, Ops: NewMergeRegs);
    MIRBuilder.buildTrunc(Res: DstReg, Op: FinalMerge.getReg(Idx: 0));
  }

  MI.eraseFromParent();
  return Legalized;
}
2317
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  // Widen the destination-operand type (TypeIdx 0) of a G_UNMERGE_VALUES to
  // WideTy. Only scalar destinations and non-vector sources are handled.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // The source is the last operand; everything before it is a def.
  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(i: NumDst).getReg();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    // A single WideTy value can hold the whole source: extract each result by
    // shifting and truncating instead of unmerging.
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      // Convert the pointer source to an equally-sized integer first.
      SrcTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(Dst: SrcTy, Src: SrcReg).getReg(Idx: 0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
    }

    // Theres no unmerge type to target. Directly extract the bits from the
    // source type
    unsigned DstSize = DstTy.getSizeInBits();

    // Result 0 is the low bits; result I is at bit offset DstSize * I.
    MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: DstSize * I);
      auto Shr = MIRBuilder.buildLShr(Dst: SrcTy, Src0: SrcReg, Src1: ShiftAmt);
      MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(OrigTy: SrcTy, TargetTy: WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(Res: LCMTy, Op: WideSrc).getReg(Idx: 0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(Res: WideTy, Op: WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(OrigTy: WideTy, TargetTy: DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    // Each destination is exactly GCDTy, so each WideTy piece unmerges
    // straight into PartsPerUnmerge destination registers.
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(RegNo: MI.getOperand(i: Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(RegNo: MRI.createGenericVirtualRegister(Ty: DstTy));
        }
      }

      MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
    }
  } else {
    // Break every WideTy piece down to GCDTy parts, then remerge runs of
    // PartsPerRemerge parts into each original destination.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, SrcReg: Unmerge.getReg(Idx: J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Args&: Parts[Idx]);
      }

      MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: I).getReg(), Ops: RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
2446
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  // Widen one type index of a G_EXTRACT. TypeIdx 0 widens the result (done by
  // shifting in the source type); TypeIdx 1 widens the source.
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Bit offset of the extracted value within the source.
  unsigned Offset = MI.getOperand(i: 2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(Dst: SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(Res: DstReg,
                            Op: MIRBuilder.buildAnyExtOrTrunc(Res: WideTy, Op: Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      // Shift in WideTy when it is strictly wider than the source.
      Src = MIRBuilder.buildAnyExt(Res: WideTy, Op: Src);
      ShiftTy = WideTy;
    }

    // Shift the wanted bits down to bit 0, then truncate to the result type.
    auto LShr = MIRBuilder.buildLShr(
        Dst: ShiftTy, Src0: Src, Src1: MIRBuilder.buildConstant(Res: ShiftTy, Val: Offset));
    MIRBuilder.buildTrunc(Res: DstReg, Op: LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    // Scalar source: any-extending it preserves the extracted bits.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  // Vector source: only element extracts at element boundaries are handled.
  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);

  // Scale the bit offset by the ratio of new to old element size so the same
  // element is still addressed after widening.
  MI.getOperand(i: 2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0);
  Observer.changedInstr(MI);
  return Legalized;
}
2520
2521LegalizerHelper::LegalizeResult
2522LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2523 LLT WideTy) {
2524 if (TypeIdx != 0 || WideTy.isVector())
2525 return UnableToLegalize;
2526 Observer.changingInstr(MI);
2527 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2528 widenScalarDst(MI, WideTy);
2529 Observer.changedInstr(MI);
2530 return Legalized;
2531}
2532
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  // Widen the add/sub-with-overflow family: G_[SU]ADDO, G_[SU]SUBO and the
  // carry-consuming G_[SU]ADDE, G_[SU]SUBE.
  //
  // Opcode is the operation performed at the wider width, ExtOpcode is the
  // extension matching the operation's signedness (the inputs must be
  // extended consistently for the trunc/re-extend overflow check below to be
  // valid), and CarryIn is set only for the carry-consuming forms.
  unsigned Opcode;
  unsigned ExtOpcode;
  std::optional<Register> CarryIn;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  // The signed carry forms are performed as their unsigned counterparts on
  // sign-extended inputs; signedness is captured entirely by the extension
  // and the overflow check.
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(i: 4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(i: 4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(i: 4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(i: 4).getReg();
    break;
  }

  // TypeIdx == 1 only widens the boolean carry/overflow type: the carry-out
  // destination (operand 1) and, when present, the carry-in source
  // (operand 4, extended with the target's boolean extension).
  if (TypeIdx == 1) {
    unsigned BoolExtOp = MIRBuilder.getBoolExtOp(IsVec: WideTy.isVector(), IsFP: false);

    Observer.changingInstr(MI);
    if (CarryIn)
      widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: BoolExtOp);
    widenScalarDst(MI, WideTy, OpIdx: 1);

    Observer.changedInstr(MI);
    return Legalized;
  }

  auto LHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
  auto RHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    // The carry-consuming wide op also produces a carry-out, emitted directly
    // at the original carry-out type.
    LLT CarryOutTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opc: Opcode, DstOps: {WideTy, CarryOutTy},
                            SrcOps: {LHSExt, RHSExt, *CarryIn})
                .getReg(Idx: 0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {WideTy}, SrcOps: {LHSExt, RHSExt}).getReg(Idx: 0);
  }
  LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(Res: OrigTy, Op: NewOp);
  auto ExtOp = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  // I.e. the narrow result overflowed iff the wide result does not survive a
  // round-trip through the original width.
  MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 1), Op0: NewOp, Op1: ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0), Op: NewOp);
  MI.eraseFromParent();
  return Legalized;
}
2615
2616LegalizerHelper::LegalizeResult
2617LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2618 LLT WideTy) {
2619 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2620 MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2621 MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2622 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2623 MI.getOpcode() == TargetOpcode::G_USHLSAT;
2624 // We can convert this to:
2625 // 1. Any extend iN to iM
2626 // 2. SHL by M-N
2627 // 3. [US][ADD|SUB|SHL]SAT
2628 // 4. L/ASHR by M-N
2629 //
2630 // It may be more efficient to lower this to a min and a max operation in
2631 // the higher precision arithmetic if the promoted operation isn't legal,
2632 // but this decision is up to the target's lowering request.
2633 Register DstReg = MI.getOperand(i: 0).getReg();
2634
2635 unsigned NewBits = WideTy.getScalarSizeInBits();
2636 unsigned SHLAmount = NewBits - MRI.getType(Reg: DstReg).getScalarSizeInBits();
2637
2638 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2639 // must not left shift the RHS to preserve the shift amount.
2640 auto LHS = MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 1));
2641 auto RHS = IsShift ? MIRBuilder.buildZExt(Res: WideTy, Op: MI.getOperand(i: 2))
2642 : MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 2));
2643 auto ShiftK = MIRBuilder.buildConstant(Res: WideTy, Val: SHLAmount);
2644 auto ShiftL = MIRBuilder.buildShl(Dst: WideTy, Src0: LHS, Src1: ShiftK);
2645 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(Dst: WideTy, Src0: RHS, Src1: ShiftK);
2646
2647 auto WideInst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {WideTy},
2648 SrcOps: {ShiftL, ShiftR}, Flags: MI.getFlags());
2649
2650 // Use a shift that will preserve the number of sign bits when the trunc is
2651 // folded away.
2652 auto Result = IsSigned ? MIRBuilder.buildAShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK)
2653 : MIRBuilder.buildLShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK);
2654
2655 MIRBuilder.buildTrunc(Res: DstReg, Op: Result);
2656 MI.eraseFromParent();
2657 return Legalized;
2658}
2659
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  // Widen G_SMULO / G_UMULO to WideTy.
  //
  // TypeIdx == 1 requests widening only the boolean overflow result, which
  // just needs a wider destination for operand 1.
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, OpIdx: 1);
    Observer.changedInstr(MI);
    return Legalized;
  }

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
  LLT SrcTy = MRI.getType(Reg: LHS);
  LLT OverflowTy = MRI.getType(Reg: OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {LHS});
  auto RightOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {RHS});

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
  bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;

  // Use a plain G_MUL when the wide product provably fits; otherwise keep the
  // overflow-reporting opcode at the wide width.
  unsigned MulOpc =
      WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;

  MachineInstrBuilder Mulo;
  if (WideMulCanOverflow)
    Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy, OverflowTy},
                                 SrcOps: {LeftOperand, RightOperand});
  else
    Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy}, SrcOps: {LeftOperand, RightOperand});

  // The low bits of the wide product are the narrow multiplication result.
  auto Mul = Mulo->getOperand(i: 0);
  MIRBuilder.buildTrunc(Res: Result, Op: Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part. Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
  }

  if (WideMulCanOverflow) {
    auto Overflow =
        MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OverflowTy, Op0: Mul, Op1: ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(Dst: OriginalOverflow, Src0: Mulo->getOperand(i: 1), Src1: Overflow);
  } else {
    // The wide multiply cannot overflow, so the high-part round-trip check
    // alone decides the overflow bit.
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OriginalOverflow, Op0: Mul, Op1: ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}
2726
2727LegalizerHelper::LegalizeResult
2728LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2729 unsigned Opcode = MI.getOpcode();
2730 switch (Opcode) {
2731 default:
2732 return UnableToLegalize;
2733 case TargetOpcode::G_ATOMICRMW_XCHG:
2734 case TargetOpcode::G_ATOMICRMW_ADD:
2735 case TargetOpcode::G_ATOMICRMW_SUB:
2736 case TargetOpcode::G_ATOMICRMW_AND:
2737 case TargetOpcode::G_ATOMICRMW_OR:
2738 case TargetOpcode::G_ATOMICRMW_XOR:
2739 case TargetOpcode::G_ATOMICRMW_MIN:
2740 case TargetOpcode::G_ATOMICRMW_MAX:
2741 case TargetOpcode::G_ATOMICRMW_UMIN:
2742 case TargetOpcode::G_ATOMICRMW_UMAX:
2743 assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2744 Observer.changingInstr(MI);
2745 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2746 widenScalarDst(MI, WideTy, OpIdx: 0);
2747 Observer.changedInstr(MI);
2748 return Legalized;
2749 case TargetOpcode::G_ATOMIC_CMPXCHG:
2750 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2751 Observer.changingInstr(MI);
2752 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2753 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2754 widenScalarDst(MI, WideTy, OpIdx: 0);
2755 Observer.changedInstr(MI);
2756 return Legalized;
2757 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2758 if (TypeIdx == 0) {
2759 Observer.changingInstr(MI);
2760 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2761 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: TargetOpcode::G_ANYEXT);
2762 widenScalarDst(MI, WideTy, OpIdx: 0);
2763 Observer.changedInstr(MI);
2764 return Legalized;
2765 }
2766 assert(TypeIdx == 1 &&
2767 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2768 Observer.changingInstr(MI);
2769 widenScalarDst(MI, WideTy, OpIdx: 1);
2770 Observer.changedInstr(MI);
2771 return Legalized;
2772 case TargetOpcode::G_EXTRACT:
2773 return widenScalarExtract(MI, TypeIdx, WideTy);
2774 case TargetOpcode::G_INSERT:
2775 return widenScalarInsert(MI, TypeIdx, WideTy);
2776 case TargetOpcode::G_MERGE_VALUES:
2777 return widenScalarMergeValues(MI, TypeIdx, WideTy);
2778 case TargetOpcode::G_UNMERGE_VALUES:
2779 return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2780 case TargetOpcode::G_SADDO:
2781 case TargetOpcode::G_SSUBO:
2782 case TargetOpcode::G_UADDO:
2783 case TargetOpcode::G_USUBO:
2784 case TargetOpcode::G_SADDE:
2785 case TargetOpcode::G_SSUBE:
2786 case TargetOpcode::G_UADDE:
2787 case TargetOpcode::G_USUBE:
2788 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2789 case TargetOpcode::G_UMULO:
2790 case TargetOpcode::G_SMULO:
2791 return widenScalarMulo(MI, TypeIdx, WideTy);
2792 case TargetOpcode::G_SADDSAT:
2793 case TargetOpcode::G_SSUBSAT:
2794 case TargetOpcode::G_SSHLSAT:
2795 case TargetOpcode::G_UADDSAT:
2796 case TargetOpcode::G_USUBSAT:
2797 case TargetOpcode::G_USHLSAT:
2798 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2799 case TargetOpcode::G_CTTZ:
2800 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2801 case TargetOpcode::G_CTLZ:
2802 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2803 case TargetOpcode::G_CTLS:
2804 case TargetOpcode::G_CTPOP: {
2805 if (TypeIdx == 0) {
2806 Observer.changingInstr(MI);
2807 widenScalarDst(MI, WideTy, OpIdx: 0);
2808 Observer.changedInstr(MI);
2809 return Legalized;
2810 }
2811
2812 Register SrcReg = MI.getOperand(i: 1).getReg();
2813
2814 // First extend the input.
2815 unsigned ExtOpc;
2816 switch (Opcode) {
2817 case TargetOpcode::G_CTTZ:
2818 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2819 case TargetOpcode::G_CTLZ_ZERO_UNDEF: // undef bits shifted out below
2820 ExtOpc = TargetOpcode::G_ANYEXT;
2821 break;
2822 case TargetOpcode::G_CTLS:
2823 ExtOpc = TargetOpcode::G_SEXT;
2824 break;
2825 default:
2826 ExtOpc = TargetOpcode::G_ZEXT;
2827 }
2828
2829 auto MIBSrc = MIRBuilder.buildInstr(Opc: ExtOpc, DstOps: {WideTy}, SrcOps: {SrcReg});
2830 LLT CurTy = MRI.getType(Reg: SrcReg);
2831 unsigned NewOpc = Opcode;
2832 if (NewOpc == TargetOpcode::G_CTTZ) {
2833 // The count is the same in the larger type except if the original
2834 // value was zero. This can be handled by setting the bit just off
2835 // the top of the original type.
2836 auto TopBit = APInt::getOneBitSet(numBits: WideTy.getScalarSizeInBits(),
2837 BitNo: CurTy.getScalarSizeInBits());
2838 MIBSrc = MIRBuilder.buildOr(
2839 Dst: WideTy, Src0: MIBSrc, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: TopBit));
2840 // Now we know the operand is non-zero, use the more relaxed opcode.
2841 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2842 }
2843
2844 unsigned SizeDiff =
2845 WideTy.getScalarSizeInBits() - CurTy.getScalarSizeInBits();
2846
2847 if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2848 // An optimization where the result is the CTLZ after the left shift by
2849 // (Difference in widety and current ty), that is,
2850 // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
2851 // Result = ctlz MIBSrc
2852 MIBSrc = MIRBuilder.buildShl(Dst: WideTy, Src0: MIBSrc,
2853 Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2854 }
2855
2856 // Perform the operation at the larger size.
2857 auto MIBNewOp = MIRBuilder.buildInstr(Opc: NewOpc, DstOps: {WideTy}, SrcOps: {MIBSrc});
2858 // This is already the correct result for CTPOP and CTTZs
2859 if (Opcode == TargetOpcode::G_CTLZ || Opcode == TargetOpcode::G_CTLS) {
2860 // The correct result is NewOp - (Difference in widety and current ty).
2861 // At this stage SUB is guaranteed to be positive no-wrap,
2862 // that to be used in further KnownBits optimizations for CTLZ.
2863 MIBNewOp = MIRBuilder.buildSub(
2864 Dst: WideTy, Src0: MIBNewOp, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff),
2865 Flags: Opcode == TargetOpcode::G_CTLZ
2866 ? std::optional<unsigned>(MachineInstr::NoUWrap)
2867 : std::nullopt);
2868 }
2869
2870 MIRBuilder.buildZExtOrTrunc(Res: MI.getOperand(i: 0), Op: MIBNewOp);
2871 MI.eraseFromParent();
2872 return Legalized;
2873 }
2874 case TargetOpcode::G_BSWAP: {
2875 Observer.changingInstr(MI);
2876 Register DstReg = MI.getOperand(i: 0).getReg();
2877
2878 Register ShrReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2879 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2880 Register ShiftAmtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2881 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2882
2883 MI.getOperand(i: 0).setReg(DstExt);
2884
2885 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2886
2887 LLT Ty = MRI.getType(Reg: DstReg);
2888 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2889 MIRBuilder.buildConstant(Res: ShiftAmtReg, Val: DiffBits);
2890 MIRBuilder.buildLShr(Dst: ShrReg, Src0: DstExt, Src1: ShiftAmtReg);
2891
2892 MIRBuilder.buildTrunc(Res: DstReg, Op: ShrReg);
2893 Observer.changedInstr(MI);
2894 return Legalized;
2895 }
2896 case TargetOpcode::G_BITREVERSE: {
2897 Observer.changingInstr(MI);
2898
2899 Register DstReg = MI.getOperand(i: 0).getReg();
2900 LLT Ty = MRI.getType(Reg: DstReg);
2901 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2902
2903 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2904 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2905 MI.getOperand(i: 0).setReg(DstExt);
2906 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2907
2908 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: DiffBits);
2909 auto Shift = MIRBuilder.buildLShr(Dst: WideTy, Src0: DstExt, Src1: ShiftAmt);
2910 MIRBuilder.buildTrunc(Res: DstReg, Op: Shift);
2911 Observer.changedInstr(MI);
2912 return Legalized;
2913 }
2914 case TargetOpcode::G_FREEZE:
2915 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
2916 Observer.changingInstr(MI);
2917 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2918 widenScalarDst(MI, WideTy);
2919 Observer.changedInstr(MI);
2920 return Legalized;
2921
2922 case TargetOpcode::G_ABS:
2923 Observer.changingInstr(MI);
2924 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2925 widenScalarDst(MI, WideTy);
2926 Observer.changedInstr(MI);
2927 return Legalized;
2928
2929 case TargetOpcode::G_ADD:
2930 case TargetOpcode::G_AND:
2931 case TargetOpcode::G_MUL:
2932 case TargetOpcode::G_OR:
2933 case TargetOpcode::G_XOR:
2934 case TargetOpcode::G_SUB:
2935 case TargetOpcode::G_SHUFFLE_VECTOR:
2936 // Perform operation at larger width (any extension is fines here, high bits
2937 // don't affect the result) and then truncate the result back to the
2938 // original type.
2939 Observer.changingInstr(MI);
2940 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2941 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2942 widenScalarDst(MI, WideTy);
2943 Observer.changedInstr(MI);
2944 return Legalized;
2945
2946 case TargetOpcode::G_SBFX:
2947 case TargetOpcode::G_UBFX:
2948 Observer.changingInstr(MI);
2949
2950 if (TypeIdx == 0) {
2951 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2952 widenScalarDst(MI, WideTy);
2953 } else {
2954 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2955 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2956 }
2957
2958 Observer.changedInstr(MI);
2959 return Legalized;
2960
2961 case TargetOpcode::G_SHL:
2962 Observer.changingInstr(MI);
2963
2964 if (TypeIdx == 0) {
2965 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2966 widenScalarDst(MI, WideTy);
2967 } else {
2968 assert(TypeIdx == 1);
2969 // The "number of bits to shift" operand must preserve its value as an
2970 // unsigned integer:
2971 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2972 }
2973
2974 Observer.changedInstr(MI);
2975 return Legalized;
2976
2977 case TargetOpcode::G_ROTR:
2978 case TargetOpcode::G_ROTL:
2979 if (TypeIdx != 1)
2980 return UnableToLegalize;
2981
2982 Observer.changingInstr(MI);
2983 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2984 Observer.changedInstr(MI);
2985 return Legalized;
2986
2987 case TargetOpcode::G_SDIV:
2988 case TargetOpcode::G_SREM:
2989 case TargetOpcode::G_SMIN:
2990 case TargetOpcode::G_SMAX:
2991 case TargetOpcode::G_ABDS:
2992 Observer.changingInstr(MI);
2993 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2994 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2995 widenScalarDst(MI, WideTy);
2996 Observer.changedInstr(MI);
2997 return Legalized;
2998
2999 case TargetOpcode::G_SDIVREM:
3000 Observer.changingInstr(MI);
3001 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3002 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
3003 widenScalarDst(MI, WideTy);
3004 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3005 widenScalarDst(MI, WideTy, OpIdx: 1);
3006 Observer.changedInstr(MI);
3007 return Legalized;
3008
3009 case TargetOpcode::G_ASHR:
3010 case TargetOpcode::G_LSHR:
3011 Observer.changingInstr(MI);
3012
3013 if (TypeIdx == 0) {
3014 unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
3015 : TargetOpcode::G_ZEXT;
3016
3017 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: CvtOp);
3018 widenScalarDst(MI, WideTy);
3019 } else {
3020 assert(TypeIdx == 1);
3021 // The "number of bits to shift" operand must preserve its value as an
3022 // unsigned integer:
3023 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3024 }
3025
3026 Observer.changedInstr(MI);
3027 return Legalized;
3028 case TargetOpcode::G_UDIV:
3029 case TargetOpcode::G_UREM:
3030 case TargetOpcode::G_ABDU:
3031 Observer.changingInstr(MI);
3032 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3033 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3034 widenScalarDst(MI, WideTy);
3035 Observer.changedInstr(MI);
3036 return Legalized;
3037 case TargetOpcode::G_UDIVREM:
3038 Observer.changingInstr(MI);
3039 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3040 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
3041 widenScalarDst(MI, WideTy);
3042 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3043 widenScalarDst(MI, WideTy, OpIdx: 1);
3044 Observer.changedInstr(MI);
3045 return Legalized;
3046 case TargetOpcode::G_UMIN:
3047 case TargetOpcode::G_UMAX: {
3048 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3049
3050 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3051 unsigned ExtOpc =
3052 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty, Ctx),
3053 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx))
3054 ? TargetOpcode::G_SEXT
3055 : TargetOpcode::G_ZEXT;
3056
3057 Observer.changingInstr(MI);
3058 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: ExtOpc);
3059 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: ExtOpc);
3060 widenScalarDst(MI, WideTy);
3061 Observer.changedInstr(MI);
3062 return Legalized;
3063 }
3064
3065 case TargetOpcode::G_SELECT:
3066 Observer.changingInstr(MI);
3067 if (TypeIdx == 0) {
3068 // Perform operation at larger width (any extension is fine here, high
3069 // bits don't affect the result) and then truncate the result back to the
3070 // original type.
3071 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3072 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
3073 widenScalarDst(MI, WideTy);
3074 } else {
3075 bool IsVec = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector();
3076 // Explicit extension is required here since high bits affect the result.
3077 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec, IsFP: false));
3078 }
3079 Observer.changedInstr(MI);
3080 return Legalized;
3081
3082 case TargetOpcode::G_FPEXT:
3083 if (TypeIdx != 1)
3084 return UnableToLegalize;
3085
3086 Observer.changingInstr(MI);
3087 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3088 Observer.changedInstr(MI);
3089 return Legalized;
3090 case TargetOpcode::G_FPTOSI:
3091 case TargetOpcode::G_FPTOUI:
3092 case TargetOpcode::G_INTRINSIC_LRINT:
3093 case TargetOpcode::G_INTRINSIC_LLRINT:
3094 case TargetOpcode::G_IS_FPCLASS:
3095 Observer.changingInstr(MI);
3096
3097 if (TypeIdx == 0)
3098 widenScalarDst(MI, WideTy);
3099 else
3100 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3101
3102 Observer.changedInstr(MI);
3103 return Legalized;
3104 case TargetOpcode::G_SITOFP:
3105 Observer.changingInstr(MI);
3106
3107 if (TypeIdx == 0)
3108 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3109 else
3110 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
3111
3112 Observer.changedInstr(MI);
3113 return Legalized;
3114 case TargetOpcode::G_UITOFP:
3115 Observer.changingInstr(MI);
3116
3117 if (TypeIdx == 0)
3118 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3119 else
3120 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3121
3122 Observer.changedInstr(MI);
3123 return Legalized;
3124 case TargetOpcode::G_FPTOSI_SAT:
3125 case TargetOpcode::G_FPTOUI_SAT:
3126 Observer.changingInstr(MI);
3127
3128 if (TypeIdx == 0) {
3129 Register OldDst = MI.getOperand(i: 0).getReg();
3130 LLT Ty = MRI.getType(Reg: OldDst);
3131 Register ExtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
3132 Register NewDst;
3133 MI.getOperand(i: 0).setReg(ExtReg);
3134 uint64_t ShortBits = Ty.getScalarSizeInBits();
3135 uint64_t WideBits = WideTy.getScalarSizeInBits();
3136 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
3137 if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
3138 // z = i16 fptosi_sat(a)
3139 // ->
3140 // x = i32 fptosi_sat(a)
3141 // y = smin(x, 32767)
3142 // z = smax(y, -32768)
3143 auto MaxVal = MIRBuilder.buildConstant(
3144 Res: WideTy, Val: APInt::getSignedMaxValue(numBits: ShortBits).sext(width: WideBits));
3145 auto MinVal = MIRBuilder.buildConstant(
3146 Res: WideTy, Val: APInt::getSignedMinValue(numBits: ShortBits).sext(width: WideBits));
3147 Register MidReg =
3148 MIRBuilder.buildSMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3149 NewDst = MIRBuilder.buildSMax(Dst: WideTy, Src0: MidReg, Src1: MinVal).getReg(Idx: 0);
3150 } else {
3151 // z = i16 fptoui_sat(a)
3152 // ->
3153 // x = i32 fptoui_sat(a)
3154 // y = smin(x, 65535)
3155 auto MaxVal = MIRBuilder.buildConstant(
3156 Res: WideTy, Val: APInt::getAllOnes(numBits: ShortBits).zext(width: WideBits));
3157 NewDst = MIRBuilder.buildUMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3158 }
3159 MIRBuilder.buildTrunc(Res: OldDst, Op: NewDst);
3160 } else
3161 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3162
3163 Observer.changedInstr(MI);
3164 return Legalized;
3165 case TargetOpcode::G_LOAD:
3166 case TargetOpcode::G_SEXTLOAD:
3167 case TargetOpcode::G_ZEXTLOAD:
3168 Observer.changingInstr(MI);
3169 widenScalarDst(MI, WideTy);
3170 Observer.changedInstr(MI);
3171 return Legalized;
3172
3173 case TargetOpcode::G_STORE: {
3174 if (TypeIdx != 0)
3175 return UnableToLegalize;
3176
3177 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3178 assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
3179 if (!Ty.isScalar()) {
3180 // We need to widen the vector element type.
3181 Observer.changingInstr(MI);
3182 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ANYEXT);
3183 // We also need to adjust the MMO to turn this into a truncating store.
3184 MachineMemOperand &MMO = **MI.memoperands_begin();
3185 MachineFunction &MF = MIRBuilder.getMF();
3186 auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty);
3187 MI.setMemRefs(MF, MemRefs: {NewMMO});
3188 Observer.changedInstr(MI);
3189 return Legalized;
3190 }
3191
3192 Observer.changingInstr(MI);
3193
3194 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
3195 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
3196 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: ExtType);
3197
3198 Observer.changedInstr(MI);
3199 return Legalized;
3200 }
3201 case TargetOpcode::G_CONSTANT: {
3202 MachineOperand &SrcMO = MI.getOperand(i: 1);
3203 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3204 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
3205 SmallTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
3206 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
3207 ExtOpc == TargetOpcode::G_ANYEXT) &&
3208 "Illegal Extend");
3209 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3210 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
3211 ? SrcVal.sext(width: WideTy.getSizeInBits())
3212 : SrcVal.zext(width: WideTy.getSizeInBits());
3213 Observer.changingInstr(MI);
3214 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3215
3216 widenScalarDst(MI, WideTy);
3217 Observer.changedInstr(MI);
3218 return Legalized;
3219 }
3220 case TargetOpcode::G_FCONSTANT: {
3221 // To avoid changing the bits of the constant due to extension to a larger
3222 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
3223 MachineOperand &SrcMO = MI.getOperand(i: 1);
3224 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
3225 MIRBuilder.setInstrAndDebugLoc(MI);
3226 auto IntCst = MIRBuilder.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val);
3227 widenScalarDst(MI&: *IntCst, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3228 MI.eraseFromParent();
3229 return Legalized;
3230 }
3231 case TargetOpcode::G_IMPLICIT_DEF: {
3232 Observer.changingInstr(MI);
3233 widenScalarDst(MI, WideTy);
3234 Observer.changedInstr(MI);
3235 return Legalized;
3236 }
3237 case TargetOpcode::G_BRCOND:
3238 Observer.changingInstr(MI);
3239 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec: false, IsFP: false));
3240 Observer.changedInstr(MI);
3241 return Legalized;
3242
3243 case TargetOpcode::G_FCMP:
3244 Observer.changingInstr(MI);
3245 if (TypeIdx == 0)
3246 widenScalarDst(MI, WideTy);
3247 else {
3248 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3249 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_FPEXT);
3250 }
3251 Observer.changedInstr(MI);
3252 return Legalized;
3253
3254 case TargetOpcode::G_ICMP:
3255 Observer.changingInstr(MI);
3256 if (TypeIdx == 0)
3257 widenScalarDst(MI, WideTy);
3258 else {
3259 LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 2).getReg());
3260 CmpInst::Predicate Pred =
3261 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
3262
3263 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3264 unsigned ExtOpcode =
3265 (CmpInst::isSigned(Pred) ||
3266 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty: SrcTy, Ctx),
3267 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx)))
3268 ? TargetOpcode::G_SEXT
3269 : TargetOpcode::G_ZEXT;
3270 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode);
3271 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode);
3272 }
3273 Observer.changedInstr(MI);
3274 return Legalized;
3275
3276 case TargetOpcode::G_PTR_ADD:
3277 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
3278 Observer.changingInstr(MI);
3279 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3280 Observer.changedInstr(MI);
3281 return Legalized;
3282
3283 case TargetOpcode::G_PHI: {
3284 assert(TypeIdx == 0 && "Expecting only Idx 0");
3285
3286 Observer.changingInstr(MI);
3287 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
3288 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
3289 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
3290 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3291 }
3292
3293 MachineBasicBlock &MBB = *MI.getParent();
3294 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
3295 widenScalarDst(MI, WideTy);
3296 Observer.changedInstr(MI);
3297 return Legalized;
3298 }
3299 case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
3300 if (TypeIdx == 0) {
3301 Register VecReg = MI.getOperand(i: 1).getReg();
3302 LLT VecTy = MRI.getType(Reg: VecReg);
3303 Observer.changingInstr(MI);
3304
3305 widenScalarSrc(MI, WideTy: LLT::vector(EC: VecTy.getElementCount(), ScalarTy: WideTy), OpIdx: 1,
3306 ExtOpcode: TargetOpcode::G_ANYEXT);
3307
3308 widenScalarDst(MI, WideTy, OpIdx: 0);
3309 Observer.changedInstr(MI);
3310 return Legalized;
3311 }
3312
3313 if (TypeIdx != 2)
3314 return UnableToLegalize;
3315 Observer.changingInstr(MI);
3316 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3317 Observer.changedInstr(MI);
3318 return Legalized;
3319 }
3320 case TargetOpcode::G_INSERT_VECTOR_ELT: {
3321 if (TypeIdx == 0) {
3322 Observer.changingInstr(MI);
3323 const LLT WideEltTy = WideTy.getElementType();
3324
3325 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3326 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3327 widenScalarDst(MI, WideTy, OpIdx: 0);
3328 Observer.changedInstr(MI);
3329 return Legalized;
3330 }
3331
3332 if (TypeIdx == 1) {
3333 Observer.changingInstr(MI);
3334
3335 Register VecReg = MI.getOperand(i: 1).getReg();
3336 LLT VecTy = MRI.getType(Reg: VecReg);
3337 LLT WideVecTy = VecTy.changeVectorElementType(NewEltTy: WideTy);
3338
3339 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3340 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3341 widenScalarDst(MI, WideTy: WideVecTy, OpIdx: 0);
3342 Observer.changedInstr(MI);
3343 return Legalized;
3344 }
3345
3346 if (TypeIdx == 2) {
3347 Observer.changingInstr(MI);
3348 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
3349 Observer.changedInstr(MI);
3350 return Legalized;
3351 }
3352
3353 return UnableToLegalize;
3354 }
3355 case TargetOpcode::G_FADD:
3356 case TargetOpcode::G_FMUL:
3357 case TargetOpcode::G_FSUB:
3358 case TargetOpcode::G_FMA:
3359 case TargetOpcode::G_FMAD:
3360 case TargetOpcode::G_FNEG:
3361 case TargetOpcode::G_FABS:
3362 case TargetOpcode::G_FCANONICALIZE:
3363 case TargetOpcode::G_FMINNUM:
3364 case TargetOpcode::G_FMAXNUM:
3365 case TargetOpcode::G_FMINNUM_IEEE:
3366 case TargetOpcode::G_FMAXNUM_IEEE:
3367 case TargetOpcode::G_FMINIMUM:
3368 case TargetOpcode::G_FMAXIMUM:
3369 case TargetOpcode::G_FMINIMUMNUM:
3370 case TargetOpcode::G_FMAXIMUMNUM:
3371 case TargetOpcode::G_FDIV:
3372 case TargetOpcode::G_FREM:
3373 case TargetOpcode::G_FCEIL:
3374 case TargetOpcode::G_FFLOOR:
3375 case TargetOpcode::G_FCOS:
3376 case TargetOpcode::G_FSIN:
3377 case TargetOpcode::G_FTAN:
3378 case TargetOpcode::G_FACOS:
3379 case TargetOpcode::G_FASIN:
3380 case TargetOpcode::G_FATAN:
3381 case TargetOpcode::G_FATAN2:
3382 case TargetOpcode::G_FCOSH:
3383 case TargetOpcode::G_FSINH:
3384 case TargetOpcode::G_FTANH:
3385 case TargetOpcode::G_FLOG10:
3386 case TargetOpcode::G_FLOG:
3387 case TargetOpcode::G_FLOG2:
3388 case TargetOpcode::G_FRINT:
3389 case TargetOpcode::G_FNEARBYINT:
3390 case TargetOpcode::G_FSQRT:
3391 case TargetOpcode::G_FEXP:
3392 case TargetOpcode::G_FEXP2:
3393 case TargetOpcode::G_FEXP10:
3394 case TargetOpcode::G_FPOW:
3395 case TargetOpcode::G_INTRINSIC_TRUNC:
3396 case TargetOpcode::G_INTRINSIC_ROUND:
3397 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
3398 assert(TypeIdx == 0);
3399 Observer.changingInstr(MI);
3400
3401 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3402 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_FPEXT);
3403
3404 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3405 Observer.changedInstr(MI);
3406 return Legalized;
3407 case TargetOpcode::G_FMODF: {
3408 Observer.changingInstr(MI);
3409 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3410
3411 widenScalarDst(MI, WideTy, OpIdx: 1, TruncOpcode: TargetOpcode::G_FPTRUNC);
3412 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3413 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3414 Observer.changedInstr(MI);
3415 return Legalized;
3416 }
3417 case TargetOpcode::G_FPOWI:
3418 case TargetOpcode::G_FLDEXP:
3419 case TargetOpcode::G_STRICT_FLDEXP: {
3420 if (TypeIdx == 0) {
3421 if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
3422 return UnableToLegalize;
3423
3424 Observer.changingInstr(MI);
3425 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3426 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3427 Observer.changedInstr(MI);
3428 return Legalized;
3429 }
3430
3431 if (TypeIdx == 1) {
3432 // For some reason SelectionDAG tries to promote to a libcall without
3433 // actually changing the integer type for promotion.
3434 Observer.changingInstr(MI);
3435 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3436 Observer.changedInstr(MI);
3437 return Legalized;
3438 }
3439
3440 return UnableToLegalize;
3441 }
3442 case TargetOpcode::G_FFREXP: {
3443 Observer.changingInstr(MI);
3444
3445 if (TypeIdx == 0) {
3446 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3447 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3448 } else {
3449 widenScalarDst(MI, WideTy, OpIdx: 1);
3450 }
3451
3452 Observer.changedInstr(MI);
3453 return Legalized;
3454 }
3455 case TargetOpcode::G_LROUND:
3456 case TargetOpcode::G_LLROUND:
3457 Observer.changingInstr(MI);
3458
3459 if (TypeIdx == 0)
3460 widenScalarDst(MI, WideTy);
3461 else
3462 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3463
3464 Observer.changedInstr(MI);
3465 return Legalized;
3466
3467 case TargetOpcode::G_INTTOPTR:
3468 if (TypeIdx != 1)
3469 return UnableToLegalize;
3470
3471 Observer.changingInstr(MI);
3472 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3473 Observer.changedInstr(MI);
3474 return Legalized;
3475 case TargetOpcode::G_PTRTOINT:
3476 if (TypeIdx != 0)
3477 return UnableToLegalize;
3478
3479 Observer.changingInstr(MI);
3480 widenScalarDst(MI, WideTy, OpIdx: 0);
3481 Observer.changedInstr(MI);
3482 return Legalized;
3483 case TargetOpcode::G_BUILD_VECTOR: {
3484 Observer.changingInstr(MI);
3485
3486 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3487 for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3488 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3489
3490 // Avoid changing the result vector type if the source element type was
3491 // requested.
3492 if (TypeIdx == 1) {
3493 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::G_BUILD_VECTOR_TRUNC));
3494 } else {
3495 widenScalarDst(MI, WideTy, OpIdx: 0);
3496 }
3497
3498 Observer.changedInstr(MI);
3499 return Legalized;
3500 }
3501 case TargetOpcode::G_SEXT_INREG:
3502 if (TypeIdx != 0)
3503 return UnableToLegalize;
3504
3505 Observer.changingInstr(MI);
3506 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3507 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3508 Observer.changedInstr(MI);
3509 return Legalized;
3510 case TargetOpcode::G_PTRMASK: {
3511 if (TypeIdx != 1)
3512 return UnableToLegalize;
3513 Observer.changingInstr(MI);
3514 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3515 Observer.changedInstr(MI);
3516 return Legalized;
3517 }
3518 case TargetOpcode::G_VECREDUCE_ADD: {
3519 if (TypeIdx != 1)
3520 return UnableToLegalize;
3521 Observer.changingInstr(MI);
3522 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3523 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3524 Observer.changedInstr(MI);
3525 return Legalized;
3526 }
3527 case TargetOpcode::G_VECREDUCE_FADD:
3528 case TargetOpcode::G_VECREDUCE_FMUL:
3529 case TargetOpcode::G_VECREDUCE_FMIN:
3530 case TargetOpcode::G_VECREDUCE_FMAX:
3531 case TargetOpcode::G_VECREDUCE_FMINIMUM:
3532 case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
3533 if (TypeIdx != 0)
3534 return UnableToLegalize;
3535 Observer.changingInstr(MI);
3536 Register VecReg = MI.getOperand(i: 1).getReg();
3537 LLT VecTy = MRI.getType(Reg: VecReg);
3538 LLT WideVecTy = VecTy.changeElementType(NewEltTy: WideTy);
3539 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3540 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3541 Observer.changedInstr(MI);
3542 return Legalized;
3543 }
3544 case TargetOpcode::G_VSCALE: {
3545 MachineOperand &SrcMO = MI.getOperand(i: 1);
3546 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3547 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3548 // The CImm is always a signed value
3549 const APInt Val = SrcVal.sext(width: WideTy.getSizeInBits());
3550 Observer.changingInstr(MI);
3551 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3552 widenScalarDst(MI, WideTy);
3553 Observer.changedInstr(MI);
3554 return Legalized;
3555 }
3556 case TargetOpcode::G_SPLAT_VECTOR: {
3557 if (TypeIdx != 1)
3558 return UnableToLegalize;
3559
3560 Observer.changingInstr(MI);
3561 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3562 Observer.changedInstr(MI);
3563 return Legalized;
3564 }
3565 case TargetOpcode::G_INSERT_SUBVECTOR: {
3566 if (TypeIdx != 0)
3567 return UnableToLegalize;
3568
3569 GInsertSubvector &IS = cast<GInsertSubvector>(Val&: MI);
3570 Register BigVec = IS.getBigVec();
3571 Register SubVec = IS.getSubVec();
3572
3573 LLT SubVecTy = MRI.getType(Reg: SubVec);
3574 LLT SubVecWideTy = SubVecTy.changeElementType(NewEltTy: WideTy.getElementType());
3575
3576 // Widen the G_INSERT_SUBVECTOR
3577 auto BigZExt = MIRBuilder.buildZExt(Res: WideTy, Op: BigVec);
3578 auto SubZExt = MIRBuilder.buildZExt(Res: SubVecWideTy, Op: SubVec);
3579 auto WideInsert = MIRBuilder.buildInsertSubvector(Res: WideTy, Src0: BigZExt, Src1: SubZExt,
3580 Index: IS.getIndexImm());
3581
3582 // Truncate back down
3583 auto SplatZero = MIRBuilder.buildSplatVector(
3584 Res: WideTy, Val: MIRBuilder.buildConstant(Res: WideTy.getElementType(), Val: 0));
3585 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: IS.getReg(Idx: 0), Op0: WideInsert,
3586 Op1: SplatZero);
3587
3588 MI.eraseFromParent();
3589
3590 return Legalized;
3591 }
3592 }
3593}
3594
3595static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3596 MachineIRBuilder &B, Register Src, LLT Ty) {
3597 auto Unmerge = B.buildUnmerge(Res: Ty, Op: Src);
3598 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3599 Pieces.push_back(Elt: Unmerge.getReg(Idx: I));
3600}
3601
3602static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3603 MachineIRBuilder &MIRBuilder) {
3604 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3605 MachineFunction &MF = MIRBuilder.getMF();
3606 const DataLayout &DL = MIRBuilder.getDataLayout();
3607 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3608 LLT AddrPtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
3609 LLT DstLLT = MRI.getType(Reg: DstReg);
3610
3611 Align Alignment(DL.getABITypeAlign(Ty: ConstVal->getType()));
3612
3613 auto Addr = MIRBuilder.buildConstantPool(
3614 Res: AddrPtrTy,
3615 Idx: MF.getConstantPool()->getConstantPoolIndex(C: ConstVal, Alignment));
3616
3617 MachineMemOperand *MMO =
3618 MF.getMachineMemOperand(PtrInfo: MachinePointerInfo::getConstantPool(MF),
3619 f: MachineMemOperand::MOLoad, MemTy: DstLLT, base_alignment: Alignment);
3620
3621 MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: DstReg, Addr, MMO&: *MMO);
3622}
3623
3624LegalizerHelper::LegalizeResult
3625LegalizerHelper::lowerConstant(MachineInstr &MI) {
3626 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3627 const Constant *ConstantVal = ConstOperand.getCImm();
3628
3629 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3630 MI.eraseFromParent();
3631
3632 return Legalized;
3633}
3634
3635LegalizerHelper::LegalizeResult
3636LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3637 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3638 const Constant *ConstantVal = ConstOperand.getFPImm();
3639
3640 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3641 MI.eraseFromParent();
3642
3643 return Legalized;
3644}
3645
/// Lower a G_BITCAST involving vector types by unmerging the source into
/// pieces, bitcasting the pieces where element sizes differ, and re-merging
/// them as the destination type. Scalar-to-scalar bitcasts are not handled
/// here.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = DstTy.changeVectorElementCount(
            EC: ElementCount::getFixed(MinVal: NumDstElt / NumSrcElt));
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = SrcTy.changeVectorElementCount(
            EC: ElementCount::getFixed(MinVal: NumSrcElt / NumDstElt));
        DstCastTy = DstEltTy;
      }

      // Unmerge into the chosen part type, then bitcast each piece in place
      // to the intermediate destination type.
      getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcPartTy);
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(Dst: DstCastTy, Src: SrcReg).getReg(Idx: 0);
    } else
      // Vector-to-scalar: split the source into its elements and merge them
      // into the scalar destination.
      getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcEltTy);

    MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    // Scalar-to-vector: split the scalar source into destination-element-sized
    // pieces and build the vector from them.
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: DstTy.getElementType());
    MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3711
3712/// Figure out the bit offset into a register when coercing a vector index for
3713/// the wide element type. This is only for the case when promoting vector to
3714/// one with larger elements.
3715//
3716///
3717/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3718/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3719static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3720 Register Idx,
3721 unsigned NewEltSize,
3722 unsigned OldEltSize) {
3723 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3724 LLT IdxTy = B.getMRI()->getType(Reg: Idx);
3725
3726 // Now figure out the amount we need to shift to get the target bits.
3727 auto OffsetMask = B.buildConstant(
3728 Res: IdxTy, Val: ~(APInt::getAllOnes(numBits: IdxTy.getSizeInBits()) << Log2EltRatio));
3729 auto OffsetIdx = B.buildAnd(Dst: IdxTy, Src0: Idx, Src1: OffsetMask);
3730 return B.buildShl(Dst: IdxTy, Src0: OffsetIdx,
3731 Src1: B.buildConstant(Res: IdxTy, Val: Log2_32(Value: OldEltSize))).getReg(Idx: 0);
3732}
3733
3734/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3735/// is casting to a vector with a smaller element size, perform multiple element
3736/// extracts and merge the results. If this is coercing to a vector with larger
3737/// elements, index the bitcasted vector and extract the target element with bit
3738/// operations. This is intended to force the indexing in the native register
3739/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  // Only the source-vector type (type index 1) can be changed here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();

  LLT SrcEltTy = SrcVecTy.getElementType();
  // A scalar CastTy is treated as a 1-element vector.
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.getScalarType();
  Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
    //
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        CastTy.changeElementCount(EC: ElementCount::getFixed(MinVal: NewEltsPerOldElt));

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(Res: IdxTy, Val: NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    // First new element covered by the requested old element.
    auto NewBaseIdx = MIRBuilder.buildMul(Dst: IdxTy, Src0: Idx, Src1: NewEltsPerOldEltK);

    // Extract each narrow piece of the requested wide element.
    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
      auto TmpIdx = MIRBuilder.buildAdd(Dst: IdxTy, Src0: NewBaseIdx, Src1: IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec, Idx: TmpIdx);
      NewOps[I] = Elt.getReg(Idx: 0);
    }

    // Reassemble the pieces and bitcast back to the original element type.
    auto NewVec = MIRBuilder.buildBuildVector(Res: MidTy, Ops: NewOps);
    MIRBuilder.buildBitcast(Dst, Src: NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);

    // If CastTy is a scalar, the bitcast result itself is the "wide element".
    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
                                                     Idx: ScaledIdx).getReg(Idx: 0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        B&: MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(Dst: NewEltTy, Src0: WideElt, Src1: OffsetBits);
    MIRBuilder.buildTrunc(Res: Dst, Op: ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3842
/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, while
/// preserving the other bits in \p TargetReg.
///
/// (TargetReg & ~(LowBitsMask(InsertReg.size()) << OffsetBits)) |
///   (ZExt(InsertReg) << OffsetBits)
3847static Register buildBitFieldInsert(MachineIRBuilder &B,
3848 Register TargetReg, Register InsertReg,
3849 Register OffsetBits) {
3850 LLT TargetTy = B.getMRI()->getType(Reg: TargetReg);
3851 LLT InsertTy = B.getMRI()->getType(Reg: InsertReg);
3852 auto ZextVal = B.buildZExt(Res: TargetTy, Op: InsertReg);
3853 auto ShiftedInsertVal = B.buildShl(Dst: TargetTy, Src0: ZextVal, Src1: OffsetBits);
3854
3855 // Produce a bitmask of the value to insert
3856 auto EltMask = B.buildConstant(
3857 Res: TargetTy, Val: APInt::getLowBitsSet(numBits: TargetTy.getSizeInBits(),
3858 loBitsSet: InsertTy.getSizeInBits()));
3859 // Shift it into position
3860 auto ShiftedMask = B.buildShl(Dst: TargetTy, Src0: EltMask, Src1: OffsetBits);
3861 auto InvShiftedMask = B.buildNot(Dst: TargetTy, Src0: ShiftedMask);
3862
3863 // Clear out the bits in the wide element
3864 auto MaskedOldElt = B.buildAnd(Dst: TargetTy, Src0: TargetReg, Src1: InvShiftedMask);
3865
3866 // The value to insert has all zeros already, so stick it into the masked
3867 // wide element.
3868 return B.buildOr(Dst: TargetTy, Src0: MaskedOldElt, Src1: ShiftedInsertVal).getReg(Idx: 0);
3869}
3870
3871/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3872/// is increasing the element size, perform the indexing in the target element
3873/// type, and use bit operations to insert at the element position. This is
3874/// intended for architectures that can dynamically index the register file and
3875/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  // Only the destination-vector type (type index 0) can be changed here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
      MI.getFirst4RegLLTs();
  LLT VecTy = DstTy;

  LLT VecEltTy = VecTy.getElementType();
  // A scalar CastTy is treated as a 1-element vector of itself.
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
  // Only the widening direction (fewer, larger elements) is handled.
  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);

    // Pull out the wide element containing the insertion point. If CastTy is
    // a scalar, the bitcast result itself is that element.
    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
                                                          Idx: ScaledIdx).getReg(Idx: 0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        B&: MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Splice the new value into the wide element with bit operations.
    Register InsertedElt = buildBitFieldInsert(B&: MIRBuilder, TargetReg: ExtractedElt,
                                               InsertReg: Val, OffsetBits);
    // Put the modified wide element back into the cast vector.
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
          Res: CastTy, Val: CastVec, Elt: InsertedElt, Idx: ScaledIdx).getReg(Idx: 0);
    }

    MIRBuilder.buildBitcast(Dst, Src: InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3935
3936// This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly
3937// those that have smaller than legal operands.
3938//
3939// <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3940//
3941// ===>
3942//
3943// s32 = G_BITCAST <4 x s8>
3944// s32 = G_BITCAST <4 x s8>
3945// s32 = G_BITCAST <4 x s8>
3946// s32 = G_BITCAST <4 x s8>
3947// <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3948// <16 x s8> = G_BITCAST <4 x s32>
3949LegalizerHelper::LegalizeResult
3950LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3951 LLT CastTy) {
3952 // Convert it to CONCAT instruction
3953 auto ConcatMI = dyn_cast<GConcatVectors>(Val: &MI);
3954 if (!ConcatMI) {
3955 return UnableToLegalize;
3956 }
3957
3958 // Check if bitcast is Legal
3959 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3960 LLT SrcScalTy = CastTy.getScalarType();
3961
3962 // Check if the build vector is Legal
3963 if (!LI.isLegal(Query: {TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3964 return UnableToLegalize;
3965 }
3966
3967 // Bitcast the sources
3968 SmallVector<Register> BitcastRegs;
3969 for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3970 BitcastRegs.push_back(
3971 Elt: MIRBuilder.buildBitcast(Dst: SrcScalTy, Src: ConcatMI->getSourceReg(I: i))
3972 .getReg(Idx: 0));
3973 }
3974
3975 // Build the scalar values into a vector
3976 Register BuildReg =
3977 MIRBuilder.buildBuildVector(Res: CastTy, Ops: BitcastRegs).getReg(Idx: 0);
3978 MIRBuilder.buildBitcast(Dst: DstReg, Src: BuildReg);
3979
3980 MI.eraseFromParent();
3981 return Legalized;
3982}
3983
3984// This bitcasts a shuffle vector to a different type currently of the same
3985// element size. Mostly used to legalize ptr vectors, where ptrtoint/inttoptr
3986// will be used instead.
3987//
// <16 x p0> = G_SHUFFLE_VECTOR <4 x p0>, <4 x p0>, mask
3989// ===>
3990// <4 x s64> = G_PTRTOINT <4 x p0>
3991// <4 x s64> = G_PTRTOINT <4 x p0>
// <16 x s64> = G_SHUFFLE_VECTOR <4 x s64>, <4 x s64>, mask
3993// <16 x p0> = G_INTTOPTR <16 x s64>
3994LegalizerHelper::LegalizeResult
3995LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3996 LLT CastTy) {
3997 auto ShuffleMI = cast<GShuffleVector>(Val: &MI);
3998 LLT DstTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 0));
3999 LLT SrcTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 1));
4000
4001 // We currently only handle vectors of the same size.
4002 if (TypeIdx != 0 ||
4003 CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
4004 CastTy.getElementCount() != DstTy.getElementCount())
4005 return UnableToLegalize;
4006
4007 LLT NewSrcTy = SrcTy.changeElementType(NewEltTy: CastTy.getScalarType());
4008
4009 auto Inp1 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 1));
4010 auto Inp2 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 2));
4011 auto Shuf =
4012 MIRBuilder.buildShuffleVector(Res: CastTy, Src1: Inp1, Src2: Inp2, Mask: ShuffleMI->getMask());
4013 MIRBuilder.buildCast(Dst: ShuffleMI->getReg(Idx: 0), Src: Shuf);
4014
4015 MI.eraseFromParent();
4016 return Legalized;
4017}
4018
4019/// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
4020///
4021/// <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
4022///
4023/// ===>
4024///
/// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
/// <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i8>, N / 8
4027/// <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
4028LegalizerHelper::LegalizeResult
4029LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
4030 LLT CastTy) {
4031 auto ES = cast<GExtractSubvector>(Val: &MI);
4032
4033 if (!CastTy.isVector())
4034 return UnableToLegalize;
4035
4036 if (TypeIdx != 0)
4037 return UnableToLegalize;
4038
4039 Register Dst = ES->getReg(Idx: 0);
4040 Register Src = ES->getSrcVec();
4041 uint64_t Idx = ES->getIndexImm();
4042
4043 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4044
4045 LLT DstTy = MRI.getType(Reg: Dst);
4046 LLT SrcTy = MRI.getType(Reg: Src);
4047 ElementCount DstTyEC = DstTy.getElementCount();
4048 ElementCount SrcTyEC = SrcTy.getElementCount();
4049 auto DstTyMinElts = DstTyEC.getKnownMinValue();
4050 auto SrcTyMinElts = SrcTyEC.getKnownMinValue();
4051
4052 if (DstTy == CastTy)
4053 return Legalized;
4054
4055 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
4056 return UnableToLegalize;
4057
4058 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
4059 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
4060 if (CastEltSize < DstEltSize)
4061 return UnableToLegalize;
4062
4063 auto AdjustAmt = CastEltSize / DstEltSize;
4064 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
4065 SrcTyMinElts % AdjustAmt != 0)
4066 return UnableToLegalize;
4067
4068 Idx /= AdjustAmt;
4069 SrcTy = LLT::vector(EC: SrcTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
4070 auto CastVec = MIRBuilder.buildBitcast(Dst: SrcTy, Src);
4071 auto PromotedES = MIRBuilder.buildExtractSubvector(Res: CastTy, Src: CastVec, Index: Idx);
4072 MIRBuilder.buildBitcast(Dst, Src: PromotedES);
4073
4074 ES->eraseFromParent();
4075 return Legalized;
4076}
4077
4078/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
4079///
4080/// <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
4081/// <vscale x 8 x i1>,
4082/// N
4083///
4084/// ===>
4085///
4086/// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
4087/// <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
4088/// <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
4089/// <vscale x 1 x i8>, N / 8
4090/// <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  auto ES = cast<GInsertSubvector>(Val: &MI);

  // Only a vector result type at type index 0 can be handled.
  if (!CastTy.isVector())
    return UnableToLegalize;

  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(Idx: 0);
  Register BigVec = ES->getBigVec();
  Register SubVec = ES->getSubVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Reg: Dst);
  LLT BigVecTy = MRI.getType(Reg: BigVec);
  LLT SubVecTy = MRI.getType(Reg: SubVec);

  // Already the requested type: nothing to rewrite.
  if (DstTy == CastTy)
    return Legalized;

  // Total bit width must be preserved across the cast.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount BigVecTyEC = BigVecTy.getElementCount();
  ElementCount SubVecTyEC = SubVecTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
  auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();

  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  // Only widening the element type is supported.
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // The index and all three element counts must divide evenly by the element
  // ratio so subvector boundaries still line up after the cast.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  // Perform the insert in the widened element type, then cast back.
  Idx /= AdjustAmt;
  BigVecTy = LLT::vector(EC: BigVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
  SubVecTy = LLT::vector(EC: SubVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
  auto CastBigVec = MIRBuilder.buildBitcast(Dst: BigVecTy, Src: BigVec);
  auto CastSubVec = MIRBuilder.buildBitcast(Dst: SubVecTy, Src: SubVec);
  auto PromotedIS =
      MIRBuilder.buildInsertSubvector(Res: CastTy, Src0: CastBigVec, Src1: CastSubVec, Index: Idx);
  MIRBuilder.buildBitcast(Dst, Src: PromotedIS);

  ES->eraseFromParent();
  return Legalized;
}
4148
/// Lower a G_LOAD / G_SEXTLOAD / G_ZEXTLOAD whose memory type is not directly
/// supported. Three strategies are applied in order:
///  * Non-byte-sized memory types are widened to a whole number of bytes and
///    the extension semantics of the original load are re-established.
///  * Vector loads are widened when over-aligned, otherwise scalarized.
///  * Remaining scalar loads are split into two power-of-2 sized loads whose
///    results are recombined with shift+or.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(Reg: DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  LLT EltTy = MemTy.getScalarType();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // These differ exactly when the memory type is not a whole number of bytes
  // (e.g. an s20 load occupying 3 bytes in memory).
  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = EltTy.changeElementSize(NewEltSize: MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(Ty: WideMemTy);
    }

    if (isa<GSExtLoad>(Val: LoadMI)) {
      auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
      // Re-establish the sign of the original (narrower) memory width.
      MIRBuilder.buildSExtInReg(Res: LoadReg, Op: NewLoad, ImmOp: MemSizeInBits);
    } else if (isa<GZExtLoad>(Val: LoadMI) || WideMemTy == LoadTy) {
      auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(Res: LoadReg, Op: NewLoad, Size: MemSizeInBits);
    } else {
      // Anyext load: the high bits are unspecified, so a plain load suffices.
      MIRBuilder.buildLoad(Res: LoadReg, Addr: PtrReg, MMO&: *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(Res: DstReg, Op: LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(Value: MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(Value: MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    Align Alignment = LoadMI.getAlign();
    // Given an alignment larger than the size of the memory, we can increase
    // the size of the load without needing to scalarize it.
    if (Alignment.value() * 8 > MemSizeInBits &&
        isPowerOf2_64(Value: DstTy.getScalarSizeInBits())) {
      LLT MoreTy = DstTy.changeVectorElementCount(
          EC: ElementCount::getFixed(MinVal: NextPowerOf2(A: DstTy.getNumElements())));
      MachineMemOperand *NewMMO = MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: MoreTy);
      auto NewLoad = MIRBuilder.buildLoad(Res: MoreTy, Addr: PtrReg, MMO&: *NewMMO);
      // Drop the extra loaded elements to get back to the requested width.
      MIRBuilder.buildDeleteTrailingVectorElements(Res: LoadMI.getReg(Idx: 0),
                                                   Op0: NewLoad.getReg(Idx: 0));
      LoadMI.eraseFromParent();
      return Legalized;
    }

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx: 0, NarrowTy: DstTy.getElementType());
  }

  // Scalar split: the low (large) part at offset 0, the high (small) part at
  // offset LargeSplitSize/8 (little-endian layout; big-endian bailed earlier).
  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(Reg: PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(A: DstTy.getSizeInBits());

  LLT AnyExtTy;
  LLT OffsetCstRes;
  if (EltTy.isPointer()) {
    // Pointers cannot carry the intermediate integer math; use plain scalars.
    AnyExtTy = LLT::scalar(SizeInBits: AnyExtSize);
    OffsetCstRes = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
  } else {
    AnyExtTy = EltTy.changeElementSize(NewEltSize: AnyExtSize);
    OffsetCstRes = EltTy.changeElementSize(NewEltSize: PtrTy.getSizeInBits());
  }

  auto LargeLoad = MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_ZEXTLOAD, Res: AnyExtTy,
                                             Addr: PtrReg, MMO&: *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(Res: OffsetCstRes, Val: LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(Ty: PtrTy);
  auto SmallPtr = MIRBuilder.buildObjectPtrOffset(Res: PtrAddReg, Op0: PtrReg, Op1: OffsetCst);
  // The small half keeps the original extend kind (any/sext/zext).
  auto SmallLoad = MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: AnyExtTy,
                                             Addr: SmallPtr, MMO&: *SmallMMO);

  auto ShiftAmt = MIRBuilder.buildConstant(Res: AnyExtTy, Val: LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(Dst: AnyExtTy, Src0: SmallLoad, Src1: ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(Dst: DstReg, Src0: Shift, Src1: LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
    MIRBuilder.buildTrunc(Res: DstReg, Op: {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we need still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
4311
/// Lower a G_STORE with an unsupported memory type: promote non-byte-sized
/// scalar stores to byte-sized stores with zeroed upper bits, scalarize (or
/// bit-pack) vector stores, and split remaining scalar stores into two
/// power-of-2 sized truncating stores.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // These differ exactly when the memory type is not a whole number of bytes.
  if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes.  For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(SizeInBits: StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
      SrcTy = WideTy;
    }

    // Zero the padding bits so the widened store has defined contents.
    auto ZextInReg = MIRBuilder.buildZExtInReg(Res: SrcTy, Op: SrcReg, ImmOp: StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideTy);
    MIRBuilder.buildStore(Val: ZextInReg, Addr: PtrReg, MMO&: *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // A truncating vector store (MemTy != SrcTy) is handled by packing the
    // elements into an integer; see scalarizeVectorBooleanStore.
    if (MemTy != SrcTy)
      return scalarizeVectorBooleanStore(MI&: StoreMI);

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy: SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(Value: MemSizeInBits)) {
    LargeSplitSize = llvm::bit_floor<uint64_t>(Value: MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    // Power-of-2 width: assume we were asked to break up an unaligned store.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2.  If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(A: MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(SizeInBits: AnyExtSize);

  if (SrcTy.isPointer()) {
    // Convert to an integer so the shift/truncation math below is valid.
    const LLT IntPtrTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(Dst: IntPtrTy, Src: SrcReg).getReg(Idx: 0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(Res: NewSrcTy, Op: SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(Res: NewSrcTy, Val: LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(Dst: NewSrcTy, Src0: ExtVal, Src1: ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(Reg: PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
      Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: LargeSplitSize / 8);
  auto SmallPtr = MIRBuilder.buildObjectPtrOffset(Res: PtrTy, Op0: PtrReg, Op1: OffsetCst);

  // Low part at offset 0, high part at LargeSplitSize/8 (little-endian).
  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
  MIRBuilder.buildStore(Val: ExtVal, Addr: PtrReg, MMO&: *LargeMMO);
  MIRBuilder.buildStore(Val: SmallVal, Addr: SmallPtr, MMO&: *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
4405
/// Lower a truncating vector store whose memory element type is narrower than
/// a byte (e.g. storing <8 x s1>) by packing all truncated elements into a
/// single integer of the memory type's total bit width and storing that
/// integer instead.
LegalizerHelper::LegalizeResult
LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();
  LLT MemScalarTy = MemTy.getElementType();
  MachineFunction &MF = MIRBuilder.getMF();

  assert(SrcTy.isVector() && "Expect a vector store type");

  if (!MemScalarTy.isByteSized()) {
    // We need to build an integer scalar of the vector bit pattern.
    // It's not legal for us to add padding when storing a vector.
    unsigned NumBits = MemTy.getSizeInBits();
    LLT IntTy = LLT::integer(SizeInBits: NumBits);
    auto CurrVal = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
    LLT IdxTy = TLI.getVectorIdxLLT(DL: MF.getDataLayout());

    // Extract each element, truncate it to the memory element width, and OR
    // it into the accumulator at its bit position.
    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
      auto Elt = MIRBuilder.buildExtractVectorElement(
          Res: SrcTy.getElementType(), Val: SrcReg, Idx: MIRBuilder.buildConstant(Res: IdxTy, Val: I));
      auto Trunc = MIRBuilder.buildTrunc(Res: MemScalarTy, Op: Elt);
      auto ZExt = MIRBuilder.buildZExt(Res: IntTy, Op: Trunc);
      // Element 0 lands in the low bits on little-endian, high bits on
      // big-endian, matching in-memory element order.
      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
                                  ? (MemTy.getNumElements() - 1) - I
                                  : I;
      auto ShiftAmt = MIRBuilder.buildConstant(
          Res: IntTy, Val: ShiftIntoIdx * MemScalarTy.getSizeInBits());
      auto Shifted = MIRBuilder.buildShl(Dst: IntTy, Src0: ZExt, Src1: ShiftAmt);
      CurrVal = MIRBuilder.buildOr(Dst: IntTy, Src0: CurrVal, Src1: Shifted);
    }
    auto PtrInfo = MMO.getPointerInfo();
    auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo, Ty: IntTy);
    MIRBuilder.buildStore(Val: CurrVal, Addr: PtrReg, MMO&: *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  // TODO: implement simple scalarization.
  return UnableToLegalize;
}
4449
/// Legalize \p MI by reinterpreting its operands/results as the same-sized
/// type \p CastTy, inserting G_BITCASTs around the instruction (or mutating
/// memory operand types in place for loads/stores).
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of an extending load.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    // Mutation is bracketed by changing/changed so observers stay in sync.
    Observer.changingInstr(MI);
    bitcastDst(MI, CastTy, OpIdx: 0);
    MMO.setType(CastTy);
    // The range metadata is no longer valid when reinterpreted as a different
    // type.
    MMO.clearRanges();
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of a truncating store.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, OpIdx: 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SELECT: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    // A vector select has a vector condition, which a plain bitcast of the
    // value operands would not account for.
    if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector()) {
      LLVM_DEBUG(
          dbgs() << "bitcast action not implemented for vector select\n");
      return UnableToLegalize;
    }

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, OpIdx: 2);
    bitcastSrc(MI, CastTy, OpIdx: 3);
    bitcastDst(MI, CastTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Bitwise ops are insensitive to how the bits are grouped, so all
    // operands and the result can simply be reinterpreted.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, OpIdx: 1);
    bitcastSrc(MI, CastTy, OpIdx: 2);
    bitcastDst(MI, CastTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_CONCAT_VECTORS:
    return bitcastConcatVector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return bitcastShuffleVector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_EXTRACT_SUBVECTOR:
    return bitcastExtractSubvector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_SUBVECTOR:
    return bitcastInsertSubvector(MI, TypeIdx, CastTy);
  default:
    return UnableToLegalize;
  }
}
4530
4531// Legalize an instruction by changing the opcode in place.
4532void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
4533 Observer.changingInstr(MI);
4534 MI.setDesc(MIRBuilder.getTII().get(Opcode: NewOpcode));
4535 Observer.changedInstr(MI);
4536}
4537
/// Expand \p MI into a sequence of simpler generic operations with the same
/// semantics. Each case either rewrites the instruction inline or dispatches
/// to a dedicated lower* helper; opcodes without a generic expansion return
/// UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
  using namespace TargetOpcode;

  switch(MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_FCONSTANT:
    return lowerFConstant(MI);
  case TargetOpcode::G_BITCAST:
    return lowerBitcast(MI);
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM: {
    // a % b == a - (a / b) * b
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    auto Quot =
        MIRBuilder.buildInstr(Opc: MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, DstOps: {Ty},
                              SrcOps: {MI.getOperand(i: 1), MI.getOperand(i: 2)});

    auto Prod = MIRBuilder.buildMul(Dst: Ty, Src0: Quot, Src1: MI.getOperand(i: 2));
    MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: Prod);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
    return lowerSADDO_SSUBO(MI);
  case TargetOpcode::G_SADDE:
    return lowerSADDE(MI);
  case TargetOpcode::G_SSUBE:
    return lowerSSUBE(MI);
  case TargetOpcode::G_UMULH:
  case TargetOpcode::G_SMULH:
    return lowerSMULH_UMULH(MI);
  case TargetOpcode::G_SMULO:
  case TargetOpcode::G_UMULO: {
    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
    // result.
    auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
    LLT Ty = MRI.getType(Reg: Res);

    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
                          ? TargetOpcode::G_SMULH
                          : TargetOpcode::G_UMULH;

    // Mutate MI itself into the G_MUL producing the low half, dropping the
    // overflow def.
    Observer.changingInstr(MI);
    const auto &TII = MIRBuilder.getTII();
    MI.setDesc(TII.get(Opcode: TargetOpcode::G_MUL));
    MI.removeOperand(OpNo: 1);
    Observer.changedInstr(MI);

    auto HiPart = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {Ty}, SrcOps: {LHS, RHS});
    auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);

    // Move insert point forward so we can use the Res register if needed.
    MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());

    // For *signed* multiply, overflow is detected by checking:
    // (hi != (lo >> bitwidth-1))
    if (Opcode == TargetOpcode::G_SMULH) {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: Ty.getSizeInBits() - 1);
      auto Shifted = MIRBuilder.buildAShr(Dst: Ty, Src0: Res, Src1: ShiftAmt);
      MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Shifted);
    } else {
      MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Zero);
    }
    return Legalized;
  }
  case TargetOpcode::G_FNEG: {
    // Lower fneg to an integer XOR with the sign-bit mask, bitcasting
    // non-integer scalars to integers as needed.
    auto [Res, ResTy, SubByReg, SubByRegTy] = MI.getFirst2RegLLTs();
    LLT TyInt =
        ResTy.changeElementType(NewEltTy: LLT::integer(SizeInBits: ResTy.getScalarSizeInBits()));
    Register CastedSubByReg = SubByReg;

    if (!SubByRegTy.getScalarType().isAnyScalar() &&
        !SubByRegTy.getScalarType().isInteger()) {
      auto BitcastDst = SubByRegTy.changeElementType(
          NewEltTy: LLT::integer(SizeInBits: SubByRegTy.getScalarSizeInBits()));
      CastedSubByReg = MIRBuilder.buildBitcast(Dst: BitcastDst, Src: SubByReg).getReg(Idx: 0);
    }

    auto SignMask = MIRBuilder.buildConstant(
        Res: TyInt, Val: APInt::getSignMask(BitWidth: TyInt.getScalarSizeInBits()));

    if (ResTy != TyInt) {
      Register NewDst =
          MIRBuilder.buildXor(Dst: TyInt, Src0: CastedSubByReg, Src1: SignMask).getReg(Idx: 0);
      MIRBuilder.buildBitcast(Dst: Res, Src: NewDst);
    } else
      MIRBuilder.buildXor(Dst: Res, Src0: CastedSubByReg, Src1: SignMask).getReg(Idx: 0);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_STRICT_FSUB: {
    auto [Res, LHS, RHS] = MI.getFirst3Regs();
    LLT Ty = MRI.getType(Reg: Res);

    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
    auto Neg = MIRBuilder.buildFNeg(Dst: Ty, Src0: RHS);

    if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
      MIRBuilder.buildStrictFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
    else
      MIRBuilder.buildFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FMAD:
    return lowerFMad(MI);
  case TargetOpcode::G_FFLOOR:
    return lowerFFloor(MI);
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND: {
    // lround(x) == (long)round(x)
    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg = MI.getOperand(i: 1).getReg();
    LLT SrcTy = MRI.getType(Reg: SrcReg);
    auto Round = MIRBuilder.buildInstr(Opc: TargetOpcode::G_INTRINSIC_ROUND, DstOps: {SrcTy},
                                       SrcOps: {SrcReg});
    MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_INTRINSIC_ROUND:
    return lowerIntrinsicRound(MI);
  case TargetOpcode::G_FRINT: {
    // Since round even is the assumed rounding mode for unconstrained FP
    // operations, rint and roundeven are the same operation.
    changeOpcode(MI, NewOpcode: TargetOpcode::G_INTRINSIC_ROUNDEVEN);
    return Legalized;
  }
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    // lrint(x) == (long)rint(x)
    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg = MI.getOperand(i: 1).getReg();
    LLT SrcTy = MRI.getType(Reg: SrcReg);
    auto Round =
        MIRBuilder.buildInstr(Opc: TargetOpcode::G_FRINT, DstOps: {SrcTy}, SrcOps: {SrcReg});
    MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    // Plain cmpxchg followed by comparing the old value against the expected
    // one to recompute the success flag.
    auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
    Register NewOldValRes = MRI.cloneVirtualRegister(VReg: OldValRes);
    MIRBuilder.buildAtomicCmpXchg(OldValRes: NewOldValRes, Addr, CmpVal, NewVal,
                                  MMO&: **MI.memoperands_begin());
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: SuccessRes, Op0: NewOldValRes, Op1: CmpVal);
    MIRBuilder.buildCopy(Res: OldValRes, Op: NewOldValRes);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(LoadMI&: cast<GAnyLoad>(Val&: MI));
  case TargetOpcode::G_STORE:
    return lowerStore(StoreMI&: cast<GStore>(Val&: MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
  case TargetOpcode::G_CTLS:
    return lowerBitCount(MI);
  case G_UADDO: {
    // Unsigned add overflowed iff the result wrapped below an operand.
    auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();

    Register NewRes = MRI.cloneVirtualRegister(VReg: Res);

    MIRBuilder.buildAdd(Dst: NewRes, Src0: LHS, Src1: RHS);
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CarryOut, Op0: NewRes, Op1: RHS);

    MIRBuilder.buildCopy(Res, Op: NewRes);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UADDE: {
    auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(Reg: CarryOut);
    const LLT Ty = MRI.getType(Reg: Res);

    Register NewRes = MRI.cloneVirtualRegister(VReg: Res);

    // Initial add of the two operands.
    auto TmpRes = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);

    // Initial check for carry.
    auto Carry = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CondTy, Op0: TmpRes, Op1: LHS);

    // Add the sum and the carry.
    auto ZExtCarryIn = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
    MIRBuilder.buildAdd(Dst: NewRes, Src0: TmpRes, Src1: ZExtCarryIn);

    // Second check for carry. We can only carry if the initial sum is all 1s
    // and the carry is set, resulting in a new sum of 0.
    auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
    auto ResEqZero =
        MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: NewRes, Op1: Zero);
    auto Carry2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: ResEqZero, Src1: CarryIn);
    MIRBuilder.buildOr(Dst: CarryOut, Src0: Carry, Src1: Carry2);

    MIRBuilder.buildCopy(Res, Op: NewRes);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBO: {
    // Unsigned subtract borrowed iff LHS < RHS.
    auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();

    MIRBuilder.buildSub(Dst: Res, Src0: LHS, Src1: RHS);
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: BorrowOut, Op0: LHS, Op1: RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBE: {
    auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(Reg: BorrowOut);
    const LLT Ty = MRI.getType(Reg: Res);

    // Initial subtract of the two operands.
    auto TmpRes = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS);

    // Initial check for borrow.
    auto Borrow = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: CondTy, Op0: TmpRes, Op1: LHS);

    // Subtract the borrow from the first subtract.
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Res: Ty, Op: BorrowIn);
    MIRBuilder.buildSub(Dst: Res, Src0: TmpRes, Src1: ZExtBorrowIn);

    // Second check for borrow. We can only borrow if the initial difference is
    // 0 and the borrow is set, resulting in a new difference of all 1s.
    auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
    auto TmpResEqZero =
        MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: TmpRes, Op1: Zero);
    auto Borrow2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: TmpResEqZero, Src1: BorrowIn);
    MIRBuilder.buildOr(Dst: BorrowOut, Src0: Borrow, Src1: Borrow2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UITOFP:
    return lowerUITOFP(MI);
  case G_SITOFP:
    return lowerSITOFP(MI);
  case G_FPTOUI:
    return lowerFPTOUI(MI);
  case G_FPTOSI:
    return lowerFPTOSI(MI);
  case G_FPTOUI_SAT:
  case G_FPTOSI_SAT:
    return lowerFPTOINT_SAT(MI);
  case G_FPTRUNC:
    return lowerFPTRUNC(MI);
  case G_FPOWI:
    return lowerFPOWI(MI);
  case G_FMODF:
    return lowerFMODF(MI);
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
    return lowerMinMax(MI);
  case G_SCMP:
  case G_UCMP:
    return lowerThreewayCompare(MI);
  case G_FCOPYSIGN:
    return lowerFCopySign(MI);
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
    return lowerFMinNumMaxNum(MI);
  case G_FMINIMUM:
  case G_FMAXIMUM:
    return lowerFMinimumMaximum(MI);
  case G_MERGE_VALUES:
    return lowerMergeValues(MI);
  case G_UNMERGE_VALUES:
    return lowerUnmergeValues(MI);
  case TargetOpcode::G_SEXT_INREG: {
    // Replicate the sign bit: shift left so the sign bit of the narrow value
    // lands in the MSB, then arithmetic-shift back down.
    assert(MI.getOperand(2).isImm() && "Expected immediate");
    int64_t SizeInBits = MI.getOperand(i: 2).getImm();

    auto [DstReg, SrcReg] = MI.getFirst2Regs();
    LLT DstTy = MRI.getType(Reg: DstReg);
    Register TmpRes = MRI.createGenericVirtualRegister(Ty: DstTy);

    auto MIBSz = MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - SizeInBits);
    MIRBuilder.buildShl(Dst: TmpRes, Src0: SrcReg, Src1: MIBSz->getOperand(i: 0));
    MIRBuilder.buildAShr(Dst: DstReg, Src0: TmpRes, Src1: MIBSz->getOperand(i: 0));
    MI.eraseFromParent();
    return Legalized;
  }
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return lowerExtractInsertVectorElt(MI);
  case G_SHUFFLE_VECTOR:
    return lowerShuffleVector(MI);
  case G_VECTOR_COMPRESS:
    return lowerVECTOR_COMPRESS(MI);
  case G_DYN_STACKALLOC:
    return lowerDynStackAlloc(MI);
  case G_STACKSAVE:
    return lowerStackSave(MI);
  case G_STACKRESTORE:
    return lowerStackRestore(MI);
  case G_EXTRACT:
    return lowerExtract(MI);
  case G_INSERT:
    return lowerInsert(MI);
  case G_BSWAP:
    return lowerBswap(MI);
  case G_BITREVERSE:
    return lowerBitreverse(MI);
  case G_READ_REGISTER:
  case G_WRITE_REGISTER:
    return lowerReadWriteRegister(MI);
  case G_UADDSAT:
  case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this with custom lowering and calling the
    // implementation functions.
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    if (LI.isLegalOrCustom(Query: {G_UMIN, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SADDSAT:
  case G_SSUBSAT: {
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());

    // FIXME: It would probably make more sense to see if G_SADDO is preferred,
    // since it's a shorter expansion. However, we would need to figure out the
    // preferred boolean type for the carry out for the query.
    if (LI.isLegalOrCustom(Query: {G_SMIN, Ty}) && LI.isLegalOrCustom(Query: {G_SMAX, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SSHLSAT:
  case G_USHLSAT:
    return lowerShlSat(MI);
  case G_ABS:
    return lowerAbsToAddXor(MI);
  case G_ABDS:
  case G_ABDU: {
    // Prefer the min/max expansion when the required min/max ops are legal.
    bool IsSigned = MI.getOpcode() == G_ABDS;
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    if ((IsSigned && LI.isLegal(Query: {G_SMIN, Ty}) && LI.isLegal(Query: {G_SMAX, Ty})) ||
        (!IsSigned && LI.isLegal(Query: {G_UMIN, Ty}) && LI.isLegal(Query: {G_UMAX, Ty}))) {
      return lowerAbsDiffToMinMax(MI);
    }
    return lowerAbsDiffToSelect(MI);
  }
  case G_FABS:
    return lowerFAbs(MI);
  case G_SELECT:
    return lowerSelect(MI);
  case G_IS_FPCLASS:
    return lowerISFPCLASS(MI);
  case G_SDIVREM:
  case G_UDIVREM:
    return lowerDIVREM(MI);
  case G_FSHL:
  case G_FSHR:
    return lowerFunnelShift(MI);
  case G_ROTL:
  case G_ROTR:
    return lowerRotate(MI);
  case G_MEMSET:
  case G_MEMCPY:
  case G_MEMMOVE:
    return lowerMemCpyFamily(MI);
  case G_MEMCPY_INLINE:
    return lowerMemcpyInline(MI);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
    return lowerEXT(MI);
  case G_TRUNC:
    return lowerTRUNC(MI);
  GISEL_VECREDUCE_CASES_NONSEQ
    return lowerVectorReduction(MI);
  case G_VAARG:
    return lowerVAArg(MI);
  case G_ATOMICRMW_SUB: {
    // atomicrmw sub x == atomicrmw add (-x); reuse the ADD lowering path.
    auto [Ret, Mem, Val] = MI.getFirst3Regs();
    const LLT ValTy = MRI.getType(Reg: Val);
    MachineMemOperand *MMO = *MI.memoperands_begin();

    auto VNeg = MIRBuilder.buildNeg(Dst: ValTy, Src0: Val);
    MIRBuilder.buildAtomicRMW(Opcode: G_ATOMICRMW_ADD, OldValRes: Ret, Addr: Mem, Val: VNeg, MMO&: *MMO);
    MI.eraseFromParent();
    return Legalized;
  }
  }
}
4938
4939Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4940 Align MinAlign) const {
4941 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4942 // datalayout for the preferred alignment. Also there should be a target hook
4943 // for this to allow targets to reduce the alignment and ignore the
4944 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4945 // the type.
4946 return std::max(a: Align(PowerOf2Ceil(A: Ty.getSizeInBytes())), b: MinAlign);
4947}
4948
4949MachineInstrBuilder
4950LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4951 MachinePointerInfo &PtrInfo) {
4952 MachineFunction &MF = MIRBuilder.getMF();
4953 const DataLayout &DL = MIRBuilder.getDataLayout();
4954 int FrameIdx = MF.getFrameInfo().CreateStackObject(Size: Bytes, Alignment, isSpillSlot: false);
4955
4956 unsigned AddrSpace = DL.getAllocaAddrSpace();
4957 LLT FramePtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
4958
4959 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI: FrameIdx);
4960 return MIRBuilder.buildFrameIndex(Res: FramePtrTy, Idx: FrameIdx);
4961}
4962
4963MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4964 const SrcOp &Val) {
4965 LLT SrcTy = Val.getLLTTy(MRI);
4966 Align StackTypeAlign =
4967 std::max(a: getStackTemporaryAlignment(Ty: SrcTy),
4968 b: getStackTemporaryAlignment(Ty: Res.getLLTTy(MRI)));
4969 MachinePointerInfo PtrInfo;
4970 auto StackTemp =
4971 createStackTemporary(Bytes: SrcTy.getSizeInBytes(), Alignment: StackTypeAlign, PtrInfo);
4972
4973 MIRBuilder.buildStore(Val, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4974 return MIRBuilder.buildLoad(Res, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4975}
4976
4977static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4978 LLT VecTy) {
4979 LLT IdxTy = B.getMRI()->getType(Reg: IdxReg);
4980 unsigned NElts = VecTy.getNumElements();
4981
4982 int64_t IdxVal;
4983 if (mi_match(R: IdxReg, MRI: *B.getMRI(), P: m_ICst(Cst&: IdxVal))) {
4984 if (IdxVal < VecTy.getNumElements())
4985 return IdxReg;
4986 // If a constant index would be out of bounds, clamp it as well.
4987 }
4988
4989 if (isPowerOf2_32(Value: NElts)) {
4990 APInt Imm = APInt::getLowBitsSet(numBits: IdxTy.getSizeInBits(), loBitsSet: Log2_32(Value: NElts));
4991 return B.buildAnd(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: Imm)).getReg(Idx: 0);
4992 }
4993
4994 return B.buildUMin(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: NElts - 1))
4995 .getReg(Idx: 0);
4996}
4997
/// Compute the address of element \p Index of the in-memory vector at
/// \p VecPtr, clamping the index into bounds first. Returns the register
/// holding the resulting pointer (same pointer type as \p VecPtr).
Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
                                                  Register Index) {
  LLT EltTy = VecTy.getElementType();

  // Calculate the element offset and add it to the pointer.
  unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
  assert(EltSize * 8 == EltTy.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  // Clamp first so the pointer arithmetic below can't step outside the vector.
  Index = clampVectorIndex(B&: MIRBuilder, IdxReg: Index, VecTy);

  // Convert index to the correct size for the address space.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AS = MRI.getType(Reg: VecPtr).getAddressSpace();
  unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
  LLT IdxTy = MRI.getType(Reg: Index).changeElementSize(NewEltSize: IndexSizeInBits);
  if (IdxTy != MRI.getType(Reg: Index))
    Index = MIRBuilder.buildSExtOrTrunc(Res: IdxTy, Op: Index).getReg(Idx: 0);

  // Byte offset = index * element size (in bytes).
  auto Mul = MIRBuilder.buildMul(Dst: IdxTy, Src0: Index,
                                 Src1: MIRBuilder.buildConstant(Res: IdxTy, Val: EltSize));

  LLT PtrTy = MRI.getType(Reg: VecPtr);
  return MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VecPtr, Op1: Mul).getReg(Idx: 0);
}
5023
5024#ifndef NDEBUG
5025/// Check that all vector operands have same number of elements. Other operands
5026/// should be listed in NonVecOp.
5027static bool hasSameNumEltsOnAllVectorOperands(
5028 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
5029 std::initializer_list<unsigned> NonVecOpIndices) {
5030 if (MI.getNumMemOperands() != 0)
5031 return false;
5032
5033 LLT VecTy = MRI.getType(MI.getReg(0));
5034 if (!VecTy.isVector())
5035 return false;
5036 unsigned NumElts = VecTy.getNumElements();
5037
5038 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
5039 MachineOperand &Op = MI.getOperand(OpIdx);
5040 if (!Op.isReg()) {
5041 if (!is_contained(NonVecOpIndices, OpIdx))
5042 return false;
5043 continue;
5044 }
5045
5046 LLT Ty = MRI.getType(Op.getReg());
5047 if (!Ty.isVector()) {
5048 if (!is_contained(NonVecOpIndices, OpIdx))
5049 return false;
5050 continue;
5051 }
5052
5053 if (Ty.getNumElements() != NumElts)
5054 return false;
5055 }
5056
5057 return true;
5058}
5059#endif
5060
5061/// Fill \p DstOps with DstOps that have same number of elements combined as
5062/// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
5063/// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
5064/// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
5065static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
5066 unsigned NumElts) {
5067 LLT LeftoverTy;
5068 assert(Ty.isVector() && "Expected vector type");
5069 LLT NarrowTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NumElts));
5070 int NumParts, NumLeftover;
5071 std::tie(args&: NumParts, args&: NumLeftover) =
5072 getNarrowTypeBreakDown(OrigTy: Ty, NarrowTy, LeftoverTy);
5073
5074 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
5075 for (int i = 0; i < NumParts; ++i) {
5076 DstOps.push_back(Elt: NarrowTy);
5077 }
5078
5079 if (LeftoverTy.isValid()) {
5080 assert(NumLeftover == 1 && "expected exactly one leftover");
5081 DstOps.push_back(Elt: LeftoverTy);
5082 }
5083}
5084
5085/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
5086/// made from \p Op depending on operand type.
5087static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
5088 MachineOperand &Op) {
5089 for (unsigned i = 0; i < N; ++i) {
5090 if (Op.isReg())
5091 Ops.push_back(Elt: Op.getReg());
5092 else if (Op.isImm())
5093 Ops.push_back(Elt: Op.getImm());
5094 else if (Op.isPredicate())
5095 Ops.push_back(Elt: static_cast<CmpInst::Predicate>(Op.getPredicate()));
5096 else
5097 llvm_unreachable("Unsupported type");
5098 }
5099}
5100
// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different element
// type.
//
// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases, e.g.
// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
    GenericMachineInstr &MI, unsigned NumElts,
    std::initializer_list<unsigned> NonVecOpIndices) {
  assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
         "Non-compatible opcode or not specified non-vector operands");
  unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps to use instruction found by CSE directly.
  // CSE copies found instruction into given vreg when building with vreg dest.
  SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
  // Output registers will be taken from created instructions.
  SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
  for (unsigned i = 0; i < NumDefs; ++i) {
    makeDstOps(DstOps&: OutputOpsPieces[i], Ty: MRI.getType(Reg: MI.getReg(Idx: i)), NumElts);
  }

  // Split vector input operands into sub-vectors with NumElts elts + Leftover.
  // Operands listed in NonVecOpIndices will be used as is without splitting;
  // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
  // scalar condition (op 1), immediate in sext_inreg (op 2).
  SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       ++UseIdx, ++UseNo) {
    if (is_contained(Set: NonVecOpIndices, Element: UseIdx)) {
      // Non-split operand: replicate it once per output piece.
      broadcastSrcOp(Ops&: InputOpsPieces[UseNo], N: OutputOpsPieces[0].size(),
                     Op&: MI.getOperand(i: UseIdx));
    } else {
      SmallVector<Register, 8> SplitPieces;
      extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: SplitPieces, MIRBuilder,
                         MRI);
      llvm::append_range(C&: InputOpsPieces[UseNo], R&: SplitPieces);
    }
  }

  // An uneven breakdown yields exactly one extra (smaller) leftover piece.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;

  // Take i-th piece of each input operand split and build sub-vector/scalar
  // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    SmallVector<DstOp, 2> Defs;
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      Defs.push_back(Elt: OutputOpsPieces[DstNo][i]);

    SmallVector<SrcOp, 3> Uses;
    for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
      Uses.push_back(Elt: InputOpsPieces[InputNo][i]);

    // Rebuild the same opcode on the narrow pieces, carrying over MI's flags.
    auto I = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: Defs, SrcOps: Uses, Flags: MI.getFlags());
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      OutputRegs[DstNo].push_back(Elt: I.getReg(Idx: DstNo));
  }

  // Merge small outputs into MI's output for each def operand.
  if (NumLeftovers) {
    for (unsigned i = 0; i < NumDefs; ++i)
      mergeMixedSubvectors(DstReg: MI.getReg(Idx: i), PartRegs: OutputRegs[i]);
  } else {
    for (unsigned i = 0; i < NumDefs; ++i)
      MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: i), Ops: OutputRegs[i]);
  }

  MI.eraseFromParent();
  return Legalized;
}
5182
/// Split a vector G_PHI into several PHIs over NumElts-sized pieces (plus one
/// possible smaller leftover piece). The unmerges for each incoming value are
/// inserted in the corresponding predecessor block; the narrow PHI results are
/// remerged into the original def after the PHI group of this block.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
                                        unsigned NumElts) {
  unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  SmallVector<DstOp, 8> OutputOpsPieces;
  SmallVector<Register, 8> OutputRegs;
  makeDstOps(DstOps&: OutputOpsPieces, Ty: MRI.getType(Reg: MI.getReg(Idx: 0)), NumElts);

  // Instructions that perform register split will be inserted in basic block
  // where register is defined (basic block is in the next operand).
  SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       UseIdx += 2, ++UseNo) {
    MachineBasicBlock &OpMBB = *MI.getOperand(i: UseIdx + 1).getMBB();
    MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
    extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: InputOpsPieces[UseNo],
                       MIRBuilder, MRI);
  }

  // Build PHIs with fewer elements.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
  MIRBuilder.setInsertPt(MBB&: *MI.getParent(), II: MI);
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    auto Phi = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI);
    Phi.addDef(
        RegNo: MRI.createGenericVirtualRegister(Ty: OutputOpsPieces[i].getLLTTy(MRI)));
    OutputRegs.push_back(Elt: Phi.getReg(Idx: 0));

    // One (value, predecessor MBB) pair per incoming edge of the original PHI.
    for (unsigned j = 0; j < NumInputs / 2; ++j) {
      Phi.addUse(RegNo: InputOpsPieces[j][i]);
      Phi.add(MO: MI.getOperand(i: 1 + j * 2 + 1));
    }
  }

  // Set the insert point after the existing PHIs
  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());

  // Merge small outputs into MI's def.
  if (NumLeftovers) {
    mergeMixedSubvectors(DstReg: MI.getReg(Idx: 0), PartRegs: OutputRegs);
  } else {
    MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: 0), Ops: OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}
5235
/// Narrow a G_UNMERGE_VALUES (type index 1 only) by unmerging the source into
/// NarrowTy pieces first, then unmerging each piece into the original DstTy
/// destinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(i: NumDst).getReg();
  LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  LLT SrcTy = MRI.getType(Reg: SrcReg);

  // Only the source type can be narrowed here, and narrowing to the
  // destination type would be a no-op.
  if (TypeIdx != 1 || NarrowTy == DstTy)
    return UnableToLegalize;

  // Requires compatible types. Otherwise SrcReg should have been defined by
  // merge-like instruction that would get artifact combined. Most likely
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
  assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");

  // The two-level unmerge below only works for even size breakdowns.
  if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
      (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely DstTy (smaller then register size) packed in SrcTy
  // (larger then register size) and since unmerge was not combined it will be
  // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
  // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.

  // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
  //
  // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
  // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
  // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
  auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  // Second level: each NarrowTy piece feeds PartsPerUnmerge of the original
  // destination registers, in order.
  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(RegNo: MI.getOperand(i: I * PartsPerUnmerge + J).getReg());
    MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
  }

  MI.eraseFromParent();
  return Legalized;
}
5284
/// Narrow a merge-like instruction (G_BUILD_VECTOR / G_CONCAT_VECTORS /
/// G_MERGE_VALUES) by first combining sources into NarrowTy pieces, then
/// merging the pieces into the original destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise user of DstReg did not perform unmerge
  // that should have been artifact combined. Most likely instruction that uses
  // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
  // is for old mir tests. Since the changes to more/fewer elements it should no
  // longer be possible to generate MIR like this when starting from llvm-ir
  // because LCMTy approach was replaced with merge/unmerge to vector elements.
  if (TypeIdx == 1) {
    assert(SrcTy.isVector() && "Expected vector types");
    assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
    if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
        (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
      return UnableToLegalize;
    // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
    //
    // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
    // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
    // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
    // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
    // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
    // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11

    // Flatten every source vector into a single stream of scalar elements.
    SmallVector<Register, 8> Elts;
    LLT EltTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getScalarType();
    for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
      auto Unmerge = MIRBuilder.buildUnmerge(Res: EltTy, Op: MI.getOperand(i).getReg());
      for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
        Elts.push_back(Elt: Unmerge.getReg(Idx: j));
    }

    // Regroup the scalar stream into NarrowTy-sized vectors.
    SmallVector<Register, 8> NarrowTyElts;
    unsigned NumNarrowTyElts = NarrowTy.getNumElements();
    unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
    for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
         ++i, Offset += NumNarrowTyElts) {
      ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
      NarrowTyElts.push_back(
          Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Pieces).getReg(Idx: 0));
    }

    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
    MI.eraseFromParent();
    return Legalized;
  }

  assert(TypeIdx == 0 && "Bad type index");
  if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
      (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely SrcTy (smaller then register size) packed in DstTy
  // (larger then register size) and since merge was not combined it will be
  // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
  // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.

  // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
  //
  // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
  // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
  // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
  SmallVector<Register, 8> NarrowTyElts;
  unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
  unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
  unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
  for (unsigned i = 0; i < NumParts; ++i) {
    // Each NarrowTy piece is merged from NumElts consecutive source operands.
    SmallVector<Register, 8> Sources;
    for (unsigned j = 0; j < NumElts; ++j)
      Sources.push_back(Elt: MI.getOperand(i: 1 + i * NumElts + j).getReg());
    NarrowTyElts.push_back(
        Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Sources).getReg(Idx: 0));
  }

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
  MI.eraseFromParent();
  return Legalized;
}
5370
/// Narrow G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT to operate on NarrowVecTy
/// pieces. Only possible when the element index is a known constant; a
/// variable index falls back to a full stack-based expansion.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  auto [DstReg, SrcVec] = MI.getFirst2Regs();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(i: 2).getReg();

  // The index is always the last operand for both opcodes.
  Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
  LLT VecTy = MRI.getType(Reg: SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  auto MaybeCst = getIConstantVRegValWithLookThrough(VReg: Idx, MRI);
  if (MaybeCst) {
    uint64_t IdxVal = MaybeCst->Value.getZExtValue();
    // Avoid out of bounds indexing the pieces.
    if (IdxVal >= VecTy.getNumElements()) {
      // Out-of-bounds access yields an undefined result.
      MIRBuilder.buildUndef(Res: DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    if (!NarrowVecTy.isVector()) {
      // Scalar pieces: fully scalarize, then patch/pick the IdxVal-th piece.
      SmallVector<Register, 8> SplitPieces;
      extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowVecTy,
                   NumParts: VecTy.getNumElements(), VRegs&: SplitPieces, MIRBuilder, MRI);
      if (IsInsert) {
        SplitPieces[IdxVal] = InsertVal;
        MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: SplitPieces);
      } else {
        MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: SplitPieces[IdxVal]);
      }
    } else {
      SmallVector<Register, 8> VecParts;
      LLT GCDTy = extractGCDType(Parts&: VecParts, DstTy: VecTy, NarrowTy: NarrowVecTy, SrcReg: SrcVec);

      // Build a sequence of NarrowTy pieces in VecParts for this operand.
      LLT LCMTy = buildLCMMergePieces(DstTy: VecTy, NarrowTy: NarrowVecTy, GCDTy, VRegs&: VecParts,
                                      PadStrategy: TargetOpcode::G_ANYEXT);

      unsigned NewNumElts = NarrowVecTy.getNumElements();

      // Locate the sub-vector holding the element and the index within it.
      LLT IdxTy = MRI.getType(Reg: Idx);
      int64_t PartIdx = IdxVal / NewNumElts;
      auto NewIdx =
          MIRBuilder.buildConstant(Res: IdxTy, Val: IdxVal - NewNumElts * PartIdx);

      if (IsInsert) {
        LLT PartTy = MRI.getType(Reg: VecParts[PartIdx]);

        // Use the adjusted index to insert into one of the subvectors.
        auto InsertPart = MIRBuilder.buildInsertVectorElement(
            Res: PartTy, Val: VecParts[PartIdx], Elt: InsertVal, Idx: NewIdx);
        VecParts[PartIdx] = InsertPart.getReg(Idx: 0);

        // Recombine the inserted subvector with the others to reform the result
        // vector.
        buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: VecParts);
      } else {
        MIRBuilder.buildExtractVectorElement(Res: DstReg, Val: VecParts[PartIdx], Idx: NewIdx);
      }
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
5450
/// Split a non-atomic, non-extending G_LOAD/G_STORE into NarrowTy-sized
/// accesses (plus one possible leftover piece), generating a pointer offset
/// and a narrowed MachineMemOperand for each piece.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // Byte-addressed memory: pieces must be whole bytes.
  if (!NarrowTy.isByteSized()) {
    LLVM_DEBUG(dbgs() << "Can't narrow load/store to non-byte-sized type\n");
    return UnableToLegalize;
  }

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (LdStMI.isAtomic())
    return UnableToLegalize;

  bool IsLoad = isa<GLoad>(Val: LdStMI);
  Register ValReg = LdStMI.getReg(Idx: 0);
  Register AddrReg = LdStMI.getPointerReg();
  LLT ValTy = MRI.getType(Reg: ValReg);

  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    // Loads only need the breakdown counts; destination registers are created
    // while emitting the narrow loads below.
    std::tie(args&: NumParts, args&: NumLeftover) = getNarrowTypeBreakDown(OrigTy: ValTy, NarrowTy, LeftoverTy);
  } else {
    // Stores need the value register split up front.
    if (extractParts(Reg: ValReg, RegTy: ValTy, MainTy: NarrowTy, LeftoverTy, VRegs&: NarrowRegs,
                     LeftoverVRegs&: NarrowLeftoverRegs, MIRBuilder, MRI)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  // NumParts stays -1 when the type could not be broken down.
  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(Reg: AddrReg);
  const LLT OffsetTy = LLT::integer(SizeInBits: PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each elements
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned NumParts, unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      MIRBuilder.materializeObjectPtrOffset(Res&: NewAddrReg, Op0: AddrReg, ValueTy: OffsetTy,
                                            Value: ByteOffset);

      // Derive a narrowed memory operand at the piece's byte offset.
      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(MMO: &MMO, Offset: ByteOffset, Ty: PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(Ty: PartTy);
        ValRegs.push_back(Elt: Dst);
        MIRBuilder.buildLoad(Res: Dst, Addr: NewAddrReg, MMO&: *NewMMO);
      } else {
        MIRBuilder.buildStore(Val: ValRegs[Idx], Addr: NewAddrReg, MMO&: *NewMMO);
      }
      // Big-endian walks from the high end of the value downwards.
      Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
    }

    return Offset;
  };

  unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
  unsigned HandledOffset =
      splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);

  // For loads, stitch the narrow results back into the original destination.
  if (IsLoad) {
    insertParts(DstReg: ValReg, ResultTy: ValTy, PartTy: NarrowTy, PartRegs: NarrowRegs,
                LeftoverTy, LeftoverRegs: NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}
5551
/// Top-level dispatch for the FewerElements legalize action: route \p MI to
/// the appropriate narrowing routine based on its opcode.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;
  GenericMachineInstr &GMI = cast<GenericMachineInstr>(Val&: MI);
  // NumElts == 1 means the pieces are scalars.
  unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  switch (MI.getOpcode()) {
  // Element-wise operations where every vector operand shares the same element
  // count: split uniformly via fewerElementsVectorMultiEltType.
  case G_IMPLICIT_DEF:
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FEXP10:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FLDEXP:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_LRINT:
  case G_INTRINSIC_LLRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_LROUND:
  case G_LLROUND:
  case G_INTRINSIC_TRUNC:
  case G_FMODF:
  case G_FCOS:
  case G_FSIN:
  case G_FTAN:
  case G_FACOS:
  case G_FASIN:
  case G_FATAN:
  case G_FATAN2:
  case G_FCOSH:
  case G_FSINH:
  case G_FTANH:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
  case G_UMULO:
  case G_SMULO:
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_CTLS:
  case G_FCOPYSIGN:
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_FPTOSI_SAT:
  case G_FPTOUI_SAT:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
  case G_UADDO:
  case G_USUBO:
  case G_UADDE:
  case G_USUBE:
  case G_SADDO:
  case G_SSUBO:
  case G_SADDE:
  case G_SSUBE:
  case G_STRICT_FADD:
  case G_STRICT_FSUB:
  case G_STRICT_FMUL:
  case G_STRICT_FMA:
  case G_STRICT_FLDEXP:
  case G_FFREXP:
  case G_TRUNC_SSAT_S:
  case G_TRUNC_SSAT_U:
  case G_TRUNC_USAT_U:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
  // Opcodes below carry a non-vector operand that must be broadcast rather
  // than split (predicate, mask, condition, immediate, ...).
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*cpm predicate*/});
  case G_IS_FPCLASS:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2, 3 /*mask,fpsem*/});
  case G_SELECT:
    if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector())
      return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*scalar cond*/});
  case G_PHI:
    return fewerElementsVectorPhi(MI&: GMI, NumElts);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowVecTy: NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(LdStMI&: cast<GLoadStore>(Val&: MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*imm*/});
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_VECREDUCE_SEQ_FADD:
  case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
    return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  case G_FPOWI:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*pow*/});
  case G_BITCAST:
    return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
  case G_INTRINSIC_FPTRUNC_ROUND:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2});
  default:
    return UnableToLegalize;
  }
}
5733
/// Narrow a G_BITCAST (result type index only) by splitting the source into
/// pieces whose bit width matches NarrowTy, bitcasting each piece, and merging
/// the results into the original destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
                                      LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
         "Not a bitcast operation");

  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

  // How many source elements fit (bit-wise) into one NarrowTy piece.
  unsigned NewElemCount =
      NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
  SmallVector<Register> SrcVRegs, BitcastVRegs;
  if (NewElemCount == 1) {
    // One source element per piece: a plain unmerge to scalars suffices.
    LLT SrcNarrowTy = SrcTy.getElementType();

    auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcNarrowTy, Op: SrcReg);
    getUnmergeResults(Regs&: SrcVRegs, MI: *Unmerge);
  } else {
    LLT SrcNarrowTy =
        SrcTy.changeVectorElementCount(EC: ElementCount::getFixed(MinVal: NewElemCount));

    // Split the Src and Dst Reg into smaller registers
    if (extractGCDType(Parts&: SrcVRegs, DstTy, NarrowTy: SrcNarrowTy, SrcReg) != SrcNarrowTy)
      return UnableToLegalize;
  }

  // Build new smaller bitcast instructions
  // Not supporting Leftover types for now but will have to
  for (Register Reg : SrcVRegs)
    BitcastVRegs.push_back(Elt: MIRBuilder.buildBitcast(Dst: NarrowTy, Src: Reg).getReg(Idx: 0));

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: BitcastVRegs);
  MI.eraseFromParent();
  return Legalized;
}
5771
// Split a G_SHUFFLE_VECTOR whose types are too wide into two half-width
// shuffles (or, where that is not expressible, build-vectors of extracted
// elements) and merge the Lo/Hi halves into the original destination.
// Requires result and both sources to share the same type, with a
// power-of-2 element count; always splits exactly in half.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
      MI.getFirst3RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  if (!isPowerOf2_32(Value: DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
  unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Reg: Src1Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc1Regs, MIRBuilder, MRI);
  extractParts(Reg: Src2Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc2Regs, MIRBuilder, MRI);
  // The four half-vectors the original mask indices can refer to, in mask
  // index order: src1-lo, src1-hi, src2-lo, src2-hi.
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs. Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element. This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= std::size(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(Elt: -1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= std::size(InputUsed)) {
        // More than two input vectors used! Give up on trying to create a
        // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Elt: Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element. This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= std::size(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(Elt: MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(Elt: MIRBuilder
                            .buildExtractVectorElement(
                                Res: EltTy, Val: Inputs[Input],
                                Idx: MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: Idx))
                            .getReg(Idx: 0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(Res: NarrowTy, Ops: SVOps).getReg(Idx: 0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
    } else if (NewElts == 1) {
      // Single-element "shuffle" of one input is just a copy.
      Output = MIRBuilder.buildCopy(Res: NarrowTy, Op: Inputs[InputUsed[0]]).getReg(Idx: 0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(Res: NarrowTy, Src1: Op0, Src2: Op1, Mask: Ops).getReg(Idx: 0);
    }

    // Reuse the mask scratch buffer for the next half.
    Ops.clear();
  }

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
5911
5912LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
5913 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5914 auto &RdxMI = cast<GVecReduce>(Val&: MI);
5915
5916 if (TypeIdx != 1)
5917 return UnableToLegalize;
5918
5919 // The semantics of the normal non-sequential reductions allow us to freely
5920 // re-associate the operation.
5921 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
5922
5923 if (NarrowTy.isVector() &&
5924 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
5925 return UnableToLegalize;
5926
5927 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
5928 SmallVector<Register> SplitSrcs;
5929 // If NarrowTy is a scalar then we're being asked to scalarize.
5930 const unsigned NumParts =
5931 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
5932 : SrcTy.getNumElements();
5933
5934 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5935 if (NarrowTy.isScalar()) {
5936 if (DstTy != NarrowTy)
5937 return UnableToLegalize; // FIXME: handle implicit extensions.
5938
5939 if (isPowerOf2_32(Value: NumParts)) {
5940 // Generate a tree of scalar operations to reduce the critical path.
5941 SmallVector<Register> PartialResults;
5942 unsigned NumPartsLeft = NumParts;
5943 while (NumPartsLeft > 1) {
5944 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
5945 PartialResults.emplace_back(
5946 Args: MIRBuilder
5947 .buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy},
5948 SrcOps: {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
5949 .getReg(Idx: 0));
5950 }
5951 SplitSrcs = PartialResults;
5952 PartialResults.clear();
5953 NumPartsLeft = SplitSrcs.size();
5954 }
5955 assert(SplitSrcs.size() == 1);
5956 MIRBuilder.buildCopy(Res: DstReg, Op: SplitSrcs[0]);
5957 MI.eraseFromParent();
5958 return Legalized;
5959 }
5960 // If we can't generate a tree, then just do sequential operations.
5961 Register Acc = SplitSrcs[0];
5962 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
5963 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[Idx]})
5964 .getReg(Idx: 0);
5965 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5966 MI.eraseFromParent();
5967 return Legalized;
5968 }
5969 SmallVector<Register> PartialReductions;
5970 for (unsigned Part = 0; Part < NumParts; ++Part) {
5971 PartialReductions.push_back(
5972 Elt: MIRBuilder.buildInstr(Opc: RdxMI.getOpcode(), DstOps: {DstTy}, SrcOps: {SplitSrcs[Part]})
5973 .getReg(Idx: 0));
5974 }
5975
5976 // If the types involved are powers of 2, we can generate intermediate vector
5977 // ops, before generating a final reduction operation.
5978 if (isPowerOf2_32(Value: SrcTy.getNumElements()) &&
5979 isPowerOf2_32(Value: NarrowTy.getNumElements())) {
5980 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
5981 }
5982
5983 Register Acc = PartialReductions[0];
5984 for (unsigned Part = 1; Part < NumParts; ++Part) {
5985 if (Part == NumParts - 1) {
5986 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {DstReg},
5987 SrcOps: {Acc, PartialReductions[Part]});
5988 } else {
5989 Acc = MIRBuilder
5990 .buildInstr(Opc: ScalarOpc, DstOps: {DstTy}, SrcOps: {Acc, PartialReductions[Part]})
5991 .getReg(Idx: 0);
5992 }
5993 }
5994 MI.eraseFromParent();
5995 return Legalized;
5996}
5997
5998LegalizerHelper::LegalizeResult
5999LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
6000 unsigned int TypeIdx,
6001 LLT NarrowTy) {
6002 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
6003 MI.getFirst3RegLLTs();
6004 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
6005 DstTy != NarrowTy)
6006 return UnableToLegalize;
6007
6008 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
6009 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
6010 "Unexpected vecreduce opcode");
6011 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
6012 ? TargetOpcode::G_FADD
6013 : TargetOpcode::G_FMUL;
6014
6015 SmallVector<Register> SplitSrcs;
6016 unsigned NumParts = SrcTy.getNumElements();
6017 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
6018 Register Acc = ScalarReg;
6019 for (unsigned i = 0; i < NumParts; i++)
6020 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[i]})
6021 .getReg(Idx: 0);
6022
6023 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
6024 MI.eraseFromParent();
6025 return Legalized;
6026}
6027
6028LegalizerHelper::LegalizeResult
6029LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
6030 LLT SrcTy, LLT NarrowTy,
6031 unsigned ScalarOpc) {
6032 SmallVector<Register> SplitSrcs;
6033 // Split the sources into NarrowTy size pieces.
6034 extractParts(Reg: SrcReg, Ty: NarrowTy,
6035 NumParts: SrcTy.getNumElements() / NarrowTy.getNumElements(), VRegs&: SplitSrcs,
6036 MIRBuilder, MRI);
6037 // We're going to do a tree reduction using vector operations until we have
6038 // one NarrowTy size value left.
6039 while (SplitSrcs.size() > 1) {
6040 SmallVector<Register> PartialRdxs;
6041 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
6042 Register LHS = SplitSrcs[Idx];
6043 Register RHS = SplitSrcs[Idx + 1];
6044 // Create the intermediate vector op.
6045 Register Res =
6046 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {LHS, RHS}).getReg(Idx: 0);
6047 PartialRdxs.push_back(Elt: Res);
6048 }
6049 SplitSrcs = std::move(PartialRdxs);
6050 }
6051 // Finally generate the requested NarrowTy based reduction.
6052 Observer.changingInstr(MI);
6053 MI.getOperand(i: 1).setReg(SplitSrcs[0]);
6054 Observer.changedInstr(MI);
6055 return Legalized;
6056}
6057
// Expand a wide G_SHL/G_LSHR/G_ASHR whose shift amount is a compile-time
// constant into operations on two half-width (HalfTy) registers. The value
// is unmerged into {InL, InH}; each opcode then has four cases depending on
// where the constant amount falls relative to the half width (NVTBits) and
// full width (VTBits).
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
  Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
  MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));

  // Shift by zero: just reassemble the halves unchanged.
  if (Amt.isZero()) {
    MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits;

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(RHS: VTBits)) {
      // Amount exceeds the full width: everything is shifted out.
      Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else if (Amt.ugt(RHS: NVTBits)) {
      // Amount spans past the low half: Lo is zero, Hi comes entirely from
      // the low input half shifted by the remainder.
      Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
      Hi = MIRBuilder.buildShl(Dst: NVT, Src0: InL,
                               Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly one half width: the halves simply slide over.
      Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
      Hi = InL;
    } else {
      // Amount within the low half: Hi combines its own shifted bits with
      // the bits that spill over from the top of the low half.
      Lo = MIRBuilder.buildShl(Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
      auto OrLHS =
          MIRBuilder.buildShl(Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    // Mirror image of the G_SHL cases: data moves from the high half down.
    if (Amt.ugt(RHS: VTBits)) {
      Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else if (Amt.ugt(RHS: NVTBits)) {
      Lo = MIRBuilder.buildLShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);

      auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
      Hi = MIRBuilder.buildLShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
    }
  } else {
    // G_ASHR: like G_LSHR, but vacated positions are filled with copies of
    // the sign bit (an AShr of the high half by NVTBits - 1).
    if (Amt.ugt(RHS: VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
    } else if (Amt.ugt(RHS: NVTBits)) {
      Lo = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);

      auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
      Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
    }
  }

  MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
6145
// Narrow-scalar a G_SHL/G_LSHR/G_ASHR.
// TypeIdx 1 only narrows the shift-amount operand (operand 2). TypeIdx 0
// splits the shifted value: either directly into many parts (multi-way, for
// large ratios), by constant-amount expansion, or by the general two-half
// expansion with runtime selects.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  if (TypeIdx == 1) {
    // Only the amount operand needs narrowing; truncate it in place.
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy: RequestedTy, OpIdx: 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(i: 2).getReg();
  LLT ShiftAmtTy = MRI.getType(Reg: Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Check if we should use multi-way splitting instead of recursive binary
  // splitting.
  //
  // Multi-way splitting directly decomposes wide shifts (e.g., 128-bit ->
  // 4×32-bit) in a single legalization step, avoiding the recursive overhead
  // and dependency chains created by usual binary splitting approach
  // (128->64->32).
  //
  // The >= 8 parts threshold ensures we only use this optimization when binary
  // splitting would require multiple recursive passes, avoiding overhead for
  // simple 2-way splits where binary approach is sufficient.
  if (RequestedTy.isValid() && RequestedTy.isScalar() &&
      DstEltSize % RequestedTy.getSizeInBits() == 0) {
    const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits();
    // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive
    // steps).
    if (NumParts >= 8)
      return narrowScalarShiftMultiway(MI, TargetTy: RequestedTy);
  }

  // Fall back to binary splitting:
  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = DstTy.getScalarType().changeElementSize(NewEltSize: NewBitSize);
  const LLT CondTy = LLT::scalar(SizeInBits: 1);

  // Known-constant amounts get the cheaper select-free expansion.
  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, Amt: VRegAndVal->Value, HalfTy,
                                       AmtTy: ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
  Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
  MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));

  // AmtExcess = Amt - NewBitSize (used when the shift crosses the halves);
  // AmtLack = NewBitSize - Amt (the complementary carry amount).
  auto AmtExcess = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: Amt, Src1: NewBits);
  auto AmtLack = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: NewBits, Src1: Amt);

  auto Zero = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
  auto IsShort = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: CondTy, Op0: Amt, Op1: NewBits);
  auto IsZero = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: CondTy, Op0: Amt, Op1: Zero);

  // Both the "short" (Amt < NewBitSize) and "long" (Amt >= NewBitSize)
  // variants are computed unconditionally and chosen with selects.
  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: Amt);

    auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: AmtLack);
    auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: Amt);
    auto HiS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: AmtExcess); // Hi from Lo part.

    // The extra IsZero select guards the Amt == 0 case, where AmtLack would
    // equal the half width and make the carry shift invalid.
    auto Lo = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL);
    auto Hi = MIRBuilder.buildSelect(
        Res: HalfTy, Tst: IsZero, Op0: InH, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL));

    ResultRegs[0] = Lo.getReg(Idx: 0);
    ResultRegs[1] = Hi.getReg(Idx: 0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy}, SrcOps: {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: Amt);
    auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: AmtLack);
    auto LoS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(Dst: HalfTy, Src0: InH, Src1: ShiftAmt); // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy},
                                     SrcOps: {InH, AmtExcess}); // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        Res: HalfTy, Tst: IsZero, Op0: InL, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL));

    auto Hi = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL);

    ResultRegs[0] = Lo.getReg(Idx: 0);
    ResultRegs[1] = Hi.getReg(Idx: 0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
6275
// Compute output word \p PartIdx (of \p NumParts) for a multi-word shift
// whose amount is a compile-time constant. Params.WordShift/BitShift must
// hold constant registers encoding the whole-word and intra-word components
// of the amount; returns the register holding the requested part.
Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode,
                                                 unsigned PartIdx,
                                                 unsigned NumParts,
                                                 ArrayRef<Register> SrcParts,
                                                 const ShiftParams &Params,
                                                 LLT TargetTy, LLT ShiftAmtTy) {
  auto WordShiftConst = getIConstantVRegVal(VReg: Params.WordShift, MRI);
  auto BitShiftConst = getIConstantVRegVal(VReg: Params.BitShift, MRI);
  assert(WordShiftConst && BitShiftConst && "Expected constants");

  const unsigned ShiftWords = WordShiftConst->getZExtValue();
  const unsigned ShiftBits = BitShiftConst->getZExtValue();
  // If the amount is word-aligned (no intra-word bit shift), each output
  // word is just a source word (or a fill value) — no OR-combining needed.
  const bool NeedsInterWordShift = ShiftBits != 0;

  switch (Opcode) {
  case TargetOpcode::G_SHL: {
    // Data moves from lower indices to higher indices
    // If this part would come from a source beyond our range, it's zero
    if (PartIdx < ShiftWords)
      return Params.Zero;

    unsigned SrcIdx = PartIdx - ShiftWords;
    if (!NeedsInterWordShift)
      return SrcParts[SrcIdx];

    // Combine shifted main part with carry from previous part
    auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
    if (SrcIdx > 0) {
      auto Lo = MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx - 1],
                                     Src1: Params.InvBitShift);
      return MIRBuilder.buildOr(Dst: TargetTy, Src0: Hi, Src1: Lo).getReg(Idx: 0);
    }
    return Hi.getReg(Idx: 0);
  }

  case TargetOpcode::G_LSHR: {
    // Data moves from higher indices to lower indices; out-of-range parts
    // are filled with zero.
    unsigned SrcIdx = PartIdx + ShiftWords;
    if (SrcIdx >= NumParts)
      return Params.Zero;
    if (!NeedsInterWordShift)
      return SrcParts[SrcIdx];

    // Combine shifted main part with carry from next part
    auto Lo = MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
    if (SrcIdx + 1 < NumParts) {
      auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: SrcParts[SrcIdx + 1],
                                    Src1: Params.InvBitShift);
      return MIRBuilder.buildOr(Dst: TargetTy, Src0: Lo, Src1: Hi).getReg(Idx: 0);
    }
    return Lo.getReg(Idx: 0);
  }

  case TargetOpcode::G_ASHR: {
    // Like LSHR but preserves sign bit
    unsigned SrcIdx = PartIdx + ShiftWords;
    if (SrcIdx >= NumParts)
      return Params.SignBit;
    if (!NeedsInterWordShift)
      return SrcParts[SrcIdx];

    // Only the original MSB part uses arithmetic shift to preserve sign. All
    // other parts use logical shift since they're just moving data bits.
    auto Lo =
        (SrcIdx == NumParts - 1)
            ? MIRBuilder.buildAShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift)
            : MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
    // The word above the top source word is the sign-fill value.
    Register HiSrc =
        (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit;
    auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: HiSrc, Src1: Params.InvBitShift);
    return MIRBuilder.buildOr(Dst: TargetTy, Src0: Lo, Src1: Hi).getReg(Idx: 0);
  }

  default:
    llvm_unreachable("not a shift");
  }
}
6352
6353Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode,
6354 Register MainOperand,
6355 Register ShiftAmt,
6356 LLT TargetTy,
6357 Register CarryOperand) {
6358 // This helper generates a single output part for variable shifts by combining
6359 // the main operand (shifted by BitShift) with carry bits from an adjacent
6360 // part.
6361
6362 // For G_ASHR, individual parts don't have their own sign bit, only the
6363 // complete value does. So we use LSHR for the main operand shift in ASHR
6364 // context.
6365 unsigned MainOpcode = (Opcode == TargetOpcode::G_ASHR)
6366 ? static_cast<unsigned>(TargetOpcode::G_LSHR)
6367 : Opcode;
6368
6369 // Perform the primary shift on the main operand
6370 Register MainShifted =
6371 MIRBuilder.buildInstr(Opc: MainOpcode, DstOps: {TargetTy}, SrcOps: {MainOperand, ShiftAmt})
6372 .getReg(Idx: 0);
6373
6374 // No carry operand available
6375 if (!CarryOperand.isValid())
6376 return MainShifted;
6377
6378 // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs,
6379 // so carry bits aren't needed.
6380 LLT ShiftAmtTy = MRI.getType(Reg: ShiftAmt);
6381 auto ZeroConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6382 LLT BoolTy = LLT::scalar(SizeInBits: 1);
6383 auto IsZeroBitShift =
6384 MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy, Op0: ShiftAmt, Op1: ZeroConst);
6385
6386 // Extract bits from the adjacent part that will "carry over" into this part.
6387 // The carry direction is opposite to the main shift direction, so we can
6388 // align the two shifted values before combining them with OR.
6389
6390 // Determine the carry shift opcode (opposite direction)
6391 unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR
6392 : TargetOpcode::G_SHL;
6393
6394 // Calculate inverse shift amount: BitWidth - ShiftAmt
6395 auto TargetBitsConst =
6396 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetTy.getScalarSizeInBits());
6397 auto InvShiftAmt = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: TargetBitsConst, Src1: ShiftAmt);
6398
6399 // Shift the carry operand
6400 Register CarryBits =
6401 MIRBuilder
6402 .buildInstr(Opc: CarryOpcode, DstOps: {TargetTy}, SrcOps: {CarryOperand, InvShiftAmt})
6403 .getReg(Idx: 0);
6404
6405 // If BitShift is 0, don't include carry bits (InvShiftAmt would equal
6406 // TargetBits which would be poison for the individual carry shift operation).
6407 auto ZeroReg = MIRBuilder.buildConstant(Res: TargetTy, Val: 0);
6408 Register SafeCarryBits =
6409 MIRBuilder.buildSelect(Res: TargetTy, Tst: IsZeroBitShift, Op0: ZeroReg, Op1: CarryBits)
6410 .getReg(Idx: 0);
6411
6412 // Combine the main shifted part with the carry bits
6413 return MIRBuilder.buildOr(Dst: TargetTy, Src0: MainShifted, Src1: SafeCarryBits).getReg(Idx: 0);
6414}
6415
// Multi-way expansion of a wide shift with a compile-time-constant amount:
// split the value into DstBits/TargetBits words and compute each output
// word directly via buildConstantShiftPart, with no runtime selects.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI,
                                                     const APInt &Amt,
                                                     LLT TargetTy,
                                                     LLT ShiftAmtTy) {
  // Any wide shift can be decomposed into WordShift + BitShift components.
  // When shift amount is known constant, directly compute the decomposition
  // values and generate constant registers.
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SrcReg = MI.getOperand(i: 1).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);

  const unsigned DstBits = DstTy.getScalarSizeInBits();
  const unsigned TargetBits = TargetTy.getScalarSizeInBits();
  const unsigned NumParts = DstBits / TargetBits;

  assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");

  // When the shift amount is known at compile time, we just calculate which
  // source parts contribute to each output part.

  SmallVector<Register, 8> SrcParts;
  extractParts(Reg: SrcReg, Ty: TargetTy, NumParts, VRegs&: SrcParts, MIRBuilder, MRI);

  if (Amt.isZero()) {
    // No shift needed, just copy
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcParts);
    MI.eraseFromParent();
    return Legalized;
  }

  ShiftParams Params;
  // Decompose the amount into whole-word and intra-word components.
  const unsigned ShiftWords = Amt.getZExtValue() / TargetBits;
  const unsigned ShiftBits = Amt.getZExtValue() % TargetBits;

  // Generate constants and values needed by all shift types
  Params.WordShift = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: ShiftWords).getReg(Idx: 0);
  Params.BitShift = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: ShiftBits).getReg(Idx: 0);
  Params.InvBitShift =
      MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - ShiftBits).getReg(Idx: 0);
  Params.Zero = MIRBuilder.buildConstant(Res: TargetTy, Val: 0).getReg(Idx: 0);

  // For ASHR, we need the sign-extended value to fill shifted-out positions
  if (MI.getOpcode() == TargetOpcode::G_ASHR)
    Params.SignBit =
        MIRBuilder
            .buildAShr(Dst: TargetTy, Src0: SrcParts[SrcParts.size() - 1],
                       Src1: MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1))
            .getReg(Idx: 0);

  // Each destination word is computed independently from the constant
  // decomposition.
  SmallVector<Register, 8> DstParts(NumParts);
  for (unsigned I = 0; I < NumParts; ++I)
    DstParts[I] = buildConstantShiftPart(Opcode: MI.getOpcode(), PartIdx: I, NumParts, SrcParts,
                                         Params, TargetTy, ShiftAmtTy);

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstParts);
  MI.eraseFromParent();
  return Legalized;
}
6475
6476LegalizerHelper::LegalizeResult
6477LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) {
6478 Register DstReg = MI.getOperand(i: 0).getReg();
6479 Register SrcReg = MI.getOperand(i: 1).getReg();
6480 Register AmtReg = MI.getOperand(i: 2).getReg();
6481 LLT DstTy = MRI.getType(Reg: DstReg);
6482 LLT ShiftAmtTy = MRI.getType(Reg: AmtReg);
6483
6484 const unsigned DstBits = DstTy.getScalarSizeInBits();
6485 const unsigned TargetBits = TargetTy.getScalarSizeInBits();
6486 const unsigned NumParts = DstBits / TargetBits;
6487
6488 assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
6489 assert(isPowerOf2_32(TargetBits) && "Target bit width must be power of 2");
6490
6491 // If the shift amount is known at compile time, we can use direct indexing
6492 // instead of generating select chains in the general case.
6493 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: AmtReg, MRI))
6494 return narrowScalarShiftByConstantMultiway(MI, Amt: VRegAndVal->Value, TargetTy,
6495 ShiftAmtTy);
6496
6497 // For runtime-variable shift amounts, we must generate a more complex
6498 // sequence that handles all possible shift values using select chains.
6499
6500 // Split the input into target-sized pieces
6501 SmallVector<Register, 8> SrcParts;
6502 extractParts(Reg: SrcReg, Ty: TargetTy, NumParts, VRegs&: SrcParts, MIRBuilder, MRI);
6503
6504 // Shifting by zero should be a no-op.
6505 auto ZeroAmtConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6506 LLT BoolTy = LLT::scalar(SizeInBits: 1);
6507 auto IsZeroShift =
6508 MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy, Op0: AmtReg, Op1: ZeroAmtConst);
6509
6510 // Any wide shift can be decomposed into two components:
6511 // 1. WordShift: number of complete target-sized words to shift
6512 // 2. BitShift: number of bits to shift within each word
6513 //
6514 // Example: 128-bit >> 50 with 32-bit target:
6515 // WordShift = 50 / 32 = 1 (shift right by 1 complete word)
6516 // BitShift = 50 % 32 = 18 (shift each word right by 18 bits)
6517 unsigned TargetBitsLog2 = Log2_32(Value: TargetBits);
6518 auto TargetBitsLog2Const =
6519 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBitsLog2);
6520 auto TargetBitsMask = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1);
6521
6522 Register WordShift =
6523 MIRBuilder.buildLShr(Dst: ShiftAmtTy, Src0: AmtReg, Src1: TargetBitsLog2Const).getReg(Idx: 0);
6524 Register BitShift =
6525 MIRBuilder.buildAnd(Dst: ShiftAmtTy, Src0: AmtReg, Src1: TargetBitsMask).getReg(Idx: 0);
6526
6527 // Fill values:
6528 // - SHL/LSHR: fill with zeros
6529 // - ASHR: fill with sign-extended MSB
6530 Register ZeroReg = MIRBuilder.buildConstant(Res: TargetTy, Val: 0).getReg(Idx: 0);
6531
6532 Register FillValue;
6533 if (MI.getOpcode() == TargetOpcode::G_ASHR) {
6534 auto TargetBitsMinusOneConst =
6535 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1);
6536 FillValue = MIRBuilder
6537 .buildAShr(Dst: TargetTy, Src0: SrcParts[NumParts - 1],
6538 Src1: TargetBitsMinusOneConst)
6539 .getReg(Idx: 0);
6540 } else {
6541 FillValue = ZeroReg;
6542 }
6543
6544 SmallVector<Register, 8> DstParts(NumParts);
6545
6546 // For each output part, generate a select chain that chooses the correct
6547 // result based on the runtime WordShift value. This handles all possible
6548 // word shift amounts by pre-calculating what each would produce.
6549 for (unsigned I = 0; I < NumParts; ++I) {
6550 // Initialize with appropriate default value for this shift type
6551 Register InBoundsResult = FillValue;
6552
6553 // clang-format off
6554 // Build a branchless select chain by pre-computing results for all possible
6555 // WordShift values (0 to NumParts-1). Each iteration nests a new select:
6556 //
6557 // K=0: select(WordShift==0, result0, FillValue)
6558 // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue))
6559 // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...)))
6560 // clang-format on
6561 for (unsigned K = 0; K < NumParts; ++K) {
6562 auto WordShiftKConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: K);
6563 auto IsWordShiftK = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy,
6564 Op0: WordShift, Op1: WordShiftKConst);
6565
6566 // Calculate source indices for this word shift
6567 //
6568 // For 4-part 128-bit value with K=1 word shift:
6569 // SHL: [3][2][1][0] << K => [2][1][0][Z]
6570 // -> (MainIdx = I-K, CarryIdx = I-K-1)
6571 // LSHR: [3][2][1][0] >> K => [Z][3][2][1]
6572 // -> (MainIdx = I+K, CarryIdx = I+K+1)
6573 int MainSrcIdx;
6574 int CarrySrcIdx; // Index for the word that provides the carried-in bits.
6575
6576 switch (MI.getOpcode()) {
6577 case TargetOpcode::G_SHL:
6578 MainSrcIdx = (int)I - (int)K;
6579 CarrySrcIdx = MainSrcIdx - 1;
6580 break;
6581 case TargetOpcode::G_LSHR:
6582 case TargetOpcode::G_ASHR:
6583 MainSrcIdx = (int)I + (int)K;
6584 CarrySrcIdx = MainSrcIdx + 1;
6585 break;
6586 default:
6587 llvm_unreachable("Not a shift");
6588 }
6589
6590 // Check bounds and build the result for this word shift
6591 Register ResultForK;
6592 if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) {
6593 Register MainOp = SrcParts[MainSrcIdx];
6594 Register CarryOp;
6595
6596 // Determine carry operand with bounds checking
6597 if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts)
6598 CarryOp = SrcParts[CarrySrcIdx];
6599 else if (MI.getOpcode() == TargetOpcode::G_ASHR &&
6600 CarrySrcIdx >= (int)NumParts)
6601 CarryOp = FillValue; // Use sign extension
6602
6603 ResultForK = buildVariableShiftPart(Opcode: MI.getOpcode(), MainOperand: MainOp, ShiftAmt: BitShift,
6604 TargetTy, CarryOperand: CarryOp);
6605 } else {
6606 // Out of bounds - use fill value for this k
6607 ResultForK = FillValue;
6608 }
6609
6610 // Select this result if WordShift equals k
6611 InBoundsResult =
6612 MIRBuilder
6613 .buildSelect(Res: TargetTy, Tst: IsWordShiftK, Op0: ResultForK, Op1: InBoundsResult)
6614 .getReg(Idx: 0);
6615 }
6616
6617 // Handle zero-shift special case: if shift is 0, use original input
6618 DstParts[I] =
6619 MIRBuilder
6620 .buildSelect(Res: TargetTy, Tst: IsZeroShift, Op0: SrcParts[I], Op1: InBoundsResult)
6621 .getReg(Idx: 0);
6622 }
6623
6624 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstParts);
6625 MI.eraseFromParent();
6626 return Legalized;
6627}
6628
// Widen the vector result of a G_PHI to MoreTy. Each incoming value is
// widened in its predecessor block; the (narrowed-back) result is handled in
// the PHI's own block.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  // PHI operands come in (value, predecessor-MBB) pairs starting at operand
  // 1. Widening code for each incoming value must be emitted in the
  // corresponding predecessor, before its terminator.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
    MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, OpIdx: I);
  }

  // Place the insert point one instruction before the first non-PHI so the
  // code produced for the widened result stays within the PHI group's block
  // prologue. NOTE(review): relies on at least this PHI preceding
  // getFirstNonPHI(); the pre-decrement would be invalid on an empty block.
  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
  Observer.changedInstr(MI);
  return Legalized;
}
6647
6648MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6649 unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6650 assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6651
6652 switch (Opcode) {
6653 default:
6654 llvm_unreachable(
6655 "getNeutralElementForVecReduce called with invalid opcode!");
6656 case TargetOpcode::G_VECREDUCE_ADD:
6657 case TargetOpcode::G_VECREDUCE_OR:
6658 case TargetOpcode::G_VECREDUCE_XOR:
6659 case TargetOpcode::G_VECREDUCE_UMAX:
6660 return MIRBuilder.buildConstant(Res: Ty, Val: 0);
6661 case TargetOpcode::G_VECREDUCE_MUL:
6662 return MIRBuilder.buildConstant(Res: Ty, Val: 1);
6663 case TargetOpcode::G_VECREDUCE_AND:
6664 case TargetOpcode::G_VECREDUCE_UMIN:
6665 return MIRBuilder.buildConstant(
6666 Res: Ty, Val: APInt::getAllOnes(numBits: Ty.getScalarSizeInBits()));
6667 case TargetOpcode::G_VECREDUCE_SMAX:
6668 return MIRBuilder.buildConstant(
6669 Res: Ty, Val: APInt::getSignedMinValue(numBits: Ty.getSizeInBits()));
6670 case TargetOpcode::G_VECREDUCE_SMIN:
6671 return MIRBuilder.buildConstant(
6672 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getSizeInBits()));
6673 case TargetOpcode::G_VECREDUCE_FADD:
6674 return MIRBuilder.buildFConstant(Res: Ty, Val: -0.0);
6675 case TargetOpcode::G_VECREDUCE_FMUL:
6676 return MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
6677 case TargetOpcode::G_VECREDUCE_FMINIMUM:
6678 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6679 assert(false && "getNeutralElementForVecReduce unimplemented for "
6680 "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6681 }
6682 llvm_unreachable("switch expected to return!");
6683}
6684
// Legalize MI by widening the vector type identified by TypeIdx to MoreTy
// (i.e. adding elements, padded with undef or a suitable neutral value).
// Dispatches on opcode; each case widens the relevant source/destination
// operands and returns Legalized, or UnableToLegalize when this
// opcode/TypeIdx combination is not handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    // Only the result type can be widened here.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    // Widen the stored value; the extra elements are not written because the
    // store's memory operand size is unchanged.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  // Binary operations: widen both sources and the destination uniformly.
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FCOPYSIGN:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_STRICT_FADD:
  case TargetOpcode::G_STRICT_FSUB:
  case TargetOpcode::G_STRICT_FMUL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    // The shift operand may have a different scalar type from the source and
    // destination operands.
    LLT ShiftMoreTy = MoreTy.changeElementType(
        NewEltTy: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getElementType());
    moreElementsVectorSrc(MI, MoreTy: ShiftMoreTy, OpIdx: 2);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  // Ternary operations: widen all three sources and the destination.
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_STRICT_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_EXTRACT:
    // Only the (vector) source can be widened; extracting from a padded
    // vector yields the same element.
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    Observer.changedInstr(MI);
    return Legalized;
  // Unary (one vector in, one vector out) operations.
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_BITREVERSE:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
  case TargetOpcode::G_ABS:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT: {
    auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
    if (TypeIdx == 1) {
      if (!CondTy.isScalar() ||
          DstTy.getElementCount() != MoreTy.getElementCount())
        return UnableToLegalize;

      // This is turning a scalar select of vectors into a vector
      // select. Broadcast the select condition.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: MoreTy, Src: CondReg);
      Observer.changingInstr(MI);
      MI.getOperand(i: 1).setReg(ShufSplat.getReg(Idx: 0));
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Vector-condition selects would also need their condition widened;
    // not handled here.
    if (CondTy.isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    // Rebuild with undef padding elements, then truncate the wide result
    // back down to the original destination type.
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Elt: Op.getReg());
    }

    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(Elt: MIRBuilder.buildUndef(Res: MoreTy.getScalarType()));
    }

    MIRBuilder.buildDeleteTrailingVectorElements(
        Res: MI.getOperand(i: 0).getReg(), Op0: MIRBuilder.buildInstr(Opc, DstOps: {MoreTy}, SrcOps: Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  // Conversions: source and destination element types differ, so widen each
  // side with its own element type but the shared new element count.
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_FPTRUNC:
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI_SAT:
  case TargetOpcode::G_FPTOUI_SAT:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    Observer.changingInstr(MI);
    LLT SrcExtTy;
    LLT DstExtTy;
    if (TypeIdx == 0) {
      DstExtTy = MoreTy;
      SrcExtTy = MoreTy.changeElementType(
          NewEltTy: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getElementType());
    } else {
      DstExtTy = MoreTy.changeElementType(
          NewEltTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
      SrcExtTy = MoreTy;
    }
    moreElementsVectorSrc(MI, MoreTy: SrcExtTy, OpIdx: 1);
    moreElementsVectorDst(MI, WideTy: DstExtTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
    // The result keeps its own (boolean-like) element type, only the
    // element count is widened.
    LLT CondTy = MoreTy.changeVectorElementType(
        NewEltTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
    moreElementsVectorDst(MI, WideTy: CondTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITCAST: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());

    // Scale the source element count so the bitcast's total bit width
    // matches the widened destination; bail if it doesn't divide evenly.
    unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
    if (coefficient % DstTy.getNumElements() != 0)
      return UnableToLegalize;

    coefficient = coefficient / DstTy.getNumElements();

    LLT NewTy = SrcTy.changeElementCount(
        EC: ElementCount::get(MinVal: coefficient, Scalable: MoreTy.isScalable()));
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy: NewTy, OpIdx: 1);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VECREDUCE_FADD:
  case TargetOpcode::G_VECREDUCE_FMUL:
  case TargetOpcode::G_VECREDUCE_ADD:
  case TargetOpcode::G_VECREDUCE_MUL:
  case TargetOpcode::G_VECREDUCE_AND:
  case TargetOpcode::G_VECREDUCE_OR:
  case TargetOpcode::G_VECREDUCE_XOR:
  case TargetOpcode::G_VECREDUCE_SMAX:
  case TargetOpcode::G_VECREDUCE_SMIN:
  case TargetOpcode::G_VECREDUCE_UMAX:
  case TargetOpcode::G_VECREDUCE_UMIN: {
    // Pad with undef, then overwrite the padding lanes with the reduction's
    // neutral element so the extra lanes do not change the result.
    LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    MachineOperand &MO = MI.getOperand(i: 1);
    auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO);
    auto NeutralElement = getNeutralElementForVecReduce(
        Opcode: MI.getOpcode(), MIRBuilder, Ty: MoreTy.getElementType());

    LLT IdxTy(TLI.getVectorIdxLLT(DL: MIRBuilder.getDataLayout()));
    for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
         i != e; i++) {
      auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: i);
      NewVec = MIRBuilder.buildInsertVectorElement(Res: MoreTy, Val: NewVec,
                                                   Elt: NeutralElement, Idx);
    }

    Observer.changingInstr(MI);
    MO.setReg(NewVec.getReg(Idx: 0));
    Observer.changedInstr(MI);
    return Legalized;
  }

  default:
    return UnableToLegalize;
  }
}
6950
// Rewrite a G_SHUFFLE_VECTOR whose destination element count differs from
// its source element count into one whose inputs and mask all share a single
// length, so that later legalization can treat it uniformly.
LegalizerHelper::LegalizeResult
LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  unsigned MaskNumElts = Mask.size();
  unsigned SrcNumElts = SrcTy.getNumElements();
  LLT DestEltTy = DstTy.getElementType();

  // Already the same length: nothing to do.
  if (MaskNumElts == SrcNumElts)
    return Legalized;

  if (MaskNumElts < SrcNumElts) {
    // Extend mask to match new destination vector size with
    // undef values.
    SmallVector<int, 16> NewMask(SrcNumElts, -1);
    llvm::copy(Range&: Mask, Out: NewMask.begin());

    // Widen the destination to the source length and re-emit the shuffle
    // with the padded mask.
    moreElementsVectorDst(MI, WideTy: SrcTy, OpIdx: 0);
    MIRBuilder.setInstrAndDebugLoc(MI);
    MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
                                  Src1: MI.getOperand(i: 1).getReg(),
                                  Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
    MI.eraseFromParent();

    return Legalized;
  }

  // Destination is longer than the sources: round the mask length up to a
  // multiple of the source length and widen the sources by concatenation.
  unsigned PaddedMaskNumElts = alignTo(Value: MaskNumElts, Align: SrcNumElts);
  unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
  LLT PaddedTy =
      DstTy.changeVectorElementCount(EC: ElementCount::getFixed(MinVal: PaddedMaskNumElts));

  // Create new source vectors by concatenating the initial
  // source vectors with undefined vectors of the same size.
  auto Undef = MIRBuilder.buildUndef(Res: SrcTy);
  SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(Idx: 0));
  SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(Idx: 0));
  MOps1[0] = MI.getOperand(i: 1).getReg();
  MOps2[0] = MI.getOperand(i: 2).getReg();

  auto Src1 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps1);
  auto Src2 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps2);

  // Readjust mask for new input vector length. Indices that referred to the
  // second source must be shifted past the first source's new (padded) end.
  SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
  for (unsigned I = 0; I != MaskNumElts; ++I) {
    int Idx = Mask[I];
    if (Idx >= static_cast<int>(SrcNumElts))
      Idx += PaddedMaskNumElts - SrcNumElts;
    MappedOps[I] = Idx;
  }

  // If we got more elements than required, extract subvector.
  if (MaskNumElts != PaddedMaskNumElts) {
    auto Shuffle =
        MIRBuilder.buildShuffleVector(Res: PaddedTy, Src1, Src2, Mask: MappedOps);

    // Take only the first MaskNumElts lanes of the padded shuffle result.
    SmallVector<Register, 16> Elts(MaskNumElts);
    for (unsigned I = 0; I < MaskNumElts; ++I) {
      Elts[I] =
          MIRBuilder.buildExtractVectorElementConstant(Res: DestEltTy, Val: Shuffle, Idx: I)
              .getReg(Idx: 0);
    }
    MIRBuilder.buildBuildVector(Res: DstReg, Ops: Elts);
  } else {
    MIRBuilder.buildShuffleVector(Res: DstReg, Src1, Src2, Mask: MappedOps);
  }

  MI.eraseFromParent();
  return LegalizerHelper::LegalizeResult::Legalized;
}
7022
// Widen a G_SHUFFLE_VECTOR to MoreTy by widening both inputs and the result
// and remapping the mask indices onto the wider input layout.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  // Mismatched dst/src lengths are first equalized, then (if needed) this
  // function can run again on the canonical form.
  if (DstTy.isVector() && Src1Ty.isVector() &&
      DstTy.getNumElements() != Src1Ty.getNumElements()) {
    return equalizeVectorShuffleLengths(MI);
  }

  if (TypeIdx != 0)
    return UnableToLegalize;

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
  moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);

  // Adjust mask based on new input vector length. Indices into the second
  // input shift up by the widening delta; padding lanes stay undef (-1).
  SmallVector<int, 16> NewMask(WidenNumElts, -1);
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask[I] = Idx;
    else
      NewMask[I] = Idx - NumElts + WidenNumElts;
  }
  moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
                                Src1: MI.getOperand(i: 1).getReg(),
                                Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
  MI.eraseFromParent();
  return Legalized;
}
7063
// Multi-precision multiply: computes DstRegs = Src1Regs * Src2Regs where
// each operand is split into NarrowTy-sized parts (little-endian, index 0 is
// the least significant part). Classic schoolbook algorithm: for each result
// part, sum the low halves of the contributing part-products, the high
// halves carried over from the previous part, and the accumulated carries.
// DstRegs may have more parts than the sources (e.g. for a full 2N-wide
// product); its size determines how many result parts are produced.
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx], Src1: Src2Regs[DstIdx]).getReg(Idx: 0);
  DstRegs[DstIdx] = FactorSum;

  // Carries accumulated while summing the factors of the previous DstIdx;
  // they contribute to the next (more significant) result part.
  Register CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(a: DstIdx, b: SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx - i], Src1: Src2Regs[i]);
      Factors.push_back(Elt: Mul.getReg(Idx: 0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(a: DstIdx - 1, b: SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(Dst: NarrowTy, Src0: Src1Regs[DstIdx - 1 - i], Src1: Src2Regs[i]);
      Factors.push_back(Elt: Umulh.getReg(Idx: 0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(Elt: CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      // Use overflow-reporting adds so carries can be zero-extended and
      // summed for the next result part.
      MachineInstrBuilder Uaddo =
          B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: Factors[0], Op1: Factors[1]);
      FactorSum = Uaddo.getReg(Idx: 0);
      CarrySum = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1)).getReg(Idx: 0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: FactorSum, Op1: Factors[i]);
        FactorSum = Uaddo.getReg(Idx: 0);
        MachineInstrBuilder Carry = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1));
        CarrySum = B.buildAdd(Dst: NarrowTy, Src0: CarrySum, Src1: Carry).getReg(Idx: 0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(Dst: NarrowTy, Src0: Factors[0], Src1: Factors[1]).getReg(Idx: 0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(Dst: NarrowTy, Src0: FactorSum, Src1: Factors[i]).getReg(Idx: 0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
7126
// Narrow a wide scalar add/sub (with or without carry-in/carry-out) into a
// chain of NarrowTy-sized operations linked through their carry bits:
// G_UADDO/G_USUBO on the lowest part, G_UADDE/G_USUBE on middle parts, and
// the signed variant on the final part when the original op reports signed
// overflow.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT DstType = MRI.getType(Reg: DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  // OpO: opcode for the lowest part (produces the first carry-out).
  // OpE: opcode for middle parts (consumes and produces a carry).
  // OpF: opcode for the final part; signed variant when the original op's
  //      overflow flag has signed semantics.
  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(i: NumDefs).getReg();
  Register Src2 = MI.getOperand(i: NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(i: 1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(i: NumDefs + 2).getReg();

  // Split both sources into NarrowTy parts plus an optional smaller
  // "leftover" part when NarrowTy does not evenly divide the width.
  LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Reg: Src1, RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: Src1Regs, LeftoverVRegs&: Src1Left,
               MIRBuilder, MRI);
  extractParts(Reg: Src2, RegTy, MainTy: NarrowTy, LeftoverTy&: DummyTy, VRegs&: Src2Regs, LeftoverVRegs&: Src2Left, MIRBuilder,
               MRI);

  int NarrowParts = Src1Regs.size();
  Src1Regs.append(RHS: Src1Left);
  Src2Regs.append(RHS: Src2Left);
  DstRegs.reserve(N: Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(Ty: MRI.getType(Reg: Src1Regs[i]));
    Register CarryOut;
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;
    else
      CarryOut = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 1));

    if (!CarryIn) {
      // First link of the chain (no carry-in yet).
      MIRBuilder.buildInstr(Opc: OpO, DstOps: {DstReg, CarryOut},
                            SrcOps: {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      // Final link: may use the signed variant for the overflow flag.
      MIRBuilder.buildInstr(Opc: OpF, DstOps: {DstReg, CarryOut},
                            SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(Opc: OpE, DstOps: {DstReg, CarryOut},
                            SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(Elt: DstReg);
    // Thread this part's carry-out into the next part.
    CarryIn = CarryOut;
  }
  insertParts(DstReg: MI.getOperand(i: 0).getReg(), ResultTy: RegTy, PartTy: NarrowTy,
              PartRegs: ArrayRef(DstRegs).take_front(N: NarrowParts), LeftoverTy,
              LeftoverRegs: ArrayRef(DstRegs).drop_front(N: NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
7222
7223LegalizerHelper::LegalizeResult
7224LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
7225 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
7226
7227 LLT Ty = MRI.getType(Reg: DstReg);
7228 if (Ty.isVector())
7229 return UnableToLegalize;
7230
7231 unsigned Size = Ty.getSizeInBits();
7232 unsigned NarrowSize = NarrowTy.getSizeInBits();
7233 if (Size % NarrowSize != 0)
7234 return UnableToLegalize;
7235
7236 unsigned NumParts = Size / NarrowSize;
7237 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
7238 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
7239
7240 SmallVector<Register, 2> Src1Parts, Src2Parts;
7241 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
7242 extractParts(Reg: Src1, Ty: NarrowTy, NumParts, VRegs&: Src1Parts, MIRBuilder, MRI);
7243 extractParts(Reg: Src2, Ty: NarrowTy, NumParts, VRegs&: Src2Parts, MIRBuilder, MRI);
7244 multiplyRegisters(DstRegs&: DstTmpRegs, Src1Regs: Src1Parts, Src2Regs: Src2Parts, NarrowTy);
7245
7246 // Take only high half of registers if this is high mul.
7247 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
7248 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
7249 MI.eraseFromParent();
7250 return Legalized;
7251}
7252
7253LegalizerHelper::LegalizeResult
7254LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
7255 LLT NarrowTy) {
7256 if (TypeIdx != 0)
7257 return UnableToLegalize;
7258
7259 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
7260
7261 Register Src = MI.getOperand(i: 1).getReg();
7262 LLT SrcTy = MRI.getType(Reg: Src);
7263
7264 // If all finite floats fit into the narrowed integer type, we can just swap
7265 // out the result type. This is practically only useful for conversions from
7266 // half to at least 16-bits, so just handle the one case.
7267 if (SrcTy.getScalarType() != LLT::scalar(SizeInBits: 16) ||
7268 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
7269 return UnableToLegalize;
7270
7271 Observer.changingInstr(MI);
7272 narrowScalarDst(MI, NarrowTy, OpIdx: 0,
7273 ExtOpcode: IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
7274 Observer.changedInstr(MI);
7275 return Legalized;
7276}
7277
// Narrow the source of a G_EXTRACT: split the wide source into NarrowTy
// parts, then for each part compute which (if any) of its bits overlap the
// extracted range and re-extract just that segment. The segments are merged
// back into the original destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
               MIRBuilder, MRI);

  // [OpStart, OpStart+OpSize) is the extracted bit range within the wide
  // source; walk each NarrowSize-aligned part and intersect it with that
  // range.
  Register OpReg = MI.getOperand(i: 0).getReg();
  uint64_t OpStart = MI.getOperand(i: 2).getImm();
  uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(Elt: SrcRegs[i]);
      continue;
    }

    // Compute the overlap of the extracted range with this part:
    // ExtractOffset is where the segment starts within the part, SegSize is
    // how many bits of the part contribute to the result.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      ExtractOffset = 0;
      SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - SrcStart);
    } else {
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(a: SrcStart + NarrowSize - OpStart, b: OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
      MIRBuilder.buildExtract(Res: SegReg, Src: SrcRegs[i], Index: ExtractOffset);
    }

    DstRegs.push_back(Elt: SegReg);
  }

  // Reassemble the collected segments into the destination.
  Register DstReg = MI.getOperand(i: 0).getReg();
  if (MRI.getType(Reg: DstReg).isVector())
    MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
  else
    MIRBuilder.buildCopy(Res: DstReg, Op: DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
7344
// Narrow a G_INSERT: split the wide base value into NarrowTy parts (plus an
// optional leftover), and for each part either forward it unchanged, replace
// it wholesale, or splice in the overlapping segment of the inserted value.
// The parts are then merged back into the original destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  LLT LeftoverTy;
  extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: SrcRegs,
               LeftoverVRegs&: LeftoverRegs, MIRBuilder, MRI);

  SrcRegs.append(RHS: LeftoverRegs);

  // [OpStart, OpStart+OpSize) is the bit range being overwritten in the wide
  // value; walk each NarrowSize-aligned part and intersect it with that
  // range.
  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(i: 2).getReg();
  uint64_t OpStart = MI.getOperand(i: 3).getImm();
  uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(Elt: OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(Reg: SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
      MIRBuilder.buildAnyExt(Res: SrcReg, Op: SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(Elt: SrcReg);
      continue;
    }

    // Compute the overlap: ExtractOffset is where the contributing bits start
    // within the inserted value, InsertOffset is where they land within this
    // part, SegSize is how many bits overlap.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - DstStart);
    } else {
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
          std::min(a: NarrowSize - InsertOffset, b: OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
      MIRBuilder.buildExtract(Res: SegReg, Src: OpReg, Index: ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
    MIRBuilder.buildInsert(Res: DstReg, Src: SrcReg, Op: SegReg, Index: InsertOffset);
    DstRegs.push_back(Elt: DstReg);
  }

  // If the leftover part was any-extended to NarrowTy above, the merged
  // result is wider than the original type and must be truncated back.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(i: 0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: WideSize));
    MIRBuilder.buildMergeLikeInstr(Res: MergeReg, Ops: DstRegs);
    MIRBuilder.buildTrunc(Res: DstReg, Op: MergeReg);
  } else
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
7426
/// Narrow a simple two-source, one-result operation (the opcode is reused
/// unchanged per part) by splitting both sources into NarrowTy parts plus an
/// optional leftover part, applying the operation part-wise, and re-merging
/// the partial results into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);

  assert(MI.getNumOperands() == 3 && TypeIdx == 0);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  LLT LeftoverTy;
  // Split the first source; this also determines LeftoverTy.
  if (!extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
                    VRegs&: Src0Regs, LeftoverVRegs&: Src0LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  LLT Unused;
  // The second source has the same type, so it must split the same way.
  if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
                    VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  // Apply the operation to each NarrowTy-sized pair of parts.
  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
                                      SrcOps: {Src0Regs[I], Src1Regs[I]});
    DstRegs.push_back(Elt: Inst.getReg(Idx: 0));
  }

  // And to the leftover parts (if any) at their own, smaller type.
  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(
      Opc: MI.getOpcode(),
      DstOps: {LeftoverTy}, SrcOps: {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
    DstLeftoverRegs.push_back(Elt: Inst.getReg(Idx: 0));
  }

  insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
              LeftoverTy, LeftoverRegs: DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
7467
/// Narrow a scalar extension (type index 0, the result). The source is broken
/// into GCD-sized pieces, padded up to the LCM type using the extension's own
/// opcode as the padding strategy, then re-merged into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
                                 LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, SrcReg] = MI.getFirst2Regs();

  LLT DstTy = MRI.getType(Reg: DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  SmallVector<Register, 8> Parts;
  LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  // Pad using the extend opcode itself so the high parts get the right bits.
  LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, VRegs&: Parts, PadStrategy: MI.getOpcode());
  buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: Parts);

  MI.eraseFromParent();
  return Legalized;
}
7488
/// Narrow a G_SELECT with a scalar condition by splitting both value operands
/// into NarrowTy parts (plus leftovers), emitting one select per part reusing
/// the same condition, and re-merging the selected parts.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(i: 1).getReg();
  LLT CondTy = MRI.getType(Reg: CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
  LLT LeftoverTy;
  // Split the true value; this also determines LeftoverTy.
  if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
                    VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  LLT Unused;
  // The false value has the same type, so it must split the same way.
  if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
                    VRegs&: Src2Regs, LeftoverVRegs&: Src2LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  // One select per NarrowTy part, all sharing the original condition.
  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(Res: NarrowTy,
                                         Tst: CondReg, Op0: Src1Regs[I], Op1: Src2Regs[I]);
    DstRegs.push_back(Elt: Select.getReg(Idx: 0));
  }

  // And one per leftover part at the leftover type.
  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
      Res: LeftoverTy, Tst: CondReg, Op0: Src1LeftoverRegs[I], Op1: Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Elt: Select.getReg(Idx: 0));
  }

  insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
              LeftoverTy, LeftoverRegs: DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
7534
/// Narrow G_CTLZ / G_CTLZ_ZERO_UNDEF on a source (type index 1) that is
/// exactly twice NarrowTy: split into Hi:Lo halves and combine per-half
/// counts. Only the exact 2x split is handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
    auto HiIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
                                Op0: UnmergeSrc.getReg(Idx: 1), Op1: C_0);
    // The Lo count may only stay undef-on-zero if the original op allows it;
    // the Hi count is only used when Hi != 0, so zero-undef is always fine.
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0)) :
      B.buildCTLZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
    auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSize);
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
    B.buildSelect(Res: DstReg, Tst: HiIsZero, Op0: HiIsZeroCTLZ, Op1: HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
7567
/// Narrow G_CTTZ / G_CTTZ_ZERO_UNDEF on a source (type index 1) that is
/// exactly twice NarrowTy: the mirror image of narrowScalarCTLZ, counting
/// from the low half instead. Only the exact 2x split is handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
    auto LoIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
                                Op0: UnmergeSrc.getReg(Idx: 0), Op1: C_0);
    // The Hi count may only stay undef-on-zero if the original op allows it;
    // the Lo count is only used when Lo != 0, so zero-undef is always fine.
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1)) :
      B.buildCTTZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
    auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(Dst: DstTy, Src0: HiCTTZ, Src1: C_NarrowSize);
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
    B.buildSelect(Res: DstReg, Tst: LoIsZero, Op0: LoIsZeroCTTZ, Op1: LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
7600
/// Narrow a count-leading-sign-bits operation (type index 1) on a source that
/// is exactly twice NarrowTy, by splitting into Hi:Lo halves: if Hi consists
/// entirely of sign bits, the count continues into Lo; otherwise only Hi's
/// count matters.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLS(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  // Only the exact 2x scalar split is handled.
  if (!SrcTy.isScalar() || SrcTy.getSizeInBits() != 2 * NarrowSize)
    return UnableToLegalize;

  MachineIRBuilder &B = MIRBuilder;

  auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
  Register Lo = UnmergeSrc.getReg(Idx: 0);
  Register Hi = UnmergeSrc.getReg(Idx: 1);

  // Sign = Hi >> (NarrowSize - 1): all-ones if Hi is negative, else zero.
  auto ShAmt = B.buildConstant(Res: NarrowTy, Val: NarrowSize - 1);
  auto Sign = B.buildAShr(Dst: NarrowTy, Src0: Hi, Src1: ShAmt);

  // Hi == Sign exactly when Hi is entirely copies of its sign bit.
  auto HiIsSign = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: Hi, Op1: Sign);

  // Invert Lo if Hi is negative. Then count the leading zeros. If there are no
  // leading zeros, then the MSB of Lo is different than the MSB of Hi.
  // Otherwise the leading zeros represent additional sign bits of the original
  // value.
  // NOTE(review): LoInv/LoCTLZ are built with DstTy although Lo and Sign have
  // type NarrowTy — this appears to assume DstTy == NarrowTy here; confirm.
  auto LoInv = B.buildXor(Dst: DstTy, Src0: Lo, Src1: Sign);
  auto LoCTLZ = B.buildCTLZ(Dst: DstTy, Src0: LoInv);

  // Add NarrowSize-1 to LoCTLZ. This is the full CTLS if Hi is all sign bits.
  auto C_NarrowSizeM1 = B.buildConstant(Res: DstTy, Val: NarrowSize - 1);
  auto HiIsSignCTLS = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSizeM1);

  auto HiCTLS = B.buildCTLS(Dst: DstTy, Src0: Hi);

  B.buildSelect(Res: DstReg, Tst: HiIsSign, Op0: HiIsSignCTLS, Op1: HiCTLS);

  MI.eraseFromParent();
  return Legalized;
}
7642
/// Narrow G_CTPOP on a source (type index 1) that is exactly twice NarrowTy:
/// popcount(Hi:Lo) == popcount(Hi) + popcount(Lo).
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));

    // Count each half at the destination type and sum; no carry is possible
    // since each half contributes at most NarrowSize.
    auto LoCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
    auto HiCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
    MIRBuilder.buildAdd(Dst: DstReg, Src0: HiCTPOP, Src1: LoCTPOP);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
7665
/// Narrow the exponent operand (type index 1) of G_FLDEXP: clamp the exponent
/// into the signed range of NarrowTy, then truncate it in place. Clamping
/// preserves the result since any exponent beyond the clamp range already
/// over/underflows the floating-point value.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  MachineIRBuilder &B = MIRBuilder;
  Register ExpReg = MI.getOperand(i: 2).getReg();
  LLT ExpTy = MRI.getType(Reg: ExpReg);

  unsigned ClampSize = NarrowTy.getScalarSizeInBits();

  // Clamp the exponent to the range of the target type.
  auto MinExp = B.buildConstant(Res: ExpTy, Val: minIntN(N: ClampSize));
  auto ClampMin = B.buildSMax(Dst: ExpTy, Src0: ExpReg, Src1: MinExp);
  auto MaxExp = B.buildConstant(Res: ExpTy, Val: maxIntN(N: ClampSize));
  auto Clamp = B.buildSMin(Dst: ExpTy, Src0: ClampMin, Src1: MaxExp);

  // Replace the exponent operand in place rather than rebuilding the op.
  auto Trunc = B.buildTrunc(Res: NarrowTy, Op: Clamp);
  Observer.changingInstr(MI);
  MI.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
  Observer.changedInstr(MI);
  return Legalized;
}
7690
/// Lower bit-counting operations (G_CTLZ[_ZERO_UNDEF], G_CTTZ[_ZERO_UNDEF],
/// G_CTPOP, G_CTLS) into sequences of simpler operations, choosing between
/// alternative expansions based on what the target's legalizer supports.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // Treat Libcall/Custom like Legal: the op will be handled somehow, so it is
  // worth emitting.
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Query: Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
    unsigned Len = SrcTy.getScalarSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
      auto ICmp = MIRBuilder.buildICmp(
        Pred: CmpInst::ICMP_EQ, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
      MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(A: Len);
    // Smear the highest set bit into every lower position, so popcount of
    // the result equals Len minus the number of leading zeros.
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
        Dst: SrcTy, Src0: Op, Src1: MIRBuilder.buildLShr(Dst: SrcTy, Src0: Op, Src1: MIBShiftAmt));
      Op = MIBOp.getReg(Idx: 0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: Op);
    MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIRBuilder.buildConstant(Res: DstTy, Val: Len),
                        Src1: MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    unsigned Len = SrcTy.getScalarSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
      auto Zero = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
      auto ICmp = MIRBuilder.buildICmp(
        Pred: CmpInst::ICMP_EQ, Res: DstTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: Zero);
      auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
      MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(Res: SrcTy, Val: -1);
    auto MIBNot = MIRBuilder.buildXor(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1);
    // ~x & (x - 1) has a 1 exactly in each trailing-zero position of x.
    auto MIBTmp = MIRBuilder.buildAnd(
      Dst: SrcTy, Src0: MIBNot, Src1: MIRBuilder.buildAdd(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(Res: SrcTy, Val: Len);
      MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIBCstLen,
                          Src1: MIRBuilder.buildCTLZ(Dst: SrcTy, Src0: MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Rewrite the instruction in place into a CTPOP of the mask.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTPOP));
    MI.getOperand(i: 1).setReg(MIBTmp.getReg(Idx: 0));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(i: 1).getReg();
    LLT Ty = MRI.getType(Reg: SrcReg);
    unsigned Size = Ty.getScalarSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Bail out on irregular type lengths.
    if (Size > 128 || Size % 8 != 0)
      return UnableToLegalize;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Res: Ty, Val: 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Dst: Ty, Src0: SrcReg, Src1: C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Res: Ty, Val: B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Dst: Ty, Src0: B2Set1LoTo1Hi, Src1: C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Dst: Ty, Src0: SrcReg, Src1: B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Res: Ty, Val: 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Dst: Ty, Src0: B2Count, Src1: C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Res: Ty, Val: B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Dst: Ty, Src0: B4Set2LoTo2Hi, Src1: C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Dst: Ty, Src0: B2Count, Src1: C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Dst: Ty, Src0: B4HiB2Count, Src1: B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Res: Ty, Val: 4);
    auto B8HiB4Count = B.buildLShr(Dst: Ty, Src0: B4Count, Src1: C_4);
    auto B8CountDirty4Hi = B.buildAdd(Dst: Ty, Src0: B8HiB4Count, Src1: B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Res: Ty, Val: B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Dst: Ty, Src0: B8CountDirty4Hi, Src1: C_B8Mask4HiTo0);

    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");

    // Avoid the multiply when shift-add is cheaper.
    if (Size == 16 && !Ty.isVector()) {
      // v = (v + (v >> 8)) & 0xFF;
      auto C_8 = B.buildConstant(Res: Ty, Val: 8);
      auto HighSum = B.buildLShr(Dst: Ty, Src0: B8Count, Src1: C_8);
      auto Res = B.buildAdd(Dst: Ty, Src0: B8Count, Src1: HighSum);
      B.buildAnd(Dst: MI.getOperand(i: 0).getReg(), Src0: Res, Src1: B.buildConstant(Res: Ty, Val: 0xFF));
      MI.eraseFromParent();
      return Legalized;
    }

    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Res: Ty, Val: APInt::getSplat(NewLen: Size, V: APInt(8, 0x01)));

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Res: Ty, Val: Size - 8);

    auto IsMulSupported = [this](const LLT Ty) {
      auto Action = LI.getAction(Query: {TargetOpcode::G_MUL, {Ty}}).Action;
      return Action == Legal || Action == WidenScalar || Action == Custom;
    };
    if (IsMulSupported(Ty)) {
      auto ResTmp = B.buildMul(Dst: Ty, Src0: B8Count, Src1: MulMask);
      B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
    } else {
      // No usable multiply: emulate the 0x0101... multiply with a shift-add
      // ladder that accumulates the byte counts into the top byte.
      auto ResTmp = B8Count;
      for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
        auto ShiftC = B.buildConstant(Res: Ty, Val: Shift);
        auto Shl = B.buildShl(Dst: Ty, Src0: ResTmp, Src1: ShiftC);
        ResTmp = B.buildAdd(Dst: Ty, Src0: ResTmp, Src1: Shl);
      }
      B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTLS: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    // ctls(x) -> ctlz(x ^ (x >> (N - 1))) - 1
    auto SignIdxC =
        MIRBuilder.buildConstant(Res: SrcTy, Val: SrcTy.getScalarSizeInBits() - 1);
    auto OneC = MIRBuilder.buildConstant(Res: DstTy, Val: 1);

    // x >> (N-1) broadcasts the sign bit; xor clears all redundant sign bits.
    auto Shr = MIRBuilder.buildAShr(Dst: SrcTy, Src0: SrcReg, Src1: SignIdxC);

    auto Xor = MIRBuilder.buildXor(Dst: SrcTy, Src0: SrcReg, Src1: Shr);
    auto Ctlz = MIRBuilder.buildCTLZ(Dst: DstTy, Src0: Xor);

    MIRBuilder.buildSub(Dst: DstReg, Src0: Ctlz, Src1: OneC);
    MI.eraseFromParent();
    return Legalized;
  }
  }
}
7896
7897// Check that (every element of) Reg is undef or not an exact multiple of BW.
7898static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7899 Register Reg, unsigned BW) {
7900 return matchUnaryPredicate(
7901 MRI, Reg,
7902 Match: [=](const Constant *C) {
7903 // Null constant here means an undef.
7904 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Val: C);
7905 return !CI || CI->getValue().urem(RHS: BW) != 0;
7906 },
7907 /*AllowUndefs*/ true);
7908}
7909
7910LegalizerHelper::LegalizeResult
7911LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7912 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7913 LLT Ty = MRI.getType(Reg: Dst);
7914 LLT ShTy = MRI.getType(Reg: Z);
7915
7916 unsigned BW = Ty.getScalarSizeInBits();
7917
7918 if (!isPowerOf2_32(Value: BW))
7919 return UnableToLegalize;
7920
7921 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7922 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7923
7924 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7925 // fshl X, Y, Z -> fshr X, Y, -Z
7926 // fshr X, Y, Z -> fshl X, Y, -Z
7927 auto Zero = MIRBuilder.buildConstant(Res: ShTy, Val: 0);
7928 Z = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: Z).getReg(Idx: 0);
7929 } else {
7930 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7931 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
7932 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7933 if (IsFSHL) {
7934 Y = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7935 X = MIRBuilder.buildLShr(Dst: Ty, Src0: X, Src1: One).getReg(Idx: 0);
7936 } else {
7937 X = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7938 Y = MIRBuilder.buildShl(Dst: Ty, Src0: Y, Src1: One).getReg(Idx: 0);
7939 }
7940
7941 Z = MIRBuilder.buildNot(Dst: ShTy, Src0: Z).getReg(Idx: 0);
7942 }
7943
7944 MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Dst}, SrcOps: {X, Y, Z});
7945 MI.eraseFromParent();
7946 return Legalized;
7947}
7948
/// Lower a funnel shift into plain shifts and an OR, taking care never to
/// emit a shift by an amount >= the bit width (which would be poison).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  auto [Dst, X, Y, Z] = MI.getFirst4Regs();
  LLT Ty = MRI.getType(Reg: Dst);
  LLT ShTy = MRI.getType(Reg: Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
    ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
    InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: BitWidthC, Src1: ShAmt).getReg(Idx: 0);
    ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: IsFSHL ? ShAmt : InvShAmt).getReg(Idx: 0);
    ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: IsFSHL ? InvShAmt : ShAmt).getReg(Idx: 0);
  } else {
    // Z % BW may be zero, which would make BW - (Z % BW) an invalid shift of
    // BW; split one bit of the shift off statically so the dynamic part is at
    // most BW - 1:
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(Res: ShTy, Val: BW - 1);
    if (isPowerOf2_32(Value: BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: Z, Src1: Mask).getReg(Idx: 0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(Dst: ShTy, Src0: Z);
      InvShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: NotZ, Src1: Mask).getReg(Idx: 0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
      ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
      InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: Mask, Src1: ShAmt).getReg(Idx: 0);
    }

    auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: ShAmt).getReg(Idx: 0);
      auto ShY1 = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: One);
      ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: ShY1, Src1: InvShAmt).getReg(Idx: 0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: One);
      ShX = MIRBuilder.buildShl(Dst: Ty, Src0: ShX1, Src1: InvShAmt).getReg(Idx: 0);
      ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: ShAmt).getReg(Idx: 0);
    }
  }

  // The two halves select disjoint bits, so the OR can be marked disjoint.
  MIRBuilder.buildOr(Dst, Src0: ShX, Src1: ShY, Flags: MachineInstr::Disjoint);
  MI.eraseFromParent();
  return Legalized;
}
8003
/// Lower a funnel shift, preferring to rewrite it as the opposite-direction
/// funnel shift when the target handles that, and otherwise expanding to
/// plain shifts.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
  // These operations approximately do the following (while avoiding undefined
  // shifts by BW):
  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: Dst);
  LLT ShTy = MRI.getType(Reg: MI.getOperand(i: 3).getReg());

  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  // If the reversed opcode would itself just be lowered again, skip straight
  // to the shift expansion.
  // TODO: Use smarter heuristic that accounts for vector legalization.
  if (LI.getAction(Query: {RevOpcode, {Ty, ShTy}}).Action == Lower)
    return lowerFunnelShiftAsShifts(MI);

  // This only works for powers of 2, fallback to shifts if it fails.
  LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
  if (Result == UnableToLegalize)
    return lowerFunnelShiftAsShifts(MI);
  return Result;
}
8027
/// Lower an extension whose per-element step is more than a doubling by
/// inserting an intermediate extension of twice the source element size,
/// splitting, extending the halves, and re-merging.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  LLT DstTy = MRI.getType(Reg: Dst);
  LLT SrcTy = MRI.getType(Reg: Src);

  uint32_t DstTySize = DstTy.getSizeInBits();
  uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
  uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();

  // Only power-of-2 sizes split evenly below.
  if (!isPowerOf2_32(Value: DstTySize) || !isPowerOf2_32(Value: DstTyScalarSize) ||
      !isPowerOf2_32(Value: SrcTyScalarSize))
    return UnableToLegalize;

  // The step between extend is too large, split it by creating an intermediate
  // extend instruction
  if (SrcTyScalarSize * 2 < DstTyScalarSize) {
    LLT MidTy = SrcTy.changeElementSize(NewEltSize: SrcTyScalarSize * 2);
    // If the destination type is illegal, split it into multiple statements
    // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
    auto NewExt = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {MidTy}, SrcOps: {Src});
    // Unmerge the vector
    // NOTE(review): the changeElementCount/divideCoefficientBy calls below
    // presume MidTy/DstTy are vectors with an even element count — confirm
    // callers only reach here with such types.
    LLT EltTy = MidTy.changeElementCount(
      EC: MidTy.getElementCount().divideCoefficientBy(RHS: 2));
    auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: NewExt);

    // ZExt the vectors
    LLT ZExtResTy = DstTy.changeElementCount(
      EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
    auto ZExtRes1 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
                                          SrcOps: {UnmergeSrc.getReg(Idx: 0)});
    auto ZExtRes2 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
                                          SrcOps: {UnmergeSrc.getReg(Idx: 1)});

    // Merge the ending vectors
    MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: {ZExtRes1, ZExtRes2});

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
8069
/// Lower a vector G_TRUNC by splitting the source in two, truncating each
/// half to an intermediate element size, concatenating, and truncating again
/// if the intermediate elements are still wider than the destination's.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
  // MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectiondDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);

  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SrcReg = MI.getOperand(i: 1).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);
  LLT SrcTy = MRI.getType(Reg: SrcReg);

  // Power-of-2 element counts and sizes guarantee the halving below is exact.
  if (DstTy.isVector() && isPowerOf2_32(Value: DstTy.getNumElements()) &&
      isPowerOf2_32(Value: DstTy.getScalarSizeInBits()) &&
      isPowerOf2_32(Value: SrcTy.getNumElements()) &&
      isPowerOf2_32(Value: SrcTy.getScalarSizeInBits())) {
    // Split input type.
    LLT SplitSrcTy = SrcTy.changeElementCount(
      EC: SrcTy.getElementCount().divideCoefficientBy(RHS: 2));

    // First, split the source into two smaller vectors.
    SmallVector<Register, 2> SplitSrcs;
    extractParts(Reg: SrcReg, Ty: SplitSrcTy, NumParts: 2, VRegs&: SplitSrcs, MIRBuilder, MRI);

    // Truncate the splits into intermediate narrower elements.
    // If one halving is not enough, stop at twice the destination element
    // size and recurse via the final trunc below.
    LLT InterTy;
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits() * 2);
    else
      InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits());
    for (Register &Src : SplitSrcs)
      Src = MIRBuilder.buildTrunc(Res: InterTy, Op: Src).getReg(Idx: 0);

    // Combine the new truncates into one vector
    auto Merge = MIRBuilder.buildMergeLikeInstr(
      Res: DstTy.changeElementSize(NewEltSize: InterTy.getScalarSizeInBits()), Ops: SplitSrcs);

    // Truncate the new vector to the final result type
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
    else
      MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));

    MI.eraseFromParent();

    return Legalized;
  }
  return UnableToLegalize;
}
8125
8126LegalizerHelper::LegalizeResult
8127LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
8128 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
8129 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
8130 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
8131 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
8132 auto Neg = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
8133 MIRBuilder.buildInstr(Opc: RevRot, DstOps: {Dst}, SrcOps: {Src, Neg});
8134 MI.eraseFromParent();
8135 return Legalized;
8136}
8137
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  // Lower G_ROTL/G_ROTR. Strategies, in order of preference:
  //   1. The rotate in the opposite direction with a negated amount.
  //   2. A funnel shift (G_FSHL/G_FSHR) with both data operands set to Src.
  //   3. A generic shift/or expansion.
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  // The power-of-2 restriction exists because negating the amount is only
  // equivalent modulo a power-of-2 bit width.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom(Query: {RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(Value: EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it: fshl(x, x, amt) == rotl(x, amt)
  // and fshr(x, x, amt) == rotr(x, amt).
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom(Query: {FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom(Query: {RevFsh, {DstTy, AmtTy}})) {
    // Builds OPC Dst, Src, Src, Amt and deletes the rotate.
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, DstOps: {R1}, SrcOps: {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(Value: EltSizeInBits)) {
      // Only the reverse funnel shift is available: negate the amount
      // (valid modulo a power-of-2 width) and use that instead.
      Amt = MIRBuilder.buildNeg(Dst: DstTy, Src0: Amt).getReg(Idx: 0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  // Generic expansion into two shifts combined with OR.
  auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(Value: EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
    auto ShAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: Amt, Src1: BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
    auto RevAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: NegAmt, Src1: BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, RevAmt}).getReg(Idx: 0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    // The reverse shift is split as "x >> 1 >> (w - 1 - ...)" so the total
    // shift can reach w bits without any single shift amount being
    // out of range.
    auto BitWidthC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(Dst: AmtTy, Src0: Amt, Src1: BitWidthC);
    ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
    auto RevAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: BitWidthMinusOneC, Src1: ShAmt);
    auto One = MIRBuilder.buildConstant(Res: AmtTy, Val: 1);
    auto Inner = MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Inner, RevAmt}).getReg(Idx: 0);
  }
  // The two partial shifts cover disjoint bit ranges.
  MIRBuilder.buildOr(Dst, Src0: ShVal, Src1: RevShiftVal, Flags: MachineInstr::Disjoint);
  MI.eraseFromParent();
  return Legalized;
}
8204
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S1 = LLT::scalar(SizeInBits: 1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // The MIR below is a direct transcription of:
  //
  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(Res: S32, Val: 0);
  auto Zero64 = MIRBuilder.buildConstant(Res: S64, Val: 0);

  // lz = number of leading zero bits of the source.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: S32, Src0: Src);

  // Biased exponent e = 127 + 63 - lz, or 0 when the source is zero.
  auto K = MIRBuilder.buildConstant(Res: S32, Val: 127U + 63U);
  auto Sub = MIRBuilder.buildSub(Dst: S32, Src0: K, Src1: LZ);

  auto NotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: Src, Op1: Zero64);
  auto E = MIRBuilder.buildSelect(Res: S32, Tst: NotZero, Op0: Sub, Op1: Zero32);

  // Normalize: shift the leading 1 up to bit 63, then clear it (it becomes
  // the implicit bit of the float significand).
  auto Mask0 = MIRBuilder.buildConstant(Res: S64, Val: (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(Dst: S64, Src0: Src, Src1: LZ);

  auto U = MIRBuilder.buildAnd(Dst: S64, Src0: ShlLZ, Src1: Mask0);

  // t = the low 40 bits that do not fit into the 23-bit mantissa.
  auto Mask1 = MIRBuilder.buildConstant(Res: S64, Val: 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(Dst: S64, Src0: U, Src1: Mask1);

  // v = (e << 23) | top 23 significand bits.
  auto UShl = MIRBuilder.buildLShr(Dst: S64, Src0: U, Src1: MIRBuilder.buildConstant(Res: S64, Val: 40));
  auto ShlE = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 23));
  auto V = MIRBuilder.buildOr(Dst: S32, Src0: ShlE, Src1: MIRBuilder.buildTrunc(Res: S32, Op: UShl));

  // Round to nearest, ties to even: r = 1 if the discarded bits exceed the
  // halfway point (0x80'0000'0000), the low kept bit if exactly halfway,
  // otherwise 0.
  auto C = MIRBuilder.buildConstant(Res: S64, Val: 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: S1, Op0: T, Op1: C);
  auto TCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: T, Op1: C);
  auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);

  auto VTrunc1 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: One);
  auto Select0 = MIRBuilder.buildSelect(Res: S32, Tst: TCmp, Op0: VTrunc1, Op1: Zero32);
  auto R = MIRBuilder.buildSelect(Res: S32, Tst: RCmp, Op0: One, Op1: Select0);
  // v + r is the bit pattern of the rounded IEEE-754 single.
  MIRBuilder.buildAdd(Dst, Src0: V, Src1: R);

  MI.eraseFromParent();
  return Legalized;
}
8262
8263// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
8264// operations and G_SITOFP
8265LegalizerHelper::LegalizeResult
8266LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
8267 auto [Dst, Src] = MI.getFirst2Regs();
8268 const LLT S64 = LLT::scalar(SizeInBits: 64);
8269 const LLT S32 = LLT::scalar(SizeInBits: 32);
8270 const LLT S1 = LLT::scalar(SizeInBits: 1);
8271
8272 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
8273
8274 // For i64 < INT_MAX we simply reuse SITOFP.
8275 // Otherwise, divide i64 by 2, round result by ORing with the lowest bit
8276 // saved before division, convert to float by SITOFP, multiply the result
8277 // by 2.
8278 auto One = MIRBuilder.buildConstant(Res: S64, Val: 1);
8279 auto Zero = MIRBuilder.buildConstant(Res: S64, Val: 0);
8280 // Result if Src < INT_MAX
8281 auto SmallResult = MIRBuilder.buildSITOFP(Dst: S32, Src0: Src);
8282 // Result if Src >= INT_MAX
8283 auto Halved = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: One);
8284 auto LowerBit = MIRBuilder.buildAnd(Dst: S64, Src0: Src, Src1: One);
8285 auto RoundedHalved = MIRBuilder.buildOr(Dst: S64, Src0: Halved, Src1: LowerBit);
8286 auto HalvedFP = MIRBuilder.buildSITOFP(Dst: S32, Src0: RoundedHalved);
8287 auto LargeResult = MIRBuilder.buildFAdd(Dst: S32, Src0: HalvedFP, Src1: HalvedFP);
8288 // Check if the original value is larger than INT_MAX by comparing with
8289 // zero to pick one of the two conversions.
8290 auto IsLarge =
8291 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_SLT, Res: S1, Op0: Src, Op1: Zero);
8292 MIRBuilder.buildSelect(Res: Dst, Tst: IsLarge, Op0: LargeResult, Op1: SmallResult);
8293
8294 MI.eraseFromParent();
8295 return Legalized;
8296}
8297
8298// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
8299// IEEE double representation.
8300LegalizerHelper::LegalizeResult
8301LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
8302 auto [Dst, Src] = MI.getFirst2Regs();
8303 const LLT S64 = LLT::scalar(SizeInBits: 64);
8304 const LLT S32 = LLT::scalar(SizeInBits: 32);
8305
8306 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
8307
8308 // We create double value from 32 bit parts with 32 exponent difference.
8309 // Note that + and - are float operations that adjust the implicit leading
8310 // one, the bases 2^52 and 2^84 are for illustrative purposes.
8311 //
8312 // X = 2^52 * 1.0...LowBits
8313 // Y = 2^84 * 1.0...HighBits
8314 // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
8315 // = - 2^52 * 1.0...HighBits
8316 // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
8317 auto TwoP52 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4330000000000000));
8318 auto TwoP84 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4530000000000000));
8319 auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
8320 auto TwoP52P84FP = MIRBuilder.buildFConstant(Res: S64, Val: TwoP52P84);
8321 auto HalfWidth = MIRBuilder.buildConstant(Res: S64, Val: 32);
8322
8323 auto LowBits = MIRBuilder.buildTrunc(Res: S32, Op: Src);
8324 LowBits = MIRBuilder.buildZExt(Res: S64, Op: LowBits);
8325 auto LowBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP52, Src1: LowBits);
8326 auto HighBits = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: HalfWidth);
8327 auto HighBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP84, Src1: HighBits);
8328 auto Scratch = MIRBuilder.buildFSub(Dst: S64, Src0: HighBitsFP, Src1: TwoP52P84FP);
8329 MIRBuilder.buildFAdd(Dst, Src0: Scratch, Src1: LowBitsFP);
8330
8331 MI.eraseFromParent();
8332 return Legalized;
8333}
8334
8335/// i64->fp16 itofp can be lowered to i64->f64,f64->f32,f32->f16. We cannot
8336/// convert fpround f64->f16 without double-rounding, so we manually perform the
8337/// lowering here where we know it is valid.
8338static LegalizerHelper::LegalizeResult
8339loweri64tof16ITOFP(MachineInstr &MI, Register Dst, LLT DstTy, Register Src,
8340 LLT SrcTy, MachineIRBuilder &MIRBuilder) {
8341 auto DstFpTy =
8342 SrcTy.changeElementType(NewEltTy: LLT::floatIEEE(SizeInBits: SrcTy.getScalarSizeInBits()));
8343 auto M1 = MI.getOpcode() == TargetOpcode::G_UITOFP
8344 ? MIRBuilder.buildUITOFP(Dst: DstFpTy, Src0: Src)
8345 : MIRBuilder.buildSITOFP(Dst: DstFpTy, Src0: Src);
8346 LLT F32Ty = DstFpTy.changeElementSize(NewEltSize: 32);
8347 auto M2 = MIRBuilder.buildFPTrunc(Res: F32Ty, Op: M1);
8348 MIRBuilder.buildFPTrunc(Res: Dst, Op: M2);
8349 MI.eraseFromParent();
8350 return LegalizerHelper::Legalized;
8351}
8352
8353LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
8354 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8355
8356 if (SrcTy == LLT::scalar(SizeInBits: 1)) {
8357 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: 1.0);
8358 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
8359 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
8360 MI.eraseFromParent();
8361 return Legalized;
8362 }
8363
8364 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
8365 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
8366
8367 if (SrcTy != LLT::scalar(SizeInBits: 64))
8368 return UnableToLegalize;
8369
8370 if (DstTy == LLT::scalar(SizeInBits: 32))
8371 // TODO: SelectionDAG has several alternative expansions to port which may
8372 // be more reasonable depending on the available instructions. We also need
8373 // a more advanced mechanism to choose an optimal version depending on
8374 // target features such as sitofp or CTLZ availability.
8375 return lowerU64ToF32WithSITOFP(MI);
8376
8377 if (DstTy == LLT::scalar(SizeInBits: 64))
8378 return lowerU64ToF64BitFloatOps(MI);
8379
8380 return UnableToLegalize;
8381}
8382
8383LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
8384 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8385
8386 const LLT I64 = LLT::integer(SizeInBits: 64);
8387 const LLT I32 = LLT::integer(SizeInBits: 32);
8388 const LLT I1 = LLT::integer(SizeInBits: 1);
8389
8390 if (SrcTy == I1) {
8391 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: -1.0);
8392 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
8393 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
8394 MI.eraseFromParent();
8395 return Legalized;
8396 }
8397
8398 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
8399 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
8400
8401 if (SrcTy != I64)
8402 return UnableToLegalize;
8403
8404 if (DstTy.getScalarSizeInBits() == 32) {
8405 // signed cl2f(long l) {
8406 // long s = l >> 63;
8407 // float r = cul2f((l + s) ^ s);
8408 // return s ? -r : r;
8409 // }
8410 Register L = Src;
8411 auto SignBit = MIRBuilder.buildConstant(Res: I64, Val: 63);
8412 auto S = MIRBuilder.buildAShr(Dst: I64, Src0: L, Src1: SignBit);
8413
8414 auto LPlusS = MIRBuilder.buildAdd(Dst: I64, Src0: L, Src1: S);
8415 auto Xor = MIRBuilder.buildXor(Dst: I64, Src0: LPlusS, Src1: S);
8416 auto R = MIRBuilder.buildUITOFP(Dst: I32, Src0: Xor);
8417
8418 auto RNeg = MIRBuilder.buildFNeg(Dst: I32, Src0: R);
8419 auto SignNotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: I1, Op0: S,
8420 Op1: MIRBuilder.buildConstant(Res: I64, Val: 0));
8421 MIRBuilder.buildSelect(Res: Dst, Tst: SignNotZero, Op0: RNeg, Op1: R);
8422 MI.eraseFromParent();
8423 return Legalized;
8424 }
8425
8426 return UnableToLegalize;
8427}
8428
8429LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
8430 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8431 const LLT S64 = LLT::scalar(SizeInBits: 64);
8432 const LLT S32 = LLT::scalar(SizeInBits: 32);
8433
8434 if (SrcTy != S64 && SrcTy != S32)
8435 return UnableToLegalize;
8436 if (DstTy != S32 && DstTy != S64)
8437 return UnableToLegalize;
8438
8439 // FPTOSI gives same result as FPTOUI for positive signed integers.
8440 // FPTOUI needs to deal with fp values that convert to unsigned integers
8441 // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
8442
8443 APInt TwoPExpInt = APInt::getSignMask(BitWidth: DstTy.getSizeInBits());
8444 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
8445 : APFloat::IEEEdouble(),
8446 APInt::getZero(numBits: SrcTy.getSizeInBits()));
8447 TwoPExpFP.convertFromAPInt(Input: TwoPExpInt, IsSigned: false, RM: APFloat::rmNearestTiesToEven);
8448
8449 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src);
8450
8451 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(Res: SrcTy, Val: TwoPExpFP);
8452 // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
8453 // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
8454 MachineInstrBuilder FSub = MIRBuilder.buildFSub(Dst: SrcTy, Src0: Src, Src1: Threshold);
8455 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: FSub);
8456 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(Res: DstTy, Val: TwoPExpInt);
8457 MachineInstrBuilder Res = MIRBuilder.buildXor(Dst: DstTy, Src0: ResLowBits, Src1: ResHighBit);
8458
8459 const LLT S1 = LLT::scalar(SizeInBits: 1);
8460
8461 MachineInstrBuilder FCMP =
8462 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: S1, Op0: Src, Op1: Threshold);
8463 MIRBuilder.buildSelect(Res: Dst, Tst: FCMP, Op0: FPTOSI, Op1: Res);
8464
8465 MI.eraseFromParent();
8466 return Legalized;
8467}
8468
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Biased exponent field: (Src & 0x7F800000) >> 23.
  auto ExponentMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(Res: SrcTy, Val: 23);

  auto AndExpMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(Dst: SrcTy, Src0: AndExpMask, Src1: ExponentLoBit);

  // Sign is 0 for non-negative inputs and all-ones for negative ones:
  // isolate the sign bit, then arithmetic-shift it across the whole word,
  // then sign-extend to the destination width.
  auto SignMask = MIRBuilder.buildConstant(Res: SrcTy,
                                           Val: APInt::getSignMask(BitWidth: SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(Res: SrcTy, Val: SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(Dst: SrcTy, Src0: AndSignMask, Src1: SignLowBit);
  Sign = MIRBuilder.buildSExt(Res: DstTy, Op: Sign);

  // Full significand R: the 23 stored mantissa bits plus the implicit
  // leading one (0x00800000), zero-extended to the destination width.
  auto MantissaMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: MantissaMask);
  auto K = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x00800000);

  auto R = MIRBuilder.buildOr(Dst: SrcTy, Src0: AndMantissaMask, Src1: K);
  R = MIRBuilder.buildZExt(Res: DstTy, Op: R);

  // Unbias the exponent (f32 bias is 127) and compute both possible shift
  // amounts relative to the mantissa width.
  auto Bias = MIRBuilder.buildConstant(Res: SrcTy, Val: 127);
  auto Exponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentBits, Src1: Bias);
  auto SubExponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: Exponent, Src1: ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentLoBit, Src1: Exponent);

  // Scale the significand: shift left when the unbiased exponent exceeds
  // the mantissa width (23), otherwise shift right.
  auto Shl = MIRBuilder.buildShl(Dst: DstTy, Src0: R, Src1: SubExponent);
  auto Srl = MIRBuilder.buildLShr(Dst: DstTy, Src0: R, Src1: ExponentSub);

  const LLT S1 = LLT::scalar(SizeInBits: 1);
  auto CmpGt = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT,
                                    Res: S1, Op0: Exponent, Op1: ExponentLoBit);

  R = MIRBuilder.buildSelect(Res: DstTy, Tst: CmpGt, Op0: Shl, Op1: Srl);

  // Apply the sign via two's complement: (R ^ Sign) - Sign negates R
  // exactly when Sign is all-ones.
  auto XorSign = MIRBuilder.buildXor(Dst: DstTy, Src0: R, Src1: Sign);
  auto Ret = MIRBuilder.buildSub(Dst: DstTy, Src0: XorSign, Src1: Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);

  // A negative unbiased exponent means |Src| < 1, which truncates to 0.
  auto ExponentLt0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT,
                                          Res: S1, Op0: Exponent, Op1: ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
  MIRBuilder.buildSelect(Res: Dst, Tst: ExponentLt0, Op0: ZeroDstTy, Op1: Ret);

  MI.eraseFromParent();
  return Legalized;
}
8532
// Lower G_FPTOSI_SAT/G_FPTOUI_SAT: a saturating float-to-int conversion that
// clamps out-of-range inputs to the destination's min/max and converts NaN
// to 0 (signed) or to MinInt == 0 (unsigned).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
  unsigned SatWidth = DstTy.getScalarSizeInBits();

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(numBits: SatWidth);
    MaxInt = APInt::getSignedMaxValue(numBits: SatWidth);
  } else {
    MinInt = APInt::getMinValue(numBits: SatWidth);
    MaxInt = APInt::getMaxValue(numBits: SatWidth);
  }

  const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
  APFloat MinFloat(Semantics);
  APFloat MaxFloat(Semantics);

  // Conversion is inexact when a bound is not representable in the source
  // float type; that decides which lowering strategy is usable below.
  APFloat::opStatus MinStatus =
      MinFloat.convertFromAPInt(Input: MinInt, IsSigned, RM: APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus =
      MaxFloat.convertFromAPInt(Input: MaxInt, IsSigned, RM: APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
                             !(MaxStatus & APFloat::opStatus::opInexact);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
  // and selects.
  if (AreExactFloatBounds) {
    // Note: the Max*/Min* names follow the clamp being computed, not the
    // constant used, so MaxC holds MinFloat and MinC holds MaxFloat.
    // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
    auto MaxC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat);
    auto MaxP =
        MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::integer(SizeInBits: 1), Op0: Src, Op1: MaxC);
    auto Max = MIRBuilder.buildSelect(Res: SrcTy, Tst: MaxP, Op0: Src, Op1: MaxC);
    // Clamp by MaxFloat from above. NaN cannot occur.
    auto MinC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat);
    auto MinP = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::integer(SizeInBits: 1), Op0: Max,
                                     Op1: MinC, Flags: MachineInstr::FmNoNans);
    auto Min =
        MIRBuilder.buildSelect(Res: SrcTy, Tst: MinP, Op0: Max, Op1: MinC, Flags: MachineInstr::FmNoNans);
    // Convert clamped value to integer. In the unsigned case we're done,
    // because we mapped NaN to MinFloat, which will cast to zero.
    if (!IsSigned) {
      MIRBuilder.buildFPTOUI(Dst, Src0: Min);
      MI.eraseFromParent();
      return Legalized;
    }

    // Otherwise, select 0 if Src is NaN.
    // (FCMP_UNO Src, Src is true exactly when Src is NaN.)
    auto FpToInt = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Min);
    auto IsZero =
        MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO, Res: LLT::integer(SizeInBits: 1), Op0: Src, Op1: Src);
    MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0),
                           Op1: FpToInt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Result of direct conversion. The assumption here is that the operation is
  // non-trapping and it's fine to apply it to an out-of-range value if we
  // select it away later.
  auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src)
                          : MIRBuilder.buildFPTOUI(Dst: DstTy, Src0: Src);

  // If Src ULT MinFloat, select MinInt. In particular, this also selects
  // MinInt if Src is NaN.
  auto ULT = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: LLT::integer(SizeInBits: 1), Op0: Src,
                                  Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat));
  auto Max = MIRBuilder.buildSelect(
      Res: DstTy, Tst: ULT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MinInt), Op1: FpToInt);
  // If Src OGT MaxFloat, select MaxInt.
  auto OGT = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::integer(SizeInBits: 1), Op0: Src,
                                  Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat));

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero.
  if (!IsSigned) {
    MIRBuilder.buildSelect(Res: Dst, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt),
                           Op1: Max);
    MI.eraseFromParent();
    return Legalized;
  }

  // Otherwise, select 0 if Src is NaN.
  auto Min = MIRBuilder.buildSelect(
      Res: DstTy, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt), Op1: Max);
  auto IsZero =
      MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO, Res: LLT::integer(SizeInBits: 1), Op0: Src, Op1: Src);
  MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0), Op1: Min);
  MI.eraseFromParent();
  return Legalized;
}
8629
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  auto [Dst, Src] = MI.getFirst2Regs();
  assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
         MRI.getType(Src).getScalarType() == LLT::scalar(64));

  if (MRI.getType(Reg: Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  // With the afn (approximate functions) flag, double rounding through f32
  // is acceptable; take the cheap two-step path.
  if (MI.getFlag(Flag: MachineInstr::FmAfn)) {
    unsigned Flags = MI.getFlags();
    auto Src32 = MIRBuilder.buildFPTrunc(Res: S32, Op: Src, Flags);
    MIRBuilder.buildFPTrunc(Res: Dst, Op: Src32, Flags);
    MI.eraseFromParent();
    return Legalized;
  }

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  // U = low 32 bits of the f64; UH = high 32 bits (sign, 11 exponent bits,
  // top 20 mantissa bits).
  auto Unmerge = MIRBuilder.buildUnmerge(Res: S32, Op: Src);
  Register U = Unmerge.getReg(Idx: 0);
  Register UH = Unmerge.getReg(Idx: 1);

  auto E = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 20));
  E = MIRBuilder.buildAnd(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
      Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: -ExpBiasf64 + ExpBiasf16));

  // M = the top 11 mantissa bits (bits [19:9] of UH) in bits [11:1]; bit 0
  // is kept clear as the sticky-bit position.
  auto M = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 8));
  M = MIRBuilder.buildAnd(Dst: S32, Src0: M, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0xffe));

  // Sticky bit: set if any of the remaining low mantissa bits (UH[8:0] or
  // all of U) are nonzero.
  auto MaskedSig = MIRBuilder.buildAnd(Dst: S32, Src0: UH,
                                       Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1ff));
  MaskedSig = MIRBuilder.buildOr(Dst: S32, Src0: MaskedSig, Src1: U);

  auto Zero = MIRBuilder.buildConstant(Res: S32, Val: 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: MaskedSig, Op1: Zero);
  auto Lo40Set = MIRBuilder.buildZExt(Res: S32, Op: SigCmpNE0);
  M = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: Lo40Set);

  // I = result for Inf/NaN inputs: 0x7c00 (Inf) with 0x0200 ORed in when any
  // mantissa bit survives, making a quiet NaN.
  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(Res: S32, Val: 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: M, Op1: Zero);
  auto SelectCC = MIRBuilder.buildSelect(Res: S32, Tst: CmpM_NE0, Op0: Bits0x200, Op1: Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(Res: S32, Val: 0x7c00);
  auto I = MIRBuilder.buildOr(Dst: S32, Src0: SelectCC, Src1: Bits0x7c00);

  // Normal-number candidate, with two extra low bits kept for rounding:
  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 12));
  auto N = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: EShl12);

  // Subnormal path: denormalization shift amount
  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
  auto OneSubExp = MIRBuilder.buildSub(Dst: S32, Src0: One, Src1: E);
  auto B = MIRBuilder.buildSMax(Dst: S32, Src0: OneSubExp, Src1: Zero);
  B = MIRBuilder.buildSMin(Dst: S32, Src0: B, Src1: MIRBuilder.buildConstant(Res: S32, Val: 13));

  // Make the implicit leading one explicit (0x1000), then shift right by B,
  // ORing in a sticky bit if any bits were shifted out (D0 != SigSetHigh).
  auto SigSetHigh = MIRBuilder.buildOr(Dst: S32, Src0: M,
                                       Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1000));

  auto D = MIRBuilder.buildLShr(Dst: S32, Src0: SigSetHigh, Src1: B);
  auto D0 = MIRBuilder.buildShl(Dst: S32, Src0: D, Src1: B);

  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1,
                                               Op0: D0, Op1: SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(Res: S32, Op: D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(Dst: S32, Src0: D, Src1: D1);

  // Pick the subnormal encoding when E < 1, the normal one otherwise.
  auto CmpELtOne = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: E, Op1: One);
  auto V = MIRBuilder.buildSelect(Res: S32, Tst: CmpELtOne, Op0: D, Op1: N);

  // Round to nearest even using the low three bits (round bit, plus sticky
  // information): add 1 when the remainder is > 1/2, or == 1/2 and odd.
  auto VLow3 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 7));
  V = MIRBuilder.buildLShr(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: VLow3,
                                       Op1: MIRBuilder.buildConstant(Res: S32, Val: 3));
  auto V0 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: VLow3,
                                       Op1: MIRBuilder.buildConstant(Res: S32, Val: 5));
  auto V1 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Gt5);

  V1 = MIRBuilder.buildOr(Dst: S32, Src0: V0, Src1: V1);
  V = MIRBuilder.buildAdd(Dst: S32, Src0: V, Src1: V1);

  // Exponent overflow (E > 30) saturates to infinity.
  auto CmpEGt30 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1,
                                       Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 30));
  V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt30,
                             Op0: MIRBuilder.buildConstant(Res: S32, Val: 0x7c00), Op1: V);

  // E == 1039 corresponds to a raw f64 exponent of 0x7ff (2047 - 1023 + 15),
  // i.e. the source was Inf or NaN: use the precomputed I.
  auto CmpEGt1039 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1,
                                         Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 1039));
  V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt1039, Op0: I, Op1: V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 16));
  Sign = MIRBuilder.buildAnd(Dst: S32, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(Dst: S32, Src0: Sign, Src1: V);

  MIRBuilder.buildTrunc(Res: Dst, Op: V);
  MI.eraseFromParent();
  return Legalized;
}
8745
8746LegalizerHelper::LegalizeResult
8747LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
8748 auto [DstTy, SrcTy] = MI.getFirst2LLTs();
8749 const LLT S64 = LLT::scalar(SizeInBits: 64);
8750 const LLT S16 = LLT::scalar(SizeInBits: 16);
8751
8752 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
8753 return lowerFPTRUNC_F64_TO_F16(MI);
8754
8755 return UnableToLegalize;
8756}
8757
8758LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
8759 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8760 LLT Ty = MRI.getType(Reg: Dst);
8761
8762 auto CvtSrc1 = MIRBuilder.buildSITOFP(Dst: Ty, Src0: Src1);
8763 MIRBuilder.buildFPow(Dst, Src0, Src1: CvtSrc1, Flags: MI.getFlags());
8764 MI.eraseFromParent();
8765 return Legalized;
8766}
8767
8768LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMODF(MachineInstr &MI) {
8769 auto [DstFrac, DstInt, Src] = MI.getFirst3Regs();
8770 LLT Ty = MRI.getType(Reg: Src);
8771 auto Flags = MI.getFlags();
8772
8773 auto IntPart = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: Src, Flags);
8774 auto FracPart = MIRBuilder.buildFSub(Dst: Ty, Src0: Src, Src1: IntPart, Flags);
8775
8776 Register FracToUse;
8777 if (MI.getFlag(Flag: MachineInstr::FmNoInfs)) {
8778 FracToUse = FracPart.getReg(Idx: 0);
8779 } else {
8780 auto Abs = MIRBuilder.buildFAbs(Dst: Ty, Src0: Src, Flags);
8781 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: Ty.getScalarType());
8782 auto Inf = MIRBuilder.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: Semantics));
8783 auto IsInf = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OEQ,
8784 Res: Ty.changeElementSize(NewEltSize: 1), Op0: Abs, Op1: Inf);
8785 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
8786 auto Select = MIRBuilder.buildSelect(Res: Ty, Tst: IsInf, Op0: Zero, Op1: FracPart);
8787 FracToUse = Select.getReg(Idx: 0);
8788 }
8789
8790 MIRBuilder.buildFCopysign(Dst: DstFrac, Src0: FracToUse, Src1: Src, Flags);
8791 MIRBuilder.buildCopy(Res: DstInt, Op: IntPart.getReg(Idx: 0));
8792
8793 MI.eraseFromParent();
8794 return Legalized;
8795}
8796
8797static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
8798 switch (Opc) {
8799 case TargetOpcode::G_SMIN:
8800 return CmpInst::ICMP_SLT;
8801 case TargetOpcode::G_SMAX:
8802 return CmpInst::ICMP_SGT;
8803 case TargetOpcode::G_UMIN:
8804 return CmpInst::ICMP_ULT;
8805 case TargetOpcode::G_UMAX:
8806 return CmpInst::ICMP_UGT;
8807 default:
8808 llvm_unreachable("not in integer min/max");
8809 }
8810}
8811
8812LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
8813 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8814
8815 const CmpInst::Predicate Pred = minMaxToCompare(Opc: MI.getOpcode());
8816 LLT CmpType = MRI.getType(Reg: Dst).changeElementType(NewEltTy: LLT::integer(SizeInBits: 1));
8817
8818 auto Cmp = MIRBuilder.buildICmp(Pred, Res: CmpType, Op0: Src0, Op1: Src1);
8819 MIRBuilder.buildSelect(Res: Dst, Tst: Cmp, Op0: Src0, Op1: Src1);
8820
8821 MI.eraseFromParent();
8822 return Legalized;
8823}
8824
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
  // Lower G_SCMP/G_UCMP (three-way compare producing -1/0/1) with either
  // nested selects or a subtraction of extended compare results, depending on
  // the target's boolean-contents preference.
  GSUCmp *Cmp = cast<GSUCmp>(Val: &MI);

  Register Dst = Cmp->getReg(Idx: 0);
  LLT DstTy = MRI.getType(Reg: Dst);
  LLT SrcTy = MRI.getType(Reg: Cmp->getReg(Idx: 1));
  LLT CmpTy = DstTy.changeElementSize(NewEltSize: 1);

  // The signed/unsigned variant picks the predicates.
  CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SLT
                                       : CmpInst::Predicate::ICMP_ULT;
  CmpInst::Predicate GTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SGT
                                       : CmpInst::Predicate::ICMP_UGT;

  auto Zero = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
  auto IsGT = MIRBuilder.buildICmp(Pred: GTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
                                   Op1: Cmp->getRHSReg());
  auto IsLT = MIRBuilder.buildICmp(Pred: LTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
                                   Op1: Cmp->getRHSReg());

  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
  auto BC = TLI.getBooleanContents(isVec: DstTy.isVector(), /*isFP=*/isFloat: false);
  if (TLI.preferSelectsOverBooleanArithmetic(
          VT: getApproximateEVTForLLT(Ty: SrcTy, Ctx)) ||
      BC == TargetLowering::UndefinedBooleanContent) {
    // Select form: IsLT ? -1 : (IsGT ? 1 : 0).
    auto One = MIRBuilder.buildConstant(Res: DstTy, Val: 1);
    auto SelectZeroOrOne = MIRBuilder.buildSelect(Res: DstTy, Tst: IsGT, Op0: One, Op1: Zero);

    auto MinusOne = MIRBuilder.buildConstant(Res: DstTy, Val: -1);
    MIRBuilder.buildSelect(Res: Dst, Tst: IsLT, Op0: MinusOne, Op1: SelectZeroOrOne);
  } else {
    // Arithmetic form: ext(IsGT) - ext(IsLT). With zero-or-negative-one
    // booleans each extended value is negated, so swap the operands to keep
    // the sign of the difference correct.
    if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
      std::swap(a&: IsGT, b&: IsLT);
    // Extend boolean results to DstTy, which is at least i2, before subtracting
    // them.
    unsigned BoolExtOp =
        MIRBuilder.getBoolExtOp(IsVec: DstTy.isVector(), /*isFP=*/IsFP: false);
    IsGT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsGT});
    IsLT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsLT});
    MIRBuilder.buildSub(Dst, Src0: IsGT, Src1: IsLT);
  }

  MI.eraseFromParent();
  return Legalized;
}
8872
8873LegalizerHelper::LegalizeResult
8874LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
8875 auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
8876 const int Src0Size = Src0Ty.getScalarSizeInBits();
8877 const int Src1Size = Src1Ty.getScalarSizeInBits();
8878
8879 LLT DstIntTy =
8880 DstTy.changeElementType(NewEltTy: LLT::integer(SizeInBits: DstTy.getScalarSizeInBits()));
8881 LLT Src0IntTy = Src0Ty.changeElementType(NewEltTy: LLT::integer(SizeInBits: Src0Size));
8882 LLT Src1IntTy = Src1Ty.changeElementType(NewEltTy: LLT::integer(SizeInBits: Src1Size));
8883
8884 Register Src0Int = Src0;
8885 Register Src1Int = Src1;
8886
8887 if (!(Src0Ty.getScalarType().isAnyScalar() ||
8888 Src0Ty.getScalarType().isInteger()))
8889 Src0Int = MIRBuilder.buildBitcast(Dst: Src0IntTy, Src: Src0).getReg(Idx: 0);
8890
8891 if (!(Src1Ty.getScalarType().isAnyScalar() ||
8892 Src1Ty.getScalarType().isInteger()))
8893 Src1Int = MIRBuilder.buildBitcast(Dst: Src1IntTy, Src: Src1).getReg(Idx: 0);
8894
8895 auto SignBitMask =
8896 MIRBuilder.buildConstant(Res: Src0IntTy, Val: APInt::getSignMask(BitWidth: Src0Size));
8897
8898 auto NotSignBitMask = MIRBuilder.buildConstant(
8899 Res: Src0IntTy, Val: APInt::getLowBitsSet(numBits: Src0Size, loBitsSet: Src0Size - 1));
8900
8901 Register And0 =
8902 MIRBuilder.buildAnd(Dst: Src0IntTy, Src0: Src0Int, Src1: NotSignBitMask).getReg(Idx: 0);
8903 Register And1;
8904 if (Src0Ty == Src1Ty) {
8905 And1 = MIRBuilder.buildAnd(Dst: Src1IntTy, Src0: Src1Int, Src1: SignBitMask).getReg(Idx: 0);
8906 } else if (Src0Size > Src1Size) {
8907 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src0IntTy, Val: Src0Size - Src1Size);
8908 auto Zext = MIRBuilder.buildZExt(Res: Src0IntTy, Op: Src1Int);
8909 auto Shift = MIRBuilder.buildShl(Dst: Src0IntTy, Src0: Zext, Src1: ShiftAmt);
8910 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Shift, Src1: SignBitMask).getReg(Idx: 0);
8911 } else {
8912 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src1IntTy, Val: Src1Size - Src0Size);
8913 auto Shift = MIRBuilder.buildLShr(Dst: Src1IntTy, Src0: Src1Int, Src1: ShiftAmt);
8914 auto Trunc = MIRBuilder.buildTrunc(Res: Src0IntTy, Op: Shift);
8915 And1 = MIRBuilder.buildAnd(Dst: Src0IntTy, Src0: Trunc, Src1: SignBitMask).getReg(Idx: 0);
8916 }
8917
8918 // Be careful about setting nsz/nnan/ninf on every instruction, since the
8919 // constants are a nan and -0.0, but the final result should preserve
8920 // everything.
8921 unsigned Flags = MI.getFlags();
8922
8923 // We masked the sign bit and the not-sign bit, so these are disjoint.
8924 Flags |= MachineInstr::Disjoint;
8925
8926 if (DstTy == DstIntTy)
8927 MIRBuilder.buildOr(Dst, Src0: And0, Src1: And1, Flags).getReg(Idx: 0);
8928 else {
8929 Register NewDst = MIRBuilder.buildOr(Dst: DstIntTy, Src0: And0, Src1: And1, Flags).getReg(Idx: 0);
8930 MIRBuilder.buildBitcast(Dst, Src: NewDst);
8931 }
8932
8933 MI.eraseFromParent();
8934 return Legalized;
8935}
8936
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  // Lower G_FMINNUM/G_FMAXNUM to their _IEEE forms (quieting possible sNaN
  // inputs first), and G_FMINIMUMNUM/G_FMAXIMUMNUM to plain fminnum/fmaxnum.
  // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
  // identical handling. fminimumnum/fmaximumnum also need a path that do not
  // depend on fminnum/fmaxnum.

  unsigned NewOp;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_FMINNUM:
    NewOp = TargetOpcode::G_FMINNUM_IEEE;
    break;
  case TargetOpcode::G_FMINIMUMNUM:
    NewOp = TargetOpcode::G_FMINNUM;
    break;
  case TargetOpcode::G_FMAXNUM:
    NewOp = TargetOpcode::G_FMAXNUM_IEEE;
    break;
  case TargetOpcode::G_FMAXIMUMNUM:
    NewOp = TargetOpcode::G_FMAXNUM;
    break;
  default:
    llvm_unreachable("unexpected min/max opcode");
  }

  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Reg: Dst);

  if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.

    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicate quiet-snan instruction as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Val: Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0, Flags: MI.getFlags()).getReg(Idx: 0);

    if (!isKnownNeverSNaN(Val: Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0: Src1, Flags: MI.getFlags()).getReg(Idx: 0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(Opc: NewOp, DstOps: {Dst}, SrcOps: {Src0, Src1}, Flags: MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}
8984
8985LegalizerHelper::LegalizeResult
8986LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
8987 unsigned Opc = MI.getOpcode();
8988 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8989 LLT Ty = MRI.getType(Reg: Dst);
8990 LLT CmpTy = Ty.changeElementSize(NewEltSize: 1);
8991
8992 bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
8993 unsigned OpcIeee =
8994 IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
8995 unsigned OpcNonIeee =
8996 IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
8997 bool MinMaxMustRespectOrderedZero = false;
8998 Register Res;
8999
9000 // IEEE variants don't need canonicalization
9001 if (LI.isLegalOrCustom(Query: {OpcIeee, Ty})) {
9002 Res = MIRBuilder.buildInstr(Opc: OpcIeee, DstOps: {Ty}, SrcOps: {Src0, Src1}).getReg(Idx: 0);
9003 MinMaxMustRespectOrderedZero = true;
9004 } else if (LI.isLegalOrCustom(Query: {OpcNonIeee, Ty})) {
9005 Res = MIRBuilder.buildInstr(Opc: OpcNonIeee, DstOps: {Ty}, SrcOps: {Src0, Src1}).getReg(Idx: 0);
9006 } else {
9007 auto Compare = MIRBuilder.buildFCmp(
9008 Pred: IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, Res: CmpTy, Op0: Src0, Op1: Src1);
9009 Res = MIRBuilder.buildSelect(Res: Ty, Tst: Compare, Op0: Src0, Op1: Src1).getReg(Idx: 0);
9010 }
9011
9012 // Propagate any NaN of both operands
9013 if (!MI.getFlag(Flag: MachineInstr::FmNoNans) &&
9014 (!isKnownNeverNaN(Val: Src0, MRI) || isKnownNeverNaN(Val: Src1, MRI))) {
9015 auto IsOrdered = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: CmpTy, Op0: Src0, Op1: Src1);
9016
9017 LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
9018 APFloat NaNValue = APFloat::getNaN(Sem: getFltSemanticForLLT(Ty: ElementTy));
9019 Register NaN = MIRBuilder.buildFConstant(Res: ElementTy, Val: NaNValue).getReg(Idx: 0);
9020 if (Ty.isVector())
9021 NaN = MIRBuilder.buildSplatBuildVector(Res: Ty, Src: NaN).getReg(Idx: 0);
9022
9023 Res = MIRBuilder.buildSelect(Res: Ty, Tst: IsOrdered, Op0: Res, Op1: NaN).getReg(Idx: 0);
9024 }
9025
9026 // fminimum/fmaximum requires -0.0 less than +0.0
9027 if (!MinMaxMustRespectOrderedZero && !MI.getFlag(Flag: MachineInstr::FmNsz)) {
9028 GISelValueTracking VT(MIRBuilder.getMF());
9029 KnownFPClass Src0Info = VT.computeKnownFPClass(R: Src0, InterestedClasses: fcZero);
9030 KnownFPClass Src1Info = VT.computeKnownFPClass(R: Src1, InterestedClasses: fcZero);
9031
9032 if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
9033 const unsigned Flags = MI.getFlags();
9034 Register Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0).getReg(Idx: 0);
9035 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OEQ, Res: CmpTy, Op0: Res, Op1: Zero);
9036
9037 unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
9038
9039 auto LHSTestZero = MIRBuilder.buildIsFPClass(Res: CmpTy, Src: Src0, Mask: TestClass);
9040 auto LHSSelect =
9041 MIRBuilder.buildSelect(Res: Ty, Tst: LHSTestZero, Op0: Src0, Op1: Res, Flags);
9042
9043 auto RHSTestZero = MIRBuilder.buildIsFPClass(Res: CmpTy, Src: Src1, Mask: TestClass);
9044 auto RHSSelect =
9045 MIRBuilder.buildSelect(Res: Ty, Tst: RHSTestZero, Op0: Src1, Op1: LHSSelect, Flags);
9046
9047 Res = MIRBuilder.buildSelect(Res: Ty, Tst: IsZero, Op0: RHSSelect, Op1: Res, Flags).getReg(Idx: 0);
9048 }
9049 }
9050
9051 MIRBuilder.buildCopy(Res: Dst, Op: Res);
9052 MI.eraseFromParent();
9053 return Legalized;
9054}
9055
9056LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
9057 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
9058 Register DstReg = MI.getOperand(i: 0).getReg();
9059 LLT Ty = MRI.getType(Reg: DstReg);
9060 unsigned Flags = MI.getFlags();
9061
9062 auto Mul = MIRBuilder.buildFMul(Dst: Ty, Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2),
9063 Flags);
9064 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Mul, Src1: MI.getOperand(i: 3), Flags);
9065 MI.eraseFromParent();
9066 return Legalized;
9067}
9068
9069LegalizerHelper::LegalizeResult
9070LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
9071 auto [DstReg, X] = MI.getFirst2Regs();
9072 const unsigned Flags = MI.getFlags();
9073 const LLT Ty = MRI.getType(Reg: DstReg);
9074 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
9075
9076 // round(x) =>
9077 // t = trunc(x);
9078 // d = fabs(x - t);
9079 // o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
9080 // return t + o;
9081
9082 auto T = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: X, Flags);
9083
9084 auto Diff = MIRBuilder.buildFSub(Dst: Ty, Src0: X, Src1: T, Flags);
9085 auto AbsDiff = MIRBuilder.buildFAbs(Dst: Ty, Src0: Diff, Flags);
9086
9087 auto Half = MIRBuilder.buildFConstant(Res: Ty, Val: 0.5);
9088 auto Cmp =
9089 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGE, Res: CondTy, Op0: AbsDiff, Op1: Half, Flags);
9090
9091 // Could emit G_UITOFP instead
9092 auto One = MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
9093 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
9094 auto BoolFP = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: One, Op1: Zero);
9095 auto SignedOffset = MIRBuilder.buildFCopysign(Dst: Ty, Src0: BoolFP, Src1: X);
9096
9097 MIRBuilder.buildFAdd(Dst: DstReg, Src0: T, Src1: SignedOffset, Flags);
9098
9099 MI.eraseFromParent();
9100 return Legalized;
9101}
9102
9103LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
9104 auto [DstReg, SrcReg] = MI.getFirst2Regs();
9105 unsigned Flags = MI.getFlags();
9106 LLT Ty = MRI.getType(Reg: DstReg);
9107 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
9108
9109 // result = trunc(src);
9110 // if (src < 0.0 && src != result)
9111 // result += -1.0.
9112
9113 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: SrcReg, Flags);
9114 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
9115
9116 auto Lt0 = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: CondTy,
9117 Op0: SrcReg, Op1: Zero, Flags);
9118 auto NeTrunc = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: CondTy,
9119 Op0: SrcReg, Op1: Trunc, Flags);
9120 auto And = MIRBuilder.buildAnd(Dst: CondTy, Src0: Lt0, Src1: NeTrunc);
9121 auto AddVal = MIRBuilder.buildSITOFP(Dst: Ty, Src0: And);
9122
9123 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Trunc, Src1: AddVal, Flags);
9124 MI.eraseFromParent();
9125 return Legalized;
9126}
9127
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  // Lower G_MERGE_VALUES by zero-extending each source part into one wide
  // scalar and OR-ing the parts in at increasing bit offsets.
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  unsigned PartSize = Src0Ty.getSizeInBits();

  LLT WideTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
  // Seed the accumulator with the lowest part.
  Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src0Reg).getReg(Idx: 0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(i: I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);

    // On the final part, write straight into DstReg when the wide scalar is
    // already the destination type (non-pointer case).
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(Ty: WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
    auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
    MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // Pointer destinations need a final int-to-ptr cast, which is only valid
    // in integral address spaces.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          AddrSpace: DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
9165
9166LegalizerHelper::LegalizeResult
9167LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
9168 const unsigned NumDst = MI.getNumOperands() - 1;
9169 Register SrcReg = MI.getOperand(i: NumDst).getReg();
9170 Register Dst0Reg = MI.getOperand(i: 0).getReg();
9171 LLT DstTy = MRI.getType(Reg: Dst0Reg);
9172 if (DstTy.isPointer())
9173 return UnableToLegalize; // TODO
9174
9175 SrcReg = coerceToScalar(Val: SrcReg);
9176 if (!SrcReg)
9177 return UnableToLegalize;
9178
9179 // Expand scalarizing unmerge as bitcast to integer and shift.
9180 LLT IntTy = MRI.getType(Reg: SrcReg);
9181
9182 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
9183
9184 const unsigned DstSize = DstTy.getSizeInBits();
9185 unsigned Offset = DstSize;
9186 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
9187 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntTy, Val: Offset);
9188 auto Shift = MIRBuilder.buildLShr(Dst: IntTy, Src0: SrcReg, Src1: ShiftAmt);
9189 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shift);
9190 }
9191
9192 MI.eraseFromParent();
9193 return Legalized;
9194}
9195
9196/// Lower a vector extract or insert by writing the vector to a stack temporary
9197/// and reloading the element or vector.
9198///
9199/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
9200/// =>
9201/// %stack_temp = G_FRAME_INDEX
9202/// G_STORE %vec, %stack_temp
9203/// %idx = clamp(%idx, %vec.getNumElements())
9204/// %element_ptr = G_PTR_ADD %stack_temp, %idx
9205/// %dst = G_LOAD %element_ptr
9206LegalizerHelper::LegalizeResult
9207LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
9208 Register DstReg = MI.getOperand(i: 0).getReg();
9209 Register SrcVec = MI.getOperand(i: 1).getReg();
9210 Register InsertVal;
9211 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
9212 InsertVal = MI.getOperand(i: 2).getReg();
9213
9214 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
9215
9216 LLT VecTy = MRI.getType(Reg: SrcVec);
9217 LLT EltTy = VecTy.getElementType();
9218 unsigned NumElts = VecTy.getNumElements();
9219
9220 int64_t IdxVal;
9221 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal)) && IdxVal <= NumElts) {
9222 SmallVector<Register, 8> SrcRegs;
9223 extractParts(Reg: SrcVec, Ty: EltTy, NumParts: NumElts, VRegs&: SrcRegs, MIRBuilder, MRI);
9224
9225 if (InsertVal) {
9226 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
9227 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcRegs);
9228 } else {
9229 MIRBuilder.buildCopy(Res: DstReg, Op: SrcRegs[IdxVal]);
9230 }
9231
9232 MI.eraseFromParent();
9233 return Legalized;
9234 }
9235
9236 if (!EltTy.isByteSized()) { // Not implemented.
9237 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
9238 return UnableToLegalize;
9239 }
9240
9241 unsigned EltBytes = EltTy.getSizeInBytes();
9242 Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
9243 Align EltAlign;
9244
9245 MachinePointerInfo PtrInfo;
9246 auto StackTemp = createStackTemporary(
9247 Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign, PtrInfo);
9248 MIRBuilder.buildStore(Val: SrcVec, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
9249
9250 // Get the pointer to the element, and be sure not to hit undefined behavior
9251 // if the index is out of bounds.
9252 Register EltPtr = getVectorElementPointer(VecPtr: StackTemp.getReg(Idx: 0), VecTy, Index: Idx);
9253
9254 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal))) {
9255 int64_t Offset = IdxVal * EltBytes;
9256 PtrInfo = PtrInfo.getWithOffset(O: Offset);
9257 EltAlign = commonAlignment(A: VecAlign, Offset);
9258 } else {
9259 // We lose information with a variable offset.
9260 EltAlign = getStackTemporaryAlignment(Ty: EltTy);
9261 PtrInfo = MachinePointerInfo(MRI.getType(Reg: EltPtr).getAddressSpace());
9262 }
9263
9264 if (InsertVal) {
9265 // Write the inserted element
9266 MIRBuilder.buildStore(Val: InsertVal, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
9267
9268 // Reload the whole vector.
9269 MIRBuilder.buildLoad(Res: DstReg, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
9270 } else {
9271 MIRBuilder.buildLoad(Res: DstReg, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
9272 }
9273
9274 MI.eraseFromParent();
9275 return Legalized;
9276}
9277
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  // Lower G_SHUFFLE_VECTOR into per-lane extracts from the two source vectors
  // followed by a G_BUILD_VECTOR of the results.
  auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
      MI.getFirst3RegLLTs();
  LLT IdxTy = LLT::scalar(SizeInBits: 32);

  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getScalarType();

  // One extract per distinct mask index, reused when a lane repeats.
  DenseMap<unsigned, Register> CachedExtract;

  for (int Idx : Mask) {
    if (Idx < 0) {
      // Negative mask entries are "don't care"; share a single undef value.
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0);
      BuildVec.push_back(Elt: Undef);
      continue;
    }

    assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR");

    // Mask indices [0, NumElts) address Src0; [NumElts, 2*NumElts) address
    // Src1. The cache key is the combined index, so it distinguishes sources.
    int NumElts = Src0Ty.getNumElements();
    Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
    int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
    auto [It, Inserted] = CachedExtract.try_emplace(Key: Idx);
    if (Inserted) {
      auto IdxK = MIRBuilder.buildConstant(Res: IdxTy, Val: ExtractIdx);
      It->second =
          MIRBuilder.buildExtractVectorElement(Res: EltTy, Val: SrcVec, Idx: IdxK).getReg(Idx: 0);
    }
    BuildVec.push_back(Elt: It->second);
  }

  assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR");
  MIRBuilder.buildBuildVector(Res: DstReg, Ops: BuildVec);
  MI.eraseFromParent();
  return Legalized;
}
9318
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
  // Lower G_VECTOR_COMPRESS via a stack temporary: pre-fill with the optional
  // passthru, then walk the source vector appending each element whose mask
  // bit is set, and finally reload the packed vector.
  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
      MI.getFirst4RegLLTs();

  if (VecTy.isScalableVector())
    report_fatal_error(reason: "Cannot expand masked_compress for scalable vectors.");

  Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
  MachinePointerInfo PtrInfo;
  Register StackPtr =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign,
                           PtrInfo)
          .getReg(Idx: 0);
  MachinePointerInfo ValPtrInfo =
      MachinePointerInfo::getUnknownStack(MF&: *MI.getMF());

  LLT IdxTy = LLT::scalar(SizeInBits: 32);
  LLT ValTy = VecTy.getElementType();
  Align ValAlign = getStackTemporaryAlignment(Ty: ValTy);

  // Running write position within the stack slot.
  auto OutPos = MIRBuilder.buildConstant(Res: IdxTy, Val: 0);

  bool HasPassthru =
      MRI.getVRegDef(Reg: Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;

  // Pre-fill the slot so unselected tail lanes keep the passthru values.
  if (HasPassthru)
    MIRBuilder.buildStore(Val: Passthru, Addr: StackPtr, PtrInfo, Alignment: VecAlign);

  Register LastWriteVal;
  std::optional<APInt> PassthruSplatVal =
      isConstantOrConstantSplatVector(MI&: *MRI.getVRegDef(Reg: Passthru), MRI);

  if (PassthruSplatVal.has_value()) {
    // Splat passthru: the back-fill value for the final lane is a constant.
    LastWriteVal =
        MIRBuilder.buildConstant(Res: ValTy, Val: PassthruSplatVal.value()).getReg(Idx: 0);
  } else if (HasPassthru) {
    // Otherwise, load the passthru element at the position where the final
    // write will land (the popcount of the mask).
    auto Popcount = MIRBuilder.buildZExt(Res: MaskTy.changeElementSize(NewEltSize: 32), Op: Mask);
    Popcount = MIRBuilder.buildInstr(Opc: TargetOpcode::G_VECREDUCE_ADD,
                                     DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {Popcount});

    Register LastElmtPtr =
        getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: Popcount.getReg(Idx: 0));
    LastWriteVal =
        MIRBuilder.buildLoad(Res: ValTy, Addr: LastElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign)
            .getReg(Idx: 0);
  }

  unsigned NumElmts = VecTy.getNumElements();
  for (unsigned I = 0; I < NumElmts; ++I) {
    auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
    auto Val = MIRBuilder.buildExtractVectorElement(Res: ValTy, Val: Vec, Idx);
    Register ElmtPtr =
        getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));
    // Store unconditionally; only a set mask bit advances OutPos, so an
    // unselected element gets overwritten by the next selected one.
    MIRBuilder.buildStore(Val, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);

    LLT MaskITy = MaskTy.getElementType();
    auto MaskI = MIRBuilder.buildExtractVectorElement(Res: MaskITy, Val: Mask, Idx);
    if (MaskITy.getSizeInBits() > 1)
      MaskI = MIRBuilder.buildTrunc(Res: LLT::scalar(SizeInBits: 1), Op: MaskI);

    // Advance the write position by the mask bit (0 or 1).
    MaskI = MIRBuilder.buildZExt(Res: IdxTy, Op: MaskI);
    OutPos = MIRBuilder.buildAdd(Dst: IdxTy, Src0: OutPos, Src1: MaskI);

    if (HasPassthru && I == NumElmts - 1) {
      // Final lane with a passthru: clamp the write position into range and
      // store either the last element (if every lane was selected, so the
      // store above must not be clobbered) or the saved passthru value.
      auto EndOfVector =
          MIRBuilder.buildConstant(Res: IdxTy, Val: VecTy.getNumElements() - 1);
      auto AllLanesSelected = MIRBuilder.buildICmp(
          Pred: CmpInst::ICMP_UGT, Res: LLT::scalar(SizeInBits: 1), Op0: OutPos, Op1: EndOfVector);
      OutPos = MIRBuilder.buildInstr(Opc: TargetOpcode::G_UMIN, DstOps: {IdxTy},
                                     SrcOps: {OutPos, EndOfVector});
      ElmtPtr = getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));

      LastWriteVal =
          MIRBuilder.buildSelect(Res: ValTy, Tst: AllLanesSelected, Op0: Val, Op1: LastWriteVal)
              .getReg(Idx: 0);
      MIRBuilder.buildStore(Val: LastWriteVal, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);
    }
  }

  // TODO: Use StackPtr's FrameIndex alignment.
  MIRBuilder.buildLoad(Res: Dst, Addr: StackPtr, PtrInfo, Alignment: VecAlign);

  MI.eraseFromParent();
  return Legalized;
}
9405
9406Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
9407 Register AllocSize,
9408 Align Alignment,
9409 LLT PtrTy) {
9410 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
9411
9412 auto SPTmp = MIRBuilder.buildCopy(Res: PtrTy, Op: SPReg);
9413 SPTmp = MIRBuilder.buildCast(Dst: IntPtrTy, Src: SPTmp);
9414
9415 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
9416 // have to generate an extra instruction to negate the alloc and then use
9417 // G_PTR_ADD to add the negative offset.
9418 auto Alloc = MIRBuilder.buildSub(Dst: IntPtrTy, Src0: SPTmp, Src1: AllocSize);
9419 if (Alignment > Align(1)) {
9420 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
9421 AlignMask.negate();
9422 auto AlignCst = MIRBuilder.buildConstant(Res: IntPtrTy, Val: AlignMask);
9423 Alloc = MIRBuilder.buildAnd(Dst: IntPtrTy, Src0: Alloc, Src1: AlignCst);
9424 }
9425
9426 return MIRBuilder.buildCast(Dst: PtrTy, Src: Alloc).getReg(Idx: 0);
9427}
9428
9429LegalizerHelper::LegalizeResult
9430LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
9431 const auto &MF = *MI.getMF();
9432 const auto &TFI = *MF.getSubtarget().getFrameLowering();
9433 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
9434 return UnableToLegalize;
9435
9436 Register Dst = MI.getOperand(i: 0).getReg();
9437 Register AllocSize = MI.getOperand(i: 1).getReg();
9438 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
9439
9440 LLT PtrTy = MRI.getType(Reg: Dst);
9441 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
9442 Register SPTmp =
9443 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
9444
9445 MIRBuilder.buildCopy(Res: SPReg, Op: SPTmp);
9446 MIRBuilder.buildCopy(Res: Dst, Op: SPTmp);
9447
9448 MI.eraseFromParent();
9449 return Legalized;
9450}
9451
9452LegalizerHelper::LegalizeResult
9453LegalizerHelper::lowerStackSave(MachineInstr &MI) {
9454 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
9455 if (!StackPtr)
9456 return UnableToLegalize;
9457
9458 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: StackPtr);
9459 MI.eraseFromParent();
9460 return Legalized;
9461}
9462
9463LegalizerHelper::LegalizeResult
9464LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
9465 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
9466 if (!StackPtr)
9467 return UnableToLegalize;
9468
9469 MIRBuilder.buildCopy(Res: StackPtr, Op: MI.getOperand(i: 0));
9470 MI.eraseFromParent();
9471 return Legalized;
9472}
9473
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  // Lower G_EXTRACT either by unmerging vector elements (when the bit range
  // lines up with element boundaries) or by shift + truncate on a scalar.
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(i: 2).getImm();

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    // Only handled when the extracted range is element-aligned and in range.
    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcTy.getElementType(), Op: SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Elt: Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(Res: DstReg, Op: SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  // The scalar path below casts pointers to integers, which is invalid for
  // non-integral address spaces.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((SrcTy.isPointer() &&
       DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace())) ||
      (DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(AddrSpace: DstTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  if ((DstTy.isScalar() || DstTy.isPointer()) &&
      (SrcTy.isScalar() || SrcTy.isPointer() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    // View the source as one wide integer.
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildCast(Dst: SrcIntTy, Src: SrcReg).getReg(Idx: 0);
    }

    // Pointer destinations receive the integer result first, then inttoptr.
    Register ResultReg = DstReg;
    if (DstTy.isPointer())
      ResultReg =
          MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: DstTy.getSizeInBits()));

    if (Offset == 0)
      MIRBuilder.buildTrunc(Res: ResultReg, Op: SrcReg);
    else {
      // Shift the wanted bits down to position 0, then truncate.
      auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcIntTy, Val: Offset);
      auto Shr = MIRBuilder.buildLShr(Dst: SrcIntTy, Src0: SrcReg, Src1: ShiftAmt);
      MIRBuilder.buildTrunc(Res: ResultReg, Op: Shr);
    }

    if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
9545
// Lower G_INSERT. If the destination is a vector and the inserted value lines
// up with whole elements, rebuild the vector element-by-element; otherwise
// fall back to a scalar shift/mask/or bitwise insertion.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
  // Bit offset at which InsertSrc is placed inside Src.
  uint64_t Offset = MI.getOperand(i: 3).getImm();

  // Note: Dst and Src necessarily have the same type for G_INSERT.
  LLT DstTy = MRI.getType(Reg: Src);
  LLT InsertTy = MRI.getType(Reg: InsertSrc);

  // Pointers in non-integral address spaces must not be cast to/from
  // integers, which the bitwise fallback below requires.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  bool IsNonIntegralInsert =
      InsertTy.isPointerOrPointerVector() &&
      DL.isNonIntegralAddressSpace(AddrSpace: InsertTy.getAddressSpace());
  bool IsNonIntegralDst = DstTy.isPointerOrPointerVector() &&
                          DL.isNonIntegralAddressSpace(AddrSpace: DstTy.getAddressSpace());

  // Insert sub-vector or one element
  if (DstTy.isVector()) {
    LLT EltTy = DstTy.getElementType();

    if ((IsNonIntegralInsert || IsNonIntegralDst) && InsertTy != EltTy) {
      LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
      return UnableToLegalize;
    }

    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    // Element-wise path: only valid when the insert starts and ends on
    // element boundaries and stays inside the destination.
    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before insert start Offset
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc
      if (InsertTy.getSizeInBits() > EltSize) {
        // Sub-vector insert: unmerge InsertSrc and splice its elements in.
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(Elt: UnmergeInsertSrc.getReg(Idx: i));
        }
      } else {
        // Single-element insert; bridge a pointer/integer mismatch first.
        if (InsertTy.isPointer() && !EltTy.isPointer())
          InsertSrc = MIRBuilder.buildPtrToInt(Dst: EltTy, Src: InsertSrc).getReg(Idx: 0);
        else if (!InsertTy.isPointer() && EltTy.isPointer())
          InsertSrc = MIRBuilder.buildIntToPtr(Dst: EltTy, Src: InsertSrc).getReg(Idx: 0);
        DstElts.push_back(Elt: InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after insert
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  // The bitwise fallback only handles a scalar-sized insert into the whole
  // destination (or into a vector reinterpreted as one wide scalar).
  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  if (IsNonIntegralDst || IsNonIntegralInsert) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  // Work on a plain integer of the destination's full width.
  LLT IntDstTy = DstTy;

  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(Dst: IntDstTy, Src).getReg(Idx: 0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(SizeInBits: InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(Dst: IntInsertTy, Src: InsertSrc).getReg(Idx: 0);
  }

  // Widen the inserted value to the destination width and shift it into
  // position.
  Register ExtInsSrc = MIRBuilder.buildZExt(Res: IntDstTy, Op: InsertSrc).getReg(Idx: 0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(Res: IntDstTy, Val: Offset);
    ExtInsSrc = MIRBuilder.buildShl(Dst: IntDstTy, Src0: ExtInsSrc, Src1: ShiftAmt).getReg(Idx: 0);
  }

  // Mask that keeps every destination bit outside the inserted bit range
  // (set bits wrap around past the top, clearing [Offset, Offset+size)).
  APInt MaskVal = APInt::getBitsSetWithWrap(
      numBits: DstTy.getSizeInBits(), loBit: Offset + InsertTy.getSizeInBits(), hiBit: Offset);

  auto Mask = MIRBuilder.buildConstant(Res: IntDstTy, Val: MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(Dst: IntDstTy, Src0: Src, Src1: Mask);
  auto Or = MIRBuilder.buildOr(Dst: IntDstTy, Src0: MaskedSrc, Src1: ExtInsSrc);

  MIRBuilder.buildCast(Dst, Src: Or);
  MI.eraseFromParent();
  return Legalized;
}
9647
9648LegalizerHelper::LegalizeResult
9649LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
9650 auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
9651 MI.getFirst4RegLLTs();
9652 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
9653
9654 LLT Ty = Dst0Ty;
9655 LLT BoolTy = Dst1Ty;
9656
9657 Register NewDst0 = MRI.cloneVirtualRegister(VReg: Dst0);
9658
9659 if (IsAdd)
9660 MIRBuilder.buildAdd(Dst: NewDst0, Src0: LHS, Src1: RHS);
9661 else
9662 MIRBuilder.buildSub(Dst: NewDst0, Src0: LHS, Src1: RHS);
9663
9664 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
9665
9666 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9667
9668 // For an addition, the result should be less than one of the operands (LHS)
9669 // if and only if the other operand (RHS) is negative, otherwise there will
9670 // be overflow.
9671 // For a subtraction, the result should be less than one of the operands
9672 // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
9673 // otherwise there will be overflow.
9674 auto ResultLowerThanLHS =
9675 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: NewDst0, Op1: LHS);
9676 auto ConditionRHS = MIRBuilder.buildICmp(
9677 Pred: IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, Res: BoolTy, Op0: RHS, Op1: Zero);
9678
9679 MIRBuilder.buildXor(Dst: Dst1, Src0: ConditionRHS, Src1: ResultLowerThanLHS);
9680
9681 MIRBuilder.buildCopy(Res: Dst0, Op: NewDst0);
9682 MI.eraseFromParent();
9683
9684 return Legalized;
9685}
9686
9687LegalizerHelper::LegalizeResult LegalizerHelper::lowerSADDE(MachineInstr &MI) {
9688 auto [Res, OvOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
9689 const LLT Ty = MRI.getType(Reg: Res);
9690
9691 // sum = LHS + RHS + zext(CarryIn)
9692 auto Tmp = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);
9693 auto CarryZ = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
9694 auto Sum = MIRBuilder.buildAdd(Dst: Ty, Src0: Tmp, Src1: CarryZ);
9695 MIRBuilder.buildCopy(Res, Op: Sum);
9696
9697 // OvOut = icmp slt ((sum ^ lhs) & (sum ^ rhs)), 0
9698 auto AX = MIRBuilder.buildXor(Dst: Ty, Src0: Sum, Src1: LHS);
9699 auto BX = MIRBuilder.buildXor(Dst: Ty, Src0: Sum, Src1: RHS);
9700 auto T = MIRBuilder.buildAnd(Dst: Ty, Src0: AX, Src1: BX);
9701
9702 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9703 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: OvOut, Op0: T, Op1: Zero);
9704
9705 MI.eraseFromParent();
9706 return Legalized;
9707}
9708
9709LegalizerHelper::LegalizeResult LegalizerHelper::lowerSSUBE(MachineInstr &MI) {
9710 auto [Res, OvOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
9711 const LLT Ty = MRI.getType(Reg: Res);
9712
9713 // Diff = LHS - (RHS + zext(CarryIn))
9714 auto CarryZ = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
9715 auto RHSPlusCI = MIRBuilder.buildAdd(Dst: Ty, Src0: RHS, Src1: CarryZ);
9716 auto Diff = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHSPlusCI);
9717 MIRBuilder.buildCopy(Res, Op: Diff);
9718
9719 // ov = msb((LHS ^ RHS) & (LHS ^ Diff))
9720 auto X1 = MIRBuilder.buildXor(Dst: Ty, Src0: LHS, Src1: RHS);
9721 auto X2 = MIRBuilder.buildXor(Dst: Ty, Src0: LHS, Src1: Diff);
9722 auto T = MIRBuilder.buildAnd(Dst: Ty, Src0: X1, Src1: X2);
9723 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9724 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: OvOut, Op0: T, Op1: Zero);
9725
9726 MI.eraseFromParent();
9727 return Legalized;
9728}
9729
9730LegalizerHelper::LegalizeResult
9731LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
9732 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9733 LLT Ty = MRI.getType(Reg: Res);
9734 bool IsSigned;
9735 bool IsAdd;
9736 unsigned BaseOp;
9737 switch (MI.getOpcode()) {
9738 default:
9739 llvm_unreachable("unexpected addsat/subsat opcode");
9740 case TargetOpcode::G_UADDSAT:
9741 IsSigned = false;
9742 IsAdd = true;
9743 BaseOp = TargetOpcode::G_ADD;
9744 break;
9745 case TargetOpcode::G_SADDSAT:
9746 IsSigned = true;
9747 IsAdd = true;
9748 BaseOp = TargetOpcode::G_ADD;
9749 break;
9750 case TargetOpcode::G_USUBSAT:
9751 IsSigned = false;
9752 IsAdd = false;
9753 BaseOp = TargetOpcode::G_SUB;
9754 break;
9755 case TargetOpcode::G_SSUBSAT:
9756 IsSigned = true;
9757 IsAdd = false;
9758 BaseOp = TargetOpcode::G_SUB;
9759 break;
9760 }
9761
9762 if (IsSigned) {
9763 // sadd.sat(a, b) ->
9764 // hi = 0x7fffffff - smax(a, 0)
9765 // lo = 0x80000000 - smin(a, 0)
9766 // a + smin(smax(lo, b), hi)
9767 // ssub.sat(a, b) ->
9768 // lo = smax(a, -1) - 0x7fffffff
9769 // hi = smin(a, -1) - 0x80000000
9770 // a - smin(smax(lo, b), hi)
9771 // TODO: AMDGPU can use a "median of 3" instruction here:
9772 // a +/- med3(lo, b, hi)
9773 uint64_t NumBits = Ty.getScalarSizeInBits();
9774 auto MaxVal =
9775 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: NumBits));
9776 auto MinVal =
9777 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
9778 MachineInstrBuilder Hi, Lo;
9779 if (IsAdd) {
9780 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9781 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MaxVal, Src1: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: Zero));
9782 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MinVal, Src1: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: Zero));
9783 } else {
9784 auto NegOne = MIRBuilder.buildConstant(Res: Ty, Val: -1);
9785 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: NegOne),
9786 Src1: MaxVal);
9787 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: NegOne),
9788 Src1: MinVal);
9789 }
9790 auto RHSClamped =
9791 MIRBuilder.buildSMin(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: Lo, Src1: RHS), Src1: Hi);
9792 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, RHSClamped});
9793 } else {
9794 // uadd.sat(a, b) -> a + umin(~a, b)
9795 // usub.sat(a, b) -> a - umin(a, b)
9796 Register Not = IsAdd ? MIRBuilder.buildNot(Dst: Ty, Src0: LHS).getReg(Idx: 0) : LHS;
9797 auto Min = MIRBuilder.buildUMin(Dst: Ty, Src0: Not, Src1: RHS);
9798 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, Min});
9799 }
9800
9801 MI.eraseFromParent();
9802 return Legalized;
9803}
9804
9805LegalizerHelper::LegalizeResult
9806LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
9807 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9808 LLT Ty = MRI.getType(Reg: Res);
9809 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
9810 bool IsSigned;
9811 bool IsAdd;
9812 unsigned OverflowOp;
9813 switch (MI.getOpcode()) {
9814 default:
9815 llvm_unreachable("unexpected addsat/subsat opcode");
9816 case TargetOpcode::G_UADDSAT:
9817 IsSigned = false;
9818 IsAdd = true;
9819 OverflowOp = TargetOpcode::G_UADDO;
9820 break;
9821 case TargetOpcode::G_SADDSAT:
9822 IsSigned = true;
9823 IsAdd = true;
9824 OverflowOp = TargetOpcode::G_SADDO;
9825 break;
9826 case TargetOpcode::G_USUBSAT:
9827 IsSigned = false;
9828 IsAdd = false;
9829 OverflowOp = TargetOpcode::G_USUBO;
9830 break;
9831 case TargetOpcode::G_SSUBSAT:
9832 IsSigned = true;
9833 IsAdd = false;
9834 OverflowOp = TargetOpcode::G_SSUBO;
9835 break;
9836 }
9837
9838 auto OverflowRes =
9839 MIRBuilder.buildInstr(Opc: OverflowOp, DstOps: {Ty, BoolTy}, SrcOps: {LHS, RHS});
9840 Register Tmp = OverflowRes.getReg(Idx: 0);
9841 Register Ov = OverflowRes.getReg(Idx: 1);
9842 MachineInstrBuilder Clamp;
9843 if (IsSigned) {
9844 // sadd.sat(a, b) ->
9845 // {tmp, ov} = saddo(a, b)
9846 // ov ? (tmp >>s 31) + 0x80000000 : r
9847 // ssub.sat(a, b) ->
9848 // {tmp, ov} = ssubo(a, b)
9849 // ov ? (tmp >>s 31) + 0x80000000 : r
9850 uint64_t NumBits = Ty.getScalarSizeInBits();
9851 auto ShiftAmount = MIRBuilder.buildConstant(Res: Ty, Val: NumBits - 1);
9852 auto Sign = MIRBuilder.buildAShr(Dst: Ty, Src0: Tmp, Src1: ShiftAmount);
9853 auto MinVal =
9854 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
9855 Clamp = MIRBuilder.buildAdd(Dst: Ty, Src0: Sign, Src1: MinVal);
9856 } else {
9857 // uadd.sat(a, b) ->
9858 // {tmp, ov} = uaddo(a, b)
9859 // ov ? 0xffffffff : tmp
9860 // usub.sat(a, b) ->
9861 // {tmp, ov} = usubo(a, b)
9862 // ov ? 0 : tmp
9863 Clamp = MIRBuilder.buildConstant(Res: Ty, Val: IsAdd ? -1 : 0);
9864 }
9865 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: Clamp, Op1: Tmp);
9866
9867 MI.eraseFromParent();
9868 return Legalized;
9869}
9870
9871LegalizerHelper::LegalizeResult
9872LegalizerHelper::lowerShlSat(MachineInstr &MI) {
9873 assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
9874 MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
9875 "Expected shlsat opcode!");
9876 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
9877 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9878 LLT Ty = MRI.getType(Reg: Res);
9879 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
9880
9881 unsigned BW = Ty.getScalarSizeInBits();
9882 auto Result = MIRBuilder.buildShl(Dst: Ty, Src0: LHS, Src1: RHS);
9883 auto Orig = IsSigned ? MIRBuilder.buildAShr(Dst: Ty, Src0: Result, Src1: RHS)
9884 : MIRBuilder.buildLShr(Dst: Ty, Src0: Result, Src1: RHS);
9885
9886 MachineInstrBuilder SatVal;
9887 if (IsSigned) {
9888 auto SatMin = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: BW));
9889 auto SatMax = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: BW));
9890 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: LHS,
9891 Op1: MIRBuilder.buildConstant(Res: Ty, Val: 0));
9892 SatVal = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: SatMin, Op1: SatMax);
9893 } else {
9894 SatVal = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getMaxValue(numBits: BW));
9895 }
9896 auto Ov = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: BoolTy, Op0: LHS, Op1: Orig);
9897 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: SatVal, Op1: Result);
9898
9899 MI.eraseFromParent();
9900 return Legalized;
9901}
9902
9903LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
9904 auto [Dst, Src] = MI.getFirst2Regs();
9905 const LLT Ty = MRI.getType(Reg: Src);
9906 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
9907 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
9908
9909 // Swap most and least significant byte, set remaining bytes in Res to zero.
9910 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt);
9911 auto LSByteShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9912 auto MSByteShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9913 auto Res = MIRBuilder.buildOr(Dst: Ty, Src0: MSByteShiftedRight, Src1: LSByteShiftedLeft);
9914
9915 // Set i-th high/low byte in Res to i-th low/high byte from Src.
9916 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
9917 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
9918 APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
9919 auto Mask = MIRBuilder.buildConstant(Res: Ty, Val: APMask);
9920 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt - 16 * i);
9921 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
9922 auto LoByte = MIRBuilder.buildAnd(Dst: Ty, Src0: Src, Src1: Mask);
9923 auto LoShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: LoByte, Src1: ShiftAmt);
9924 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: LoShiftedLeft);
9925 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
9926 auto SrcShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9927 auto HiShiftedRight = MIRBuilder.buildAnd(Dst: Ty, Src0: SrcShiftedRight, Src1: Mask);
9928 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: HiShiftedRight);
9929 }
9930 Res.getInstr()->getOperand(i: 0).setReg(Dst);
9931
9932 MI.eraseFromParent();
9933 return Legalized;
9934}
9935
9936//{ (Src & Mask) >> N } | { (Src << N) & Mask }
9937static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
9938 MachineInstrBuilder Src, const APInt &Mask) {
9939 const LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
9940 MachineInstrBuilder C_N = B.buildConstant(Res: Ty, Val: N);
9941 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Res: Ty, Val: Mask);
9942 auto LHS = B.buildLShr(Dst: Ty, Src0: B.buildAnd(Dst: Ty, Src0: Src, Src1: MaskLoNTo0), Src1: C_N);
9943 auto RHS = B.buildAnd(Dst: Ty, Src0: B.buildShl(Dst: Ty, Src0: Src, Src1: C_N), Src1: MaskLoNTo0);
9944 return B.buildOr(Dst, Src0: LHS, Src1: RHS);
9945}
9946
// Lower G_BITREVERSE. For types of 8 bits or more, byte-swap first and then
// swap nibbles, bit-pairs and single bits within each byte; for narrower
// types, move every bit individually.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT SrcTy = MRI.getType(Reg: Src);
  unsigned Size = SrcTy.getScalarSizeInBits();
  unsigned VSize = SrcTy.getSizeInBits();

  if (Size >= 8) {
    if (SrcTy.isVector() && (VSize % 8 == 0) &&
        (LI.isLegal(Query: {TargetOpcode::G_BITREVERSE,
                     {LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8),
                      LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8)}}))) {
      // If bitreverse is legal for i8 vector of the same size, then cast
      // to i8 vector type.
      // e.g. v4s32 -> v16s8
      LLT VTy = LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8);
      auto BSWAP = MIRBuilder.buildBSwap(Dst: SrcTy, Src0: Src);
      auto Cast = MIRBuilder.buildBitcast(Dst: VTy, Src: BSWAP);
      auto RBIT = MIRBuilder.buildBitReverse(Dst: VTy, Src: Cast);
      MIRBuilder.buildBitcast(Dst, Src: RBIT);
    } else {
      // BSWAP reverses whole bytes; the SwapN steps below then reverse the
      // bits inside each byte.
      MachineInstrBuilder BSWAP =
          MIRBuilder.buildInstr(Opc: TargetOpcode::G_BSWAP, DstOps: {SrcTy}, SrcOps: {Src});

      // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
      // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
      // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
      MachineInstrBuilder Swap4 = SwapN(N: 4, Dst: SrcTy, B&: MIRBuilder, Src: BSWAP,
                                        Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xF0)));

      // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
      // [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
      // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
      MachineInstrBuilder Swap2 = SwapN(N: 2, Dst: SrcTy, B&: MIRBuilder, Src: Swap4,
                                        Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xCC)));

      // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
      // 6|7
      // [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
      // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
      SwapN(N: 1, Dst, B&: MIRBuilder, Src: Swap2, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xAA)));
    }
  } else {
    // Expand bitreverse for types smaller than 8 bits.
    // Move bit J of the source into bit I of the result, one bit at a time,
    // OR-ing the partial results together.
    MachineInstrBuilder Tmp;
    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
      MachineInstrBuilder Tmp2;
      if (I < J) {
        auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: J - I);
        Tmp2 = MIRBuilder.buildShl(Dst: SrcTy, Src0: Src, Src1: ShAmt);
      } else {
        auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: I - J);
        Tmp2 = MIRBuilder.buildLShr(Dst: SrcTy, Src0: Src, Src1: ShAmt);
      }

      auto Mask = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << J);
      Tmp2 = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Tmp2, Src1: Mask);
      if (I == 0)
        Tmp = Tmp2;
      else
        Tmp = MIRBuilder.buildOr(Dst: SrcTy, Src0: Tmp, Src1: Tmp2);
    }
    MIRBuilder.buildCopy(Res: Dst, Op: Tmp);
  }

  MI.eraseFromParent();
  return Legalized;
}
10015
// Lower G_READ_REGISTER / G_WRITE_REGISTER to a copy from/to the named
// physical register. Emits a diagnostic (and, for reads, an undef value) when
// the target does not recognize the register name.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  // Operand layout differs: reads define operand 0 and name via operand 1;
  // writes name via operand 0 and take the value in operand 1.
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(i: ValRegIndex).getReg();
  const LLT Ty = MRI.getType(Reg: ValReg);
  // The register name is carried as the first operand of an MDNode.
  const MDString *RegStr = cast<MDString>(
    Val: cast<MDNode>(Val: MI.getOperand(i: NameOpIdx).getMetadata())->getOperand(I: 0));

  Register PhysReg = TLI.getRegisterByName(RegName: RegStr->getString().data(), Ty, MF);
  if (!PhysReg) {
    // Unknown register name: report it on the source location and recover so
    // legalization can continue.
    const Function &Fn = MF.getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoGenericWithLoc(
        "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
            (IsRead ? "llvm.read_register" : "llvm.write_register"),
        Fn, MI.getDebugLoc()));
    // A read still has to define its result; use undef.
    if (IsRead)
      MIRBuilder.buildUndef(Res: ValReg);

    MI.eraseFromParent();
    return Legalized;
  }

  if (IsRead)
    MIRBuilder.buildCopy(Res: ValReg, Op: PhysReg);
  else
    MIRBuilder.buildCopy(Res: PhysReg, Op: ValReg);

  MI.eraseFromParent();
  return Legalized;
}
10051
10052LegalizerHelper::LegalizeResult
10053LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
10054 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
10055 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
10056 Register Result = MI.getOperand(i: 0).getReg();
10057 LLT OrigTy = MRI.getType(Reg: Result);
10058 auto SizeInBits = OrigTy.getScalarSizeInBits();
10059 LLT WideTy = OrigTy.changeElementSize(NewEltSize: SizeInBits * 2);
10060
10061 auto LHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 1)});
10062 auto RHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
10063 auto Mul = MIRBuilder.buildMul(Dst: WideTy, Src0: LHS, Src1: RHS);
10064 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
10065
10066 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: SizeInBits);
10067 auto Shifted = MIRBuilder.buildInstr(Opc: ShiftOp, DstOps: {WideTy}, SrcOps: {Mul, ShiftAmt});
10068 MIRBuilder.buildTrunc(Res: Result, Op: Shifted);
10069
10070 MI.eraseFromParent();
10071 return Legalized;
10072}
10073
// Lower G_IS_FPCLASS by classifying the value through integer bit tests on
// its raw floating-point representation, OR-ing together one comparison per
// requested class (or class group).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(i: 2).getImm());

  // Trivial cases: empty test is always false, full test always true.
  if (Mask == fcNone) {
    MIRBuilder.buildConstant(Res: DstReg, Val: 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(Res: DstReg, Val: 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());

  // Reinterpret the FP value as an integer of the same width.
  LLT IntTy = SrcTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: BitSize));
  auto AsInt = MIRBuilder.buildCopy(Res: IntTy, Op: SrcReg);

  // Various masks.
  APInt SignBit = APInt::getSignMask(BitWidth: BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(numBits: BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Sem: Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Sem: Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(numBits: BitSize, BitNo: AllOneMantissa.getActiveBits() - 1);
  APInt InversionMask = APInt::getAllOnes(numBits: DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(Res: IntTy, Val: SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ValueMask);
  auto InfC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(Res: IntTy, Val: 0);

  // Abs = value with the sign bit cleared; Sign = true iff sign bit was set.
  auto Abs = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ValueMaskC);
  auto Sign =
      MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: DstTy, Op0: AsInt, Op1: Abs);

  // Accumulate per-class predicates by OR-ing into Res.
  auto Res = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(Dst: DstTyCopy, Src0: Res, Src1: ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
                                     Op1: ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: AsInt,
                                     Op1: ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
                                    Op1: ExpMaskC);
    auto And = MIRBuilder.buildAnd(Dst: DstTy, Src0: Cmp, Src1: Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: ExpBits, Op1: ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(Res: IntTy, Val: 1);
    auto VMinusOne = MIRBuilder.buildSub(Dst: IntTy, Src0: V, Src1: OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: VMinusOne,
                             Op1: MIRBuilder.buildConstant(Res: IntTy, Val: AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: SubnormalRes, Src1: Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Sem: Semantics, Negative: true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(Res: IntTy, Val: NegInf);
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGE, Res: DstTy, Op0: Abs,
                                       Op1: InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //                    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy,
                                            Op0: Abs, Op1: InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(Dst: DstTy, Src0: IsNan, Src1: IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(shiftAmt: 1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        Dst: IntTy, Src0: Abs, Src1: MIRBuilder.buildConstant(Res: IntTy, Val: ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: ExpMinusOne,
                             Op1: MIRBuilder.buildConstant(Res: IntTy, Val: MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: Sign);
    else if (PartialCheck == fcPosNormal) {
      // Positive-only: additionally require the sign test to be false.
      auto PosSign = MIRBuilder.buildXor(
          Dst: DstTy, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: DstTy, Val: InversionMask));
      NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: PosSign);
    }
    appendToRes(NormalRes);
  }

  MIRBuilder.buildCopy(Res: DstReg, Op: Res);
  MI.eraseFromParent();
  return Legalized;
}
10245
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement G_SELECT in terms of XOR, AND, OR.
  // The mask is sign-extended to full element width so each lane is all-ones
  // or all-zeros, making (Op1 & Mask) | (Op2 & ~Mask) a lane-wise select.
  auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
      MI.getFirst4RegLLTs();

  LLT Op1TyInt =
      Op1Ty.changeElementType(NewEltTy: LLT::integer(SizeInBits: Op1Ty.getScalarSizeInBits()));

  // Pointer (or pointer-vector) operands are converted to integers for the
  // bitwise math and converted back at the end.
  bool IsEltPtr = DstTy.isPointerOrPointerVector();
  if (IsEltPtr) {
    LLT ScalarPtrTy = LLT::scalar(SizeInBits: DstTy.getScalarSizeInBits());
    LLT NewTy = DstTy.changeElementType(NewEltTy: ScalarPtrTy);
    Op1Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op1Reg).getReg(Idx: 0);
    Op1Ty = MRI.getType(Reg: Op1Reg);
    Op2Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op2Reg).getReg(Idx: 0);
    Op2Ty = MRI.getType(Reg: Op2Reg);
    DstTy = NewTy;
  }

  if (MaskTy.isScalar()) {
    // Turn the scalar condition into a vector condition mask if needed.

    Register MaskElt = MaskReg;

    // The condition was potentially zero extended before, but we want a sign
    // extended boolean.
    if (MaskTy != LLT::scalar(SizeInBits: 1))
      MaskElt = MIRBuilder.buildSExtInReg(Res: MaskTy, Op: MaskElt, ImmOp: 1).getReg(Idx: 0);

    // Continue the sign extension (or truncate) to match the data type.
    MaskElt =
        MIRBuilder.buildSExtOrTrunc(Res: DstTy.getScalarType(), Op: MaskElt).getReg(Idx: 0);

    if (DstTy.isVector()) {
      // Generate a vector splat idiom.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: DstTy, Src: MaskElt);
      MaskReg = ShufSplat.getReg(Idx: 0);
    } else {
      MaskReg = MaskElt;
    }
    MaskTy = DstTy;
  } else if (!DstTy.isVector()) {
    // Cannot handle the case that mask is a vector and dst is a scalar.
    return UnableToLegalize;
  }

  // The bitwise expansion needs the mask and data widths to line up exactly.
  if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
    return UnableToLegalize;
  }

  // Non-integer (e.g. FP) operands are bitcast to integers of equal width.
  if (!(Op1Ty.getScalarType().isAnyScalar() ||
        Op1Ty.getScalarType().isInteger()))
    Op1Reg = MIRBuilder.buildBitcast(Dst: Op1TyInt, Src: Op1Reg).getReg(Idx: 0);

  if (!(Op2Ty.getScalarType().isAnyScalar() ||
        Op2Ty.getScalarType().isInteger())) {
    auto Op2TyInt =
        Op2Ty.changeElementType(NewEltTy: LLT::integer(SizeInBits: Op2Ty.getScalarSizeInBits()));
    Op2Reg = MIRBuilder.buildBitcast(Dst: Op2TyInt, Src: Op2Reg).getReg(Idx: 0);
  }

  // Select = (Op1 & Mask) | (Op2 & ~Mask).
  auto NotMask = MIRBuilder.buildNot(Dst: MaskTy, Src0: MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op1Reg, Src1: MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op2Reg, Src1: NotMask);
  if (IsEltPtr) {
    // Convert the integer result back to the original pointer type.
    auto Or = MIRBuilder.buildOr(Dst: DstTy, Src0: NewOp1, Src1: NewOp2);
    MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
  } else {
    if (DstTy == Op1TyInt)
      MIRBuilder.buildOr(Dst: DstReg, Src0: NewOp1, Src1: NewOp2);
    else {
      // Result type differs from the integer working type; bitcast back.
      auto Or = MIRBuilder.buildOr(Dst: Op1TyInt, Src0: NewOp1, Src1: NewOp2);
      MIRBuilder.buildBitcast(Dst: DstReg, Src: Or.getReg(Idx: 0));
    }
  }
  MI.eraseFromParent();
  return Legalized;
}
10324
10325LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
10326 // Split DIVREM into individual instructions.
10327 unsigned Opcode = MI.getOpcode();
10328
10329 MIRBuilder.buildInstr(
10330 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
10331 : TargetOpcode::G_UDIV,
10332 DstOps: {MI.getOperand(i: 0).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
10333 MIRBuilder.buildInstr(
10334 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
10335 : TargetOpcode::G_UREM,
10336 DstOps: {MI.getOperand(i: 1).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
10337 MI.eraseFromParent();
10338 return Legalized;
10339}
10340
10341LegalizerHelper::LegalizeResult
10342LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
10343 // Expand %res = G_ABS %a into:
10344 // %v1 = G_ASHR %a, scalar_size-1
10345 // %v2 = G_ADD %a, %v1
10346 // %res = G_XOR %v2, %v1
10347 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
10348 Register OpReg = MI.getOperand(i: 1).getReg();
10349 auto ShiftAmt =
10350 MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - 1);
10351 auto Shift = MIRBuilder.buildAShr(Dst: DstTy, Src0: OpReg, Src1: ShiftAmt);
10352 auto Add = MIRBuilder.buildAdd(Dst: DstTy, Src0: OpReg, Src1: Shift);
10353 MIRBuilder.buildXor(Dst: MI.getOperand(i: 0).getReg(), Src0: Add, Src1: Shift);
10354 MI.eraseFromParent();
10355 return Legalized;
10356}
10357
10358LegalizerHelper::LegalizeResult
10359LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
10360 // Expand %res = G_ABS %a into:
10361 // %v1 = G_CONSTANT 0
10362 // %v2 = G_SUB %v1, %a
10363 // %res = G_SMAX %a, %v2
10364 Register SrcReg = MI.getOperand(i: 1).getReg();
10365 LLT Ty = MRI.getType(Reg: SrcReg);
10366 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
10367 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg);
10368 MIRBuilder.buildSMax(Dst: MI.getOperand(i: 0), Src0: SrcReg, Src1: Sub);
10369 MI.eraseFromParent();
10370 return Legalized;
10371}
10372
10373LegalizerHelper::LegalizeResult
10374LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
10375 Register SrcReg = MI.getOperand(i: 1).getReg();
10376 Register DestReg = MI.getOperand(i: 0).getReg();
10377 LLT Ty = MRI.getType(Reg: SrcReg), IType = LLT::scalar(SizeInBits: 1);
10378 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
10379 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg).getReg(Idx: 0);
10380 auto ICmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: IType, Op0: SrcReg, Op1: Zero);
10381 MIRBuilder.buildSelect(Res: DestReg, Tst: ICmp, Op0: SrcReg, Op1: Sub);
10382 MI.eraseFromParent();
10383 return Legalized;
10384}
10385
10386LegalizerHelper::LegalizeResult
10387LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) {
10388 assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
10389 MI.getOpcode() == TargetOpcode::G_ABDU) &&
10390 "Expected G_ABDS or G_ABDU instruction");
10391
10392 auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
10393 LLT Ty = MRI.getType(Reg: LHS);
10394
10395 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
10396 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
10397 Register LHSSub = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10398 Register RHSSub = MIRBuilder.buildSub(Dst: Ty, Src0: RHS, Src1: LHS).getReg(Idx: 0);
10399 CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS)
10400 ? CmpInst::ICMP_SGT
10401 : CmpInst::ICMP_UGT;
10402 auto ICmp = MIRBuilder.buildICmp(Pred, Res: LLT::scalar(SizeInBits: 1), Op0: LHS, Op1: RHS);
10403 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LHSSub, Op1: RHSSub);
10404
10405 MI.eraseFromParent();
10406 return Legalized;
10407}
10408
10409LegalizerHelper::LegalizeResult
10410LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) {
10411 assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
10412 MI.getOpcode() == TargetOpcode::G_ABDU) &&
10413 "Expected G_ABDS or G_ABDU instruction");
10414
10415 auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
10416 LLT Ty = MRI.getType(Reg: LHS);
10417
10418 // abds(lhs, rhs) -→ sub(smax(lhs, rhs), smin(lhs, rhs))
10419 // abdu(lhs, rhs) -→ sub(umax(lhs, rhs), umin(lhs, rhs))
10420 Register MaxReg, MinReg;
10421 if (MI.getOpcode() == TargetOpcode::G_ABDS) {
10422 MaxReg = MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10423 MinReg = MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10424 } else {
10425 MaxReg = MIRBuilder.buildUMax(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10426 MinReg = MIRBuilder.buildUMin(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10427 }
10428 MIRBuilder.buildSub(Dst: DstReg, Src0: MaxReg, Src1: MinReg);
10429
10430 MI.eraseFromParent();
10431 return Legalized;
10432}
10433
10434LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
10435 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
10436 LLT TyInt =
10437 DstTy.changeElementType(NewEltTy: LLT::integer(SizeInBits: DstTy.getScalarSizeInBits()));
10438 Register CastedSrc = SrcReg;
10439
10440 if (!(SrcTy.getScalarType().isAnyScalar() ||
10441 SrcTy.getScalarType().isInteger())) {
10442 auto SrcTyInt =
10443 SrcTy.changeElementType(NewEltTy: LLT::integer(SizeInBits: SrcTy.getScalarSizeInBits()));
10444 CastedSrc = MIRBuilder.buildBitcast(Dst: SrcTyInt, Src: SrcReg).getReg(Idx: 0);
10445 }
10446
10447 if (MRI.getType(Reg: DstReg) != TyInt) {
10448 // Reset sign bit
10449 Register NewDst =
10450 MIRBuilder
10451 .buildAnd(Dst: TyInt, Src0: CastedSrc,
10452 Src1: MIRBuilder.buildConstant(
10453 Res: TyInt, Val: APInt::getSignedMaxValue(
10454 numBits: DstTy.getScalarSizeInBits())))
10455 .getReg(Idx: 0);
10456
10457 MIRBuilder.buildBitcast(Dst: DstReg, Src: NewDst);
10458 } else
10459 MIRBuilder
10460 .buildAnd(
10461 Dst: DstReg, Src0: CastedSrc,
10462 Src1: MIRBuilder.buildConstant(
10463 Res: TyInt, Val: APInt::getSignedMaxValue(numBits: DstTy.getScalarSizeInBits())))
10464 .getReg(Idx: 0);
10465
10466 MI.eraseFromParent();
10467 return Legalized;
10468}
10469
10470LegalizerHelper::LegalizeResult
10471LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
10472 Register SrcReg = MI.getOperand(i: 1).getReg();
10473 LLT SrcTy = MRI.getType(Reg: SrcReg);
10474 LLT DstTy = MRI.getType(Reg: SrcReg);
10475
10476 // The source could be a scalar if the IR type was <1 x sN>.
10477 if (SrcTy.isScalar()) {
10478 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
10479 return UnableToLegalize; // FIXME: handle extension.
10480 // This can be just a plain copy.
10481 Observer.changingInstr(MI);
10482 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::COPY));
10483 Observer.changedInstr(MI);
10484 return Legalized;
10485 }
10486 return UnableToLegalize;
10487}
10488
// Lower G_VAARG into an explicit sequence on the va_list: load the current
// argument pointer, align it up if the argument demands it, store back the
// pointer bumped past the argument, and finally load the argument itself.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  Register ListPtr = MI.getOperand(i: 1).getReg();
  LLT PtrTy = MRI.getType(Reg: ListPtr);

  // LstPtr is a pointer to the head of the list. Get the address
  // of the head of the list.
  Align PtrAlignment = DL.getABITypeAlign(Ty: getTypeForLLT(Ty: PtrTy, C&: Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: PtrTy, base_alignment: PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(Res: PtrTy, Addr: ListPtr, MMO&: *PtrLoadMMO).getReg(Idx: 0);

  // Operand 2 carries the required argument alignment. If it exceeds the
  // minimum stack-argument alignment, round VAList up: add (align - 1), then
  // mask off the low Log2(align) bits.
  const Align A(MI.getOperand(i: 2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: A.value() - 1).getReg(Idx: 0);
    auto AddDst = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(Res: PtrTy, Op0: AddDst, NumBits: Log2(A));
    VAList = AndDst.getReg(Idx: 0);
  }

  // Increment the pointer, VAList, to the next vaarg
  // The list should be bumped by the size of element in the current head of
  // list.
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT LLTTy = MRI.getType(Reg: Dst);
  Type *Ty = getTypeForLLT(Ty: LLTTy, C&: Ctx);
  auto IncAmt =
      MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: DL.getTypeAllocSize(Ty));
  auto Succ = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: IncAmt);

  // Store the increment VAList to the legalized pointer
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOStore, MemTy: PtrTy, base_alignment: PtrAlignment);
  MIRBuilder.buildStore(Val: Succ, Addr: ListPtr, MMO&: *StoreMMO);
  // Load the actual argument out of the pointer VAList
  Align EltAlignment = DL.getABITypeAlign(Ty);
  MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: LLTTy, base_alignment: EltAlignment);
  MIRBuilder.buildLoad(Res: Dst, Addr: VAList, MMO&: *EltLoadMMO);

  MI.eraseFromParent();
  return Legalized;
}
10536
10537static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
10538 // On Darwin, -Os means optimize for size without hurting performance, so
10539 // only really optimize for size when -Oz (MinSize) is used.
10540 if (MF.getTarget().getTargetTriple().isOSDarwin())
10541 return MF.getFunction().hasMinSize();
10542 return MF.getFunction().hasOptSize();
10543}
10544
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
//
// On success MemOps holds one LLT per access to emit, in emission order, and
// the function returns true; it returns false if no decomposition within
// Limit operations exists.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  // A fixed destination alignment stricter than the source alignment cannot
  // be honored by this decomposition.
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  // Let the target pick its preferred access type first.
  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(SizeInBits: 64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, AddrSpace: DstAS, Alignment: Op.getDstAlign()))
        // Shrinks to a scalar whose bit width equals the old byte width
        // (e.g. s64 -> s8).
        Ty = LLT::scalar(SizeInBits: Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    // Narrow the access type until it fits the remaining byte count (or we
    // decide to emit an overlapping access covering the tail).
    while (TySize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
      // Largest power-of-two width strictly below the current width.
      NewTy = LLT::scalar(SizeInBits: llvm::bit_floor(Value: NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      unsigned Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, AddrSpace: DstAS, Alignment: Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              Flags: MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(x: Ty);
    Size -= TySize;
  }

  return true;
}
10611
// Get a vectorized representation of the memset value operand, GISel edition.
//
// \p Val is the memset byte operand; the result is a register of type \p Ty
// holding that byte replicated across every byte of the element(s).
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    // Known-constant scalar: materialize the splatted bit pattern directly.
    APInt Scalar = ValVRegAndVal->Value.trunc(width: 8);
    APInt SplatVal = APInt::getSplat(NewLen: NumBits, V: Scalar);
    return MIB.buildConstant(Res: Ty, Val: SplatVal).getReg(Idx: 0);
  }

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
  if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
    return MIB.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
  }

  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(Res: ExtType, Op: Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NewLen: NumBits, V: APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(Res: ExtType, Val: Magic);
    Val = MIB.buildMul(Dst: ExtType, Src0: ZExt, Src1: MagicMI).getReg(Idx: 0);
  }

  // For vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    Val = MIB.buildSplatBuildVector(Res: Ty, Src: Val).getReg(Idx: 0);

  return Val;
}
10644
// Lower a G_MEMSET with a known constant length into a sequence of stores of
// a splatted value, using store types chosen by findGISelOptimalMemOpLowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  // A destination that is a non-fixed stack object may have its alignment
  // raised below to allow wider stores.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     Op: MemOp::Set(Size: KnownLen, DstAlignCanChange,
                                                DstAlign: Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstAS: DstPtrInfo.getAddrSpace(), SrcAS: ~0u,
                                     FuncAttributes: MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(i: 1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
        MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, Ty: LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Reg: Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(Ty: LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(FromVT: LargestVT, ToVT: VT))
        Value = MIB.buildTrunc(Res: Ty, Op: MemSetValue).getReg(Idx: 0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    auto *StoreMMO = MF.getMachineMemOperand(MMO: &DstMMO, Offset: DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: DstOff);
      Ptr = MIB.buildObjectPtrOffset(Res: PtrTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
    }

    MIB.buildStore(Val: Value, Addr: Ptr, MMO&: *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
10757
// Lower G_MEMCPY_INLINE: extract the constant length and the alignments from
// the instruction's memory operands, then delegate to the 7-argument overload.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);

  auto [Dst, Src, Len] = MI.getFirst3Regs();

  const auto *MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
  // FIXME: support dynamically sized G_MEMCPY_INLINE
  assert(LenVRegAndVal &&
         "inline memcpy with dynamic size is not yet supported");
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
  if (KnownLen == 0) {
    // A zero-length copy is a no-op; just delete the instruction.
    MI.eraseFromParent();
    return Legalized;
  }

  // The first MMO describes the destination, the second the source.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
  Align DstAlign = DstMMO.getBaseAlign();
  Align SrcAlign = SrcMMO.getBaseAlign();

  return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
}
10787
// Lower G_MEMCPY_INLINE with a known length. Inline expansion must always
// succeed, so the generic memcpy lowering is invoked with an effectively
// unbounded store-count limit.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
                                   uint64_t KnownLen, Align DstAlign,
                                   Align SrcAlign, bool IsVolatile) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
  return lowerMemcpy(MI, Dst, Src, KnownLen,
                     Limit: std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
                     IsVolatile);
}
10797
// Lower a G_MEMCPY with a known constant length into a sequence of load/store
// pairs, emitting at most \p Limit memory operations.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  // A destination that is a non-fixed stack object may have its alignment
  // raised below to allow wider accesses.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(a: DstAlign, b: SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  // The first MMO describes the destination, the second the source.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
                       IsVolatile),
          DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
          FuncAttributes: MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(Ty: IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(a: NewAlign, b: *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(i: 1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
        MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores
  // mixed types, depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Reg: Src);
      Offset = MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset)
                   .getReg(Idx: 0);
      LoadPtr = MIB.buildObjectPtrOffset(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
    }
    auto LdVal = MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO);

    // Create the store, reusing the offset constant built for the load.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Reg: Dst);
      StorePtr = MIB.buildObjectPtrOffset(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
    }
    MIB.buildStore(Val: LdVal, Addr: StorePtr, MMO&: *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
10904
// Lower a G_MEMMOVE with a known constant length. Because source and
// destination may overlap, all loads are emitted before any store.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  // A destination that is a non-fixed stack object may have its alignment
  // raised below to allow wider accesses.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = std::min(a: DstAlign, b: SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  // The first MMO describes the destination, the second the source.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in it's findOptimalMemOpLowering implementation. For now do the
  // same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
                       /*IsVolatile*/ true),
          DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
          FuncAttributes: MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(Ty: IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(a: NewAlign, b: *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(i: 1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
        MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the stores.
  // Apart from that, this loop is pretty much doing the same thing as the
  // memcpy codegen function.
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Reg: Src);
      auto Offset =
          MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset);
      LoadPtr = MIB.buildObjectPtrOffset(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
    }
    LoadVals.push_back(Elt: MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO).getReg(Idx: 0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  // Second pass: store every value loaded above, at the same offsets.
  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Reg: Dst);
      auto Offset =
          MIB.buildConstant(Res: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), Val: CurrOffset);
      StorePtr = MIB.buildObjectPtrOffset(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
    }
    MIB.buildStore(Val: LoadVals[I], Addr: StorePtr, MMO&: *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}
11010
// Dispatch lowering for the G_MEMCPY / G_MEMMOVE / G_MEMSET family. Only
// constant-length operations, and only those no longer than \p MaxLen when
// MaxLen is nonzero, are expanded inline.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  const unsigned Opc = MI.getOpcode();
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
          Opc == TargetOpcode::G_MEMSET) &&
         "Expected memcpy like instruction");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;

  Align DstAlign = MemOp->getBaseAlign();
  Align SrcAlign;
  auto [Dst, Src, Len] = MI.getFirst3Regs();

  if (Opc != TargetOpcode::G_MEMSET) {
    // Copy-like ops carry a second MMO describing the source.
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlign();
  }

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
  if (!LenVRegAndVal)
    return UnableToLegalize;
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();

  if (KnownLen == 0) {
    // A zero-length operation is a no-op; just delete the instruction.
    MI.eraseFromParent();
    return Legalized;
  }

  if (MaxLen && KnownLen > MaxLen)
    return UnableToLegalize;

  bool IsVolatile = MemOp->isVolatile();
  if (Opc == TargetOpcode::G_MEMCPY) {
    auto &MF = *MI.getParent()->getParent();
    const auto &TLI = *MF.getSubtarget().getTargetLowering();
    bool OptSize = shouldLowerMemFuncForSize(MF);
    uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
    return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
                       IsVolatile);
  }
  if (Opc == TargetOpcode::G_MEMMOVE)
    return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (Opc == TargetOpcode::G_MEMSET)
    return lowerMemset(MI, Dst, Val: Src, KnownLen, Alignment: DstAlign, IsVolatile);
  return UnableToLegalize;
}
11062