//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <numeric>
#include <optional>

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
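///
/// For example, breaking s96 into s32 yields {3, 0} with no leftover, while
/// breaking s100 into s32 yields {3, 1} with \p LeftoverTy set to s4.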
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = OrigTy.changeElementCount(
        ElementCount::getFixed(LeftoverSize / EltSize));
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder,
                                 const LibcallLoweringInfo *Libcalls)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), Libcalls(Libcalls) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B,
                                 const LibcallLoweringInfo *Libcalls,
                                 GISelValueTracking *VT)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), Libcalls(Libcalls), VT(VT) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different numbers of elements and insert into
  // DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs);
    AllRegs.append(LeftoverRegs.begin(), LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

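  // For scalar results, fall back on a GCD/LCM strategy: split every input
  // into pieces of a common GCD type, merge those back up to the LCM of the
  // result and leftover types, and then extract the destination from that.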
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
               MIRBuilder, MRI);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (!MRI.getType(Leftover).isVector())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to
    // do anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

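/// Merge the GCD-typed registers in \p VRegs into \p NarrowTy sized pieces
/// covering the LCM of \p DstTy and \p NarrowTy, padding any missing high
/// pieces according to \p PadStrategy: G_ZEXT pads with zero, G_ANYEXT with
/// undef, and G_SEXT with copies of the sign bit. On return, \p VRegs holds
/// the resulting \p NarrowTy registers and the LCM type is returned.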
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't
  // evenly cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can
  // reuse the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding more
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits
  // into the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

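  // For a vector LCM type, unmerge the merged value into DstTy pieces; the
  // destination takes the first piece and the remaining defs are left unused.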
  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_LROUND:
    RTLIBCASE(LROUND_F);
  case TargetOpcode::G_LLROUND:
    RTLIBCASE(LLROUND_F);
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FSINCOS:
    RTLIBCASE(SINCOS_F);
  case TargetOpcode::G_FMODF:
    RTLIBCASE(MODF_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FMINIMUMNUM:
    RTLIBCASE(FMINIMUM_NUM_F);
  case TargetOpcode::G_FMAXIMUMNUM:
    RTLIBCASE(FMAXIMUM_NUM_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  // G_MEMCPY %0, %1, %2
  // $x0 = COPY %0
  // RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be
    // the returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult LegalizerHelper::createLibcall(
    const char *Name, const CallLowering::ArgInfo &Result,
    ArrayRef<CallLowering::ArgInfo> Args, const CallingConv::ID CC,
    LostDebugLocObserver &LocObserver, MachineInstr *MI) const {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::createLibcall(
    RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result,
    ArrayRef<CallLowering::ArgInfo> Args, LostDebugLocObserver &LocObserver,
    MachineInstr *MI) const {
  if (!Libcalls)
    return LegalizerHelper::UnableToLegalize;

  RTLIB::LibcallImpl LibcallImpl = Libcalls->getLibcallImpl(Libcall);
  if (LibcallImpl == RTLIB::Unsupported)
    return LegalizerHelper::UnableToLegalize;

  StringRef Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl);
  const CallingConv::ID CC = Libcalls->getLibcallImplCallingConv(LibcallImpl);
  return createLibcall(Name.data(), Result, Args, CC, LocObserver, MI);
}

// Useful for libcalls where all operands have the same type.
LegalizerHelper::LegalizeResult
LegalizerHelper::simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
                               unsigned Size, Type *OpType,
                               LostDebugLocObserver &LocObserver) const {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(Libcall, {MI.getOperand(0).getReg(), OpType, 0}, Args,
                       LocObserver, &MI);
}

LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
    MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType,
    LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = *MI.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstSin = MI.getOperand(0).getReg();
  Register DstCos = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstSin);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  Register StackPtrSin =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);
  Register StackPtrCos =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);

  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult = createLibcall(
      getRTLibDesc(MI.getOpcode(), Size), {{0}, Type::getVoidTy(Ctx), 0},
      {{Src, OpType, 0},
       {StackPtrSin, PointerType::get(Ctx, AddrSpace), 1},
       {StackPtrCos, PointerType::get(Ctx, AddrSpace), 2}},
      LocObserver, &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);
  MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);

  MIRBuilder.buildLoad(DstSin, StackPtrSin, *LoadMMOSin);
  MIRBuilder.buildLoad(DstCos, StackPtrCos, *LoadMMOCos);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
                                 unsigned Size, Type *OpType,
                                 LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstFrac = MI.getOperand(0).getReg();
  Register DstInt = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstFrac);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  Register StackPtrInt =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);

  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult = createLibcall(
      getRTLibDesc(MI.getOpcode(), Size), {DstFrac, OpType, 0},
      {{Src, OpType, 0}, {StackPtrInt, PointerType::get(Ctx, AddrSpace), 1}},
      LocObserver, &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOInt = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);

  MIRBuilder.buildLoad(DstInt, StackPtrInt, *LoadMMOInt);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

LegalizerHelper::LegalizeResult LegalizerHelper::conversionLibcall(
    MachineInstr &MI, Type *ToType, Type *FromType,
    LostDebugLocObserver &LocObserver, bool IsSigned) const {
  CallLowering::ArgInfo Arg = {MI.getOperand(1).getReg(), FromType, 0};
  if (FromType->isIntegerTy()) {
    if (TLI.shouldSignExtendTypeInLibCall(FromType, IsSigned))
      Arg.Flags[0].setSExt();
    else
      Arg.Flags[0].setZExt();
  }

  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(Libcall, {MI.getOperand(0).getReg(), ToType, 0}, Arg,
                       LocObserver, &MI);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::createMemLibcall(MachineRegisterInfo &MRI, MachineInstr &MI,
                                  LostDebugLocObserver &LocObserver) const {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // We need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }

  if (!Libcalls) // FIXME: Should be mandatory
    return LegalizerHelper::UnableToLegalize;

  RTLIB::LibcallImpl RTLibcallImpl = Libcalls->getLibcallImpl(RTLibcall);

  // Unsupported libcall on the target.
  if (RTLibcallImpl == RTLIB::Unsupported) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = Libcalls->getLibcallImplCallingConv(RTLibcallImpl);

  StringRef LibcallName =
      RTLIB::RuntimeLibcallsInfo::getLibcallImplName(RTLibcallImpl);
  Info.Callee = MachineOperand::CreateES(LibcallName.data());
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

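// Each LCALLS entry expands to the four memory orderings (relaxed, acquire,
// release, acq_rel) for one access size; LCALL5 covers the five supported
// sizes (1, 2, 4, 8, and 16 bytes), giving the 5x4 tables used below.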
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}

LegalizerHelper::LegalizeResult
LegalizerHelper::createAtomicLibcall(MachineInstr &MI) const {
  auto &Ctx = MIRBuilder.getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
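    // The outline atomics implement AND as LDCLR (clear the bits set in the
    // operand) and SUB as LDADD, so complement or negate the value operand
    // so the libcall computes the requested operation.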
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  if (!Libcalls) // FIXME: Should be mandatory
    return LegalizerHelper::UnableToLegalize;

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  RTLIB::LibcallImpl RTLibcallImpl = Libcalls->getLibcallImpl(RTLibcall);

  // Unsupported libcall on the target.
  if (RTLibcallImpl == RTLIB::Unsupported) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = Libcalls->getLibcallImplCallingConv(RTLibcallImpl);

  StringRef LibcallName =
      RTLIB::RuntimeLibcallsInfo::getLibcallImplName(RTLibcallImpl);
  Info.Callee = MachineOperand::CreateES(LibcallName.data());
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall
getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
  RTLIB::Libcall RTLibcall;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_GET_FPENV:
    RTLibcall = RTLIB::FEGETENV;
    break;
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_RESET_FPENV:
    RTLibcall = RTLIB::FESETENV;
    break;
  case TargetOpcode::G_GET_FPMODE:
    RTLibcall = RTLIB::FEGETMODE;
    break;
  case TargetOpcode::G_SET_FPMODE:
  case TargetOpcode::G_RESET_FPMODE:
    RTLibcall = RTLIB::FESETMODE;
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  }
  return RTLibcall;
}

// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as an integer value. To implement
// these intrinsics via the library functions, we need to use a temporary
// variable, for example:
//
//     %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//     %1:_(p0) = G_FRAME_INDEX %stack.0
//     BL &fegetmode
//     %0:_(s32) = G_LOAD %1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary, where the library function will put the read state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res = createLibcall(
      RTLibcall, CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
      CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}), LocObserver,
      nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}

// Similar to `createGetStateLibcall`, this function calls a library function
// using transient space on the stack. In this case the library function reads
// the content of the memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary, where the library function will get the new state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}

/// Returns the corresponding libcall for the given Pred and
/// the ICMP predicate that should be generated to compare with #0
/// after the libcall.
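///
/// For example, for FCMP_OEQ at 32 bits this returns {RTLIB::OEQ_F32,
/// CmpInst::ICMP_EQ}: the libcall result is compared for equality with zero.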
1122static std::pair<RTLIB::Libcall, CmpInst::Predicate>
1123getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
1124#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred) \
1125 do { \
1126 switch (Size) { \
1127 case 32: \
1128 return {RTLIB::LibcallPrefix##32, ICmpPred}; \
1129 case 64: \
1130 return {RTLIB::LibcallPrefix##64, ICmpPred}; \
1131 case 128: \
1132 return {RTLIB::LibcallPrefix##128, ICmpPred}; \
1133 default: \
1134 llvm_unreachable("unexpected size"); \
1135 } \
1136 } while (0)
1137
1138 switch (Pred) {
1139 case CmpInst::FCMP_OEQ:
1140 RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
1141 case CmpInst::FCMP_UNE:
1142 RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
1143 case CmpInst::FCMP_OGE:
1144 RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
1145 case CmpInst::FCMP_OLT:
1146 RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
1147 case CmpInst::FCMP_OLE:
1148 RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
1149 case CmpInst::FCMP_OGT:
1150 RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
1151 case CmpInst::FCMP_UNO:
1152 RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
1153 default:
1154 return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
1155 }
1156}
1157
1158LegalizerHelper::LegalizeResult
1159LegalizerHelper::createFCMPLibcall(MachineInstr &MI,
1160 LostDebugLocObserver &LocObserver) {
1161 auto &MF = MIRBuilder.getMF();
1162 auto &Ctx = MF.getFunction().getContext();
1163 const GFCmp *Cmp = cast<GFCmp>(Val: &MI);
1164
1165 LLT OpLLT = MRI.getType(Reg: Cmp->getLHSReg());
1166 unsigned Size = OpLLT.getSizeInBits();
1167 if ((Size != 32 && Size != 64 && Size != 128) ||
1168 OpLLT != MRI.getType(Reg: Cmp->getRHSReg()))
1169 return UnableToLegalize;
1170
1171 Type *OpType = getFloatTypeForLLT(Ctx, Ty: OpLLT);
1172
1173 // DstReg type is s32
1174 const Register DstReg = Cmp->getReg(Idx: 0);
1175 LLT DstTy = MRI.getType(Reg: DstReg);
1176 const auto Cond = Cmp->getCond();
1177
1178 // Reference:
1179 // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
1180 // Generates a libcall followed by ICMP.
1181 const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
1182 const CmpInst::Predicate ICmpPred,
1183 const DstOp &Res) -> Register {
1184 // FCMP libcall always returns an i32, and needs an ICMP with #0.
1185 constexpr LLT TempLLT = LLT::scalar(SizeInBits: 32);
1186 Register Temp = MRI.createGenericVirtualRegister(Ty: TempLLT);
1187 // Generate libcall, holding result in Temp
1188 const auto Status = createLibcall(
1189 Libcall, Result: {Temp, Type::getInt32Ty(C&: Ctx), 0},
1190 Args: {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
1191 LocObserver, MI: &MI);
1192 if (!Status)
1193 return {};
1194
1195 // Compare temp with #0 to get the final result.
1196 return MIRBuilder
1197 .buildICmp(Pred: ICmpPred, Res, Op0: Temp, Op1: MIRBuilder.buildConstant(Res: TempLLT, Val: 0))
1198 .getReg(Idx: 0);
1199 };
1200
1201 // Simple case if we have a direct mapping from predicate to libcall
1202 if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Pred: Cond, Size);
1203 Libcall != RTLIB::UNKNOWN_LIBCALL &&
1204 ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
1205 if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
1206 return Legalized;
1207 }
1208 return UnableToLegalize;
1209 }
1210
1211 // No direct mapping found, should be generated as combination of libcalls.
1212
1213 switch (Cond) {
1214 case CmpInst::FCMP_UEQ: {
1215 // FCMP_UEQ: unordered or equal
1216 // Convert into (FCMP_OEQ || FCMP_UNO).
1217
1218 const auto [OeqLibcall, OeqPred] =
1219 getFCMPLibcallDesc(Pred: CmpInst::FCMP_OEQ, Size);
1220 const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);
1221
1222 const auto [UnoLibcall, UnoPred] =
1223 getFCMPLibcallDesc(Pred: CmpInst::FCMP_UNO, Size);
1224 const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
1225 if (Oeq && Uno)
1226 MIRBuilder.buildOr(Dst: DstReg, Src0: Oeq, Src1: Uno);
1227 else
1228 return UnableToLegalize;
1229
1230 break;
1231 }
1232 case CmpInst::FCMP_ONE: {
1233 // FCMP_ONE: ordered and operands are unequal
1234 // Convert into (!FCMP_OEQ && !FCMP_UNO).
1235
1236 // We inverse the predicate instead of generating a NOT
1237 // to save one instruction.
1238 // On AArch64 isel can even select two cmp into a single ccmp.
1239 const auto [OeqLibcall, OeqPred] =
1240 getFCMPLibcallDesc(Pred: CmpInst::FCMP_OEQ, Size);
1241 const auto NotOeq =
1242 BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(pred: OeqPred), DstTy);
1243
1244 const auto [UnoLibcall, UnoPred] =
1245 getFCMPLibcallDesc(Pred: CmpInst::FCMP_UNO, Size);
1246 const auto NotUno =
1247 BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(pred: UnoPred), DstTy);
1248
1249 if (NotOeq && NotUno)
1250 MIRBuilder.buildAnd(Dst: DstReg, Src0: NotOeq, Src1: NotUno);
1251 else
1252 return UnableToLegalize;
1253
1254 break;
1255 }
1256 case CmpInst::FCMP_ULT:
1257 case CmpInst::FCMP_UGE:
1258 case CmpInst::FCMP_UGT:
1259 case CmpInst::FCMP_ULE:
1260 case CmpInst::FCMP_ORD: {
1261 // Convert into: !(inverse(Pred))
1262 // E.g. FCMP_ULT becomes !FCMP_OGE
1263 // This is equivalent to the following, but saves some instructions.
1264 // MIRBuilder.buildNot(
1265 // PredTy,
1266 // MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
1267 // Op1, Op2));
1268 const auto [InversedLibcall, InversedPred] =
1269 getFCMPLibcallDesc(Pred: CmpInst::getInversePredicate(pred: Cond), Size);
1270 if (!BuildLibcall(InversedLibcall,
1271 CmpInst::getInversePredicate(pred: InversedPred), DstReg))
1272 return UnableToLegalize;
1273 break;
1274 }
1275 default:
1276 return UnableToLegalize;
1277 }
1278
1279 return Legalized;
1280}
1281
1282// The function is used to legalize operations that set default environment
1283// state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
1284// On most targets supported in glibc FE_DFL_MODE is defined as
1285// `((const femode_t *) -1)`. Such assumption is used here. If for some target
1286// it is not true, the target must provide custom lowering.
1287LegalizerHelper::LegalizeResult
1288LegalizerHelper::createResetStateLibcall(MachineInstr &MI,
1289 LostDebugLocObserver &LocObserver) {
1290 const DataLayout &DL = MIRBuilder.getDataLayout();
1291 auto &MF = MIRBuilder.getMF();
1292 auto &Ctx = MF.getFunction().getContext();
1293
1294 // Create an argument for the library function.
1295 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
1296 Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: AddrSpace);
1297 unsigned PtrSize = DL.getPointerSizeInBits(AS: AddrSpace);
1298 LLT MemTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: PtrSize);
1299 auto DefValue = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrSize), Val: -1LL);
1300 DstOp Dest(MRI.createGenericVirtualRegister(Ty: MemTy));
1301 MIRBuilder.buildIntToPtr(Dst: Dest, Src: DefValue);
1302
1303 RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
1304 return createLibcall(
1305 Libcall: RTLibcall, Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
1306 Args: CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}), LocObserver, MI: &MI);
1307}
1308
1309LegalizerHelper::LegalizeResult
1310LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
1311 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
1312
1313 switch (MI.getOpcode()) {
1314 default:
1315 return UnableToLegalize;
1316 case TargetOpcode::G_MUL:
1317 case TargetOpcode::G_SDIV:
1318 case TargetOpcode::G_UDIV:
1319 case TargetOpcode::G_SREM:
1320 case TargetOpcode::G_UREM:
1321 case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
1322 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1323 unsigned Size = LLTy.getSizeInBits();
1324 Type *HLTy = IntegerType::get(C&: Ctx, NumBits: Size);
1325 auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
1326 if (Status != Legalized)
1327 return Status;
1328 break;
1329 }
1330 case TargetOpcode::G_FADD:
1331 case TargetOpcode::G_FSUB:
1332 case TargetOpcode::G_FMUL:
1333 case TargetOpcode::G_FDIV:
1334 case TargetOpcode::G_FMA:
1335 case TargetOpcode::G_FPOW:
1336 case TargetOpcode::G_FREM:
1337 case TargetOpcode::G_FCOS:
1338 case TargetOpcode::G_FSIN:
1339 case TargetOpcode::G_FTAN:
1340 case TargetOpcode::G_FACOS:
1341 case TargetOpcode::G_FASIN:
1342 case TargetOpcode::G_FATAN:
1343 case TargetOpcode::G_FATAN2:
1344 case TargetOpcode::G_FCOSH:
1345 case TargetOpcode::G_FSINH:
1346 case TargetOpcode::G_FTANH:
1347 case TargetOpcode::G_FLOG10:
1348 case TargetOpcode::G_FLOG:
1349 case TargetOpcode::G_FLOG2:
1350 case TargetOpcode::G_FEXP:
1351 case TargetOpcode::G_FEXP2:
1352 case TargetOpcode::G_FEXP10:
1353 case TargetOpcode::G_FCEIL:
1354 case TargetOpcode::G_FFLOOR:
1355 case TargetOpcode::G_FMINNUM:
1356 case TargetOpcode::G_FMAXNUM:
1357 case TargetOpcode::G_FMINIMUMNUM:
1358 case TargetOpcode::G_FMAXIMUMNUM:
1359 case TargetOpcode::G_FSQRT:
1360 case TargetOpcode::G_FRINT:
1361 case TargetOpcode::G_FNEARBYINT:
1362 case TargetOpcode::G_INTRINSIC_TRUNC:
1363 case TargetOpcode::G_INTRINSIC_ROUND:
1364 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
1365 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1366 unsigned Size = LLTy.getSizeInBits();
1367 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1368 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1369 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1370 return UnableToLegalize;
1371 }
1372 auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
1373 if (Status != Legalized)
1374 return Status;
1375 break;
1376 }
1377 case TargetOpcode::G_FSINCOS: {
1378 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1379 unsigned Size = LLTy.getSizeInBits();
1380 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1381 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1382 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1383 return UnableToLegalize;
1384 }
1385 return emitSincosLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
1386 }
1387 case TargetOpcode::G_FMODF: {
1388 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1389 unsigned Size = LLTy.getSizeInBits();
1390 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1391 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1392 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1393 return UnableToLegalize;
1394 }
1395 return emitModfLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
1396 }
1397 case TargetOpcode::G_LROUND:
1398 case TargetOpcode::G_LLROUND:
1399 case TargetOpcode::G_INTRINSIC_LRINT:
1400 case TargetOpcode::G_INTRINSIC_LLRINT: {
1401 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
1402 unsigned Size = LLTy.getSizeInBits();
1403 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1404 Type *ITy = IntegerType::get(
1405 C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits());
1406 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1407 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1408 return UnableToLegalize;
1409 }
1410 auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
1411 LegalizeResult Status =
1412 createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), ITy, 0},
1413 Args: {{MI.getOperand(i: 1).getReg(), HLTy, 0}}, LocObserver, MI: &MI);
1414 if (Status != Legalized)
1415 return Status;
1416 MI.eraseFromParent();
1417 return Legalized;
1418 }
1419 case TargetOpcode::G_FPOWI:
1420 case TargetOpcode::G_FLDEXP: {
1421 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1422 unsigned Size = LLTy.getSizeInBits();
1423 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1424 Type *ITy = IntegerType::get(
1425 C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
1426 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1427 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1428 return UnableToLegalize;
1429 }
1430 auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
1431 SmallVector<CallLowering::ArgInfo, 2> Args = {
1432 {MI.getOperand(i: 1).getReg(), HLTy, 0},
1433 {MI.getOperand(i: 2).getReg(), ITy, 1}};
1434 Args[1].Flags[0].setSExt();
1435 LegalizeResult Status = createLibcall(
1436 Libcall, Result: {MI.getOperand(i: 0).getReg(), HLTy, 0}, Args, LocObserver, MI: &MI);
1437 if (Status != Legalized)
1438 return Status;
1439 break;
1440 }
1441 case TargetOpcode::G_FPEXT:
1442 case TargetOpcode::G_FPTRUNC: {
1443 Type *FromTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
1444 Type *ToTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
1445 if (!FromTy || !ToTy)
1446 return UnableToLegalize;
1447 LegalizeResult Status = conversionLibcall(MI, ToType: ToTy, FromType: FromTy, LocObserver);
1448 if (Status != Legalized)
1449 return Status;
1450 break;
1451 }
1452 case TargetOpcode::G_FCMP: {
1453 LegalizeResult Status = createFCMPLibcall(MI, LocObserver);
1454 if (Status != Legalized)
1455 return Status;
1456 MI.eraseFromParent();
1457 return Status;
1458 }
1459 case TargetOpcode::G_FPTOSI:
1460 case TargetOpcode::G_FPTOUI: {
1461 // FIXME: Support other types
1462 Type *FromTy =
1463 getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
1464 unsigned ToSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1465 if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
1466 return UnableToLegalize;
1467 LegalizeResult Status = conversionLibcall(MI, ToType: Type::getIntNTy(C&: Ctx, N: ToSize),
1468 FromType: FromTy, LocObserver);
1469 if (Status != Legalized)
1470 return Status;
1471 break;
1472 }
1473 case TargetOpcode::G_SITOFP:
1474 case TargetOpcode::G_UITOFP: {
1475 unsigned FromSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1476 Type *ToTy =
1477 getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
1478 if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
1479 return UnableToLegalize;
1480 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
1481 LegalizeResult Status = conversionLibcall(
1482 MI, ToType: ToTy, FromType: Type::getIntNTy(C&: Ctx, N: FromSize), LocObserver, IsSigned);
1483 if (Status != Legalized)
1484 return Status;
1485 break;
1486 }
1487 case TargetOpcode::G_ATOMICRMW_XCHG:
1488 case TargetOpcode::G_ATOMICRMW_ADD:
1489 case TargetOpcode::G_ATOMICRMW_SUB:
1490 case TargetOpcode::G_ATOMICRMW_AND:
1491 case TargetOpcode::G_ATOMICRMW_OR:
1492 case TargetOpcode::G_ATOMICRMW_XOR:
1493 case TargetOpcode::G_ATOMIC_CMPXCHG:
1494 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
1495 auto Status = createAtomicLibcall(MI);
1496 if (Status != Legalized)
1497 return Status;
1498 break;
1499 }
1500 case TargetOpcode::G_BZERO:
1501 case TargetOpcode::G_MEMCPY:
1502 case TargetOpcode::G_MEMMOVE:
1503 case TargetOpcode::G_MEMSET: {
1504 LegalizeResult Result =
1505 createMemLibcall(MRI&: *MIRBuilder.getMRI(), MI, LocObserver);
1506 if (Result != Legalized)
1507 return Result;
1508 MI.eraseFromParent();
1509 return Result;
1510 }
1511 case TargetOpcode::G_GET_FPENV:
1512 case TargetOpcode::G_GET_FPMODE: {
1513 LegalizeResult Result = createGetStateLibcall(MI, LocObserver);
1514 if (Result != Legalized)
1515 return Result;
1516 break;
1517 }
1518 case TargetOpcode::G_SET_FPENV:
1519 case TargetOpcode::G_SET_FPMODE: {
1520 LegalizeResult Result = createSetStateLibcall(MI, LocObserver);
1521 if (Result != Legalized)
1522 return Result;
1523 break;
1524 }
1525 case TargetOpcode::G_RESET_FPENV:
1526 case TargetOpcode::G_RESET_FPMODE: {
1527 LegalizeResult Result = createResetStateLibcall(MI, LocObserver);
1528 if (Result != Legalized)
1529 return Result;
1530 break;
1531 }
1532 }
1533
1534 MI.eraseFromParent();
1535 return Legalized;
1536}
1537
1538LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1539 unsigned TypeIdx,
1540 LLT NarrowTy) {
1541 uint64_t SizeOp0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1542 uint64_t NarrowSize = NarrowTy.getSizeInBits();
1543
1544 switch (MI.getOpcode()) {
1545 default:
1546 return UnableToLegalize;
1547 case TargetOpcode::G_IMPLICIT_DEF: {
1548 Register DstReg = MI.getOperand(i: 0).getReg();
1549 LLT DstTy = MRI.getType(Reg: DstReg);
1550
1551 // If SizeOp0 is not an exact multiple of NarrowSize, emit
1552 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1553 // FIXME: Although this would also be legal for the general case, it causes
1554 // a lot of regressions in the emitted code (superfluous COPYs, artifact
1555 // combines not being hit). This seems to be a problem related to the
1556 // artifact combiner.
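    // For example (a sketch; virtual register numbers are illustrative),
    // narrowing a scalar s96 G_IMPLICIT_DEF with NarrowTy = s64:
    //   %0:_(s96) = G_IMPLICIT_DEF
    // =>
    //   %1:_(s64) = G_IMPLICIT_DEF
    //   %0:_(s96) = G_ANYEXT %1:_(s64)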
1557 if (SizeOp0 % NarrowSize != 0) {
1558 LLT ImplicitTy = DstTy.changeElementType(NewEltTy: NarrowTy);
1559 Register ImplicitReg = MIRBuilder.buildUndef(Res: ImplicitTy).getReg(Idx: 0);
1560 MIRBuilder.buildAnyExt(Res: DstReg, Op: ImplicitReg);
1561
1562 MI.eraseFromParent();
1563 return Legalized;
1564 }
1565
1566 int NumParts = SizeOp0 / NarrowSize;
1567
1568 SmallVector<Register, 2> DstRegs;
1569 for (int i = 0; i < NumParts; ++i)
1570 DstRegs.push_back(Elt: MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0));
1571
1572 if (DstTy.isVector())
1573 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
1574 else
1575 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
1576 MI.eraseFromParent();
1577 return Legalized;
1578 }
1579 case TargetOpcode::G_CONSTANT: {
1580 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1581 const APInt &Val = MI.getOperand(i: 1).getCImm()->getValue();
1582 unsigned TotalSize = Ty.getSizeInBits();
1583 unsigned NarrowSize = NarrowTy.getSizeInBits();
1584 int NumParts = TotalSize / NarrowSize;
1585
1586 SmallVector<Register, 4> PartRegs;
1587 for (int I = 0; I != NumParts; ++I) {
1588 unsigned Offset = I * NarrowSize;
1589 auto K = MIRBuilder.buildConstant(Res: NarrowTy,
1590 Val: Val.lshr(shiftAmt: Offset).trunc(width: NarrowSize));
1591 PartRegs.push_back(Elt: K.getReg(Idx: 0));
1592 }
1593
1594 LLT LeftoverTy;
1595 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1596 SmallVector<Register, 1> LeftoverRegs;
1597 if (LeftoverBits != 0) {
1598 LeftoverTy = LLT::scalar(SizeInBits: LeftoverBits);
1599 auto K = MIRBuilder.buildConstant(
1600 Res: LeftoverTy,
1601 Val: Val.lshr(shiftAmt: NumParts * NarrowSize).trunc(width: LeftoverBits));
1602 LeftoverRegs.push_back(Elt: K.getReg(Idx: 0));
1603 }
1604
1605 insertParts(DstReg: MI.getOperand(i: 0).getReg(),
1606 ResultTy: Ty, PartTy: NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1607
1608 MI.eraseFromParent();
1609 return Legalized;
1610 }
1611 case TargetOpcode::G_SEXT:
1612 case TargetOpcode::G_ZEXT:
1613 case TargetOpcode::G_ANYEXT:
1614 return narrowScalarExt(MI, TypeIdx, Ty: NarrowTy);
1615 case TargetOpcode::G_TRUNC: {
1616 if (TypeIdx != 1)
1617 return UnableToLegalize;
1618
1619 uint64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1620 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1621 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1622 return UnableToLegalize;
1623 }
1624
1625 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
1626 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: Unmerge.getReg(Idx: 0));
1627 MI.eraseFromParent();
1628 return Legalized;
1629 }
1630 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1631 case TargetOpcode::G_FREEZE: {
1632 if (TypeIdx != 0)
1633 return UnableToLegalize;
1634
1635 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1636 // Should widen scalar first
1637 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1638 return UnableToLegalize;
1639
1640 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1).getReg());
1641 SmallVector<Register, 8> Parts;
1642 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1643 Parts.push_back(
1644 Elt: MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy}, SrcOps: {Unmerge.getReg(Idx: i)})
1645 .getReg(Idx: 0));
1646 }
1647
1648 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: Parts);
1649 MI.eraseFromParent();
1650 return Legalized;
1651 }
1652 case TargetOpcode::G_ADD:
1653 case TargetOpcode::G_SUB:
1654 case TargetOpcode::G_SADDO:
1655 case TargetOpcode::G_SSUBO:
1656 case TargetOpcode::G_SADDE:
1657 case TargetOpcode::G_SSUBE:
1658 case TargetOpcode::G_UADDO:
1659 case TargetOpcode::G_USUBO:
1660 case TargetOpcode::G_UADDE:
1661 case TargetOpcode::G_USUBE:
1662 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1663 case TargetOpcode::G_MUL:
1664 case TargetOpcode::G_UMULH:
1665 return narrowScalarMul(MI, Ty: NarrowTy);
1666 case TargetOpcode::G_EXTRACT:
1667 return narrowScalarExtract(MI, TypeIdx, Ty: NarrowTy);
1668 case TargetOpcode::G_INSERT:
1669 return narrowScalarInsert(MI, TypeIdx, Ty: NarrowTy);
1670 case TargetOpcode::G_LOAD: {
1671 auto &LoadMI = cast<GLoad>(Val&: MI);
1672 Register DstReg = LoadMI.getDstReg();
1673 LLT DstTy = MRI.getType(Reg: DstReg);
1674 if (DstTy.isVector())
1675 return UnableToLegalize;
1676
1677 if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1678 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1679 MIRBuilder.buildLoad(Res: TmpReg, Addr: LoadMI.getPointerReg(), MMO&: LoadMI.getMMO());
1680 MIRBuilder.buildAnyExt(Res: DstReg, Op: TmpReg);
1681 LoadMI.eraseFromParent();
1682 return Legalized;
1683 }
1684
1685 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx, NarrowTy);
1686 }
1687 case TargetOpcode::G_ZEXTLOAD:
1688 case TargetOpcode::G_SEXTLOAD: {
1689 auto &LoadMI = cast<GExtLoad>(Val&: MI);
1690 Register DstReg = LoadMI.getDstReg();
1691 Register PtrReg = LoadMI.getPointerReg();
1692
1693 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1694 auto &MMO = LoadMI.getMMO();
1695 unsigned MemSize = MMO.getSizeInBits().getValue();
1696
1697 if (MemSize == NarrowSize) {
1698 MIRBuilder.buildLoad(Res: TmpReg, Addr: PtrReg, MMO);
1699 } else if (MemSize < NarrowSize) {
1700 MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: TmpReg, Addr: PtrReg, MMO);
1701 } else if (MemSize > NarrowSize) {
1702 // FIXME: Need to split the load.
1703 return UnableToLegalize;
1704 }
1705
1706 if (isa<GZExtLoad>(Val: LoadMI))
1707 MIRBuilder.buildZExt(Res: DstReg, Op: TmpReg);
1708 else
1709 MIRBuilder.buildSExt(Res: DstReg, Op: TmpReg);
1710
1711 LoadMI.eraseFromParent();
1712 return Legalized;
1713 }
1714 case TargetOpcode::G_STORE: {
1715 auto &StoreMI = cast<GStore>(Val&: MI);
1716
1717 Register SrcReg = StoreMI.getValueReg();
1718 LLT SrcTy = MRI.getType(Reg: SrcReg);
1719 if (SrcTy.isVector())
1720 return UnableToLegalize;
1721
1722 int NumParts = SizeOp0 / NarrowSize;
1723 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1724 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1725 if (SrcTy.isVector() && LeftoverBits != 0)
1726 return UnableToLegalize;
1727
1728 if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1729 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1730 MIRBuilder.buildTrunc(Res: TmpReg, Op: SrcReg);
1731 MIRBuilder.buildStore(Val: TmpReg, Addr: StoreMI.getPointerReg(), MMO&: StoreMI.getMMO());
1732 StoreMI.eraseFromParent();
1733 return Legalized;
1734 }
1735
1736 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy);
1737 }
1738 case TargetOpcode::G_SELECT:
1739 return narrowScalarSelect(MI, TypeIdx, Ty: NarrowTy);
1740 case TargetOpcode::G_AND:
1741 case TargetOpcode::G_OR:
1742 case TargetOpcode::G_XOR: {
1743 // Legalize bitwise operation:
1744 // A = BinOp<Ty> B, C
1745 // into:
1746 // B1, ..., BN = G_UNMERGE_VALUES B
1747 // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
1749 // ...
1750 // AN = BinOp<Ty/N> BN, CN
1751 // A = G_MERGE_VALUES A1, ..., AN
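    // For example, narrowing a s64 G_AND with NarrowTy = s32 (a sketch;
    // register numbers are illustrative):
    //   %0:_(s64) = G_AND %1:_(s64), %2:_(s64)
    // =>
    //   %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %1:_(s64)
    //   %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %2:_(s64)
    //   %7:_(s32) = G_AND %3, %5
    //   %8:_(s32) = G_AND %4, %6
    //   %0:_(s64) = G_MERGE_VALUES %7:_(s32), %8:_(s32)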
1752 return narrowScalarBasic(MI, TypeIdx, Ty: NarrowTy);
1753 }
1754 case TargetOpcode::G_SHL:
1755 case TargetOpcode::G_LSHR:
1756 case TargetOpcode::G_ASHR:
1757 return narrowScalarShift(MI, TypeIdx, Ty: NarrowTy);
1758 case TargetOpcode::G_CTLZ:
1759 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1760 case TargetOpcode::G_CTTZ:
1761 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1762 case TargetOpcode::G_CTLS:
1763 case TargetOpcode::G_CTPOP:
1764 if (TypeIdx == 1)
1765 switch (MI.getOpcode()) {
1766 case TargetOpcode::G_CTLZ:
1767 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1768 return narrowScalarCTLZ(MI, TypeIdx, Ty: NarrowTy);
1769 case TargetOpcode::G_CTTZ:
1770 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1771 return narrowScalarCTTZ(MI, TypeIdx, Ty: NarrowTy);
1772 case TargetOpcode::G_CTPOP:
1773 return narrowScalarCTPOP(MI, TypeIdx, Ty: NarrowTy);
1774 case TargetOpcode::G_CTLS:
1775 return narrowScalarCTLS(MI, TypeIdx, Ty: NarrowTy);
1776 default:
1777 return UnableToLegalize;
1778 }
1779
1780 Observer.changingInstr(MI);
1781 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1782 Observer.changedInstr(MI);
1783 return Legalized;
1784 case TargetOpcode::G_INTTOPTR:
1785 if (TypeIdx != 1)
1786 return UnableToLegalize;
1787
1788 Observer.changingInstr(MI);
1789 narrowScalarSrc(MI, NarrowTy, OpIdx: 1);
1790 Observer.changedInstr(MI);
1791 return Legalized;
1792 case TargetOpcode::G_PTRTOINT:
1793 if (TypeIdx != 0)
1794 return UnableToLegalize;
1795
1796 Observer.changingInstr(MI);
1797 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1798 Observer.changedInstr(MI);
1799 return Legalized;
1800 case TargetOpcode::G_PHI: {
1801 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1802 // NarrowSize.
1803 if (SizeOp0 % NarrowSize != 0)
1804 return UnableToLegalize;
1805
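    // Split each PHI input in its predecessor block (just before the
    // terminators), emit one narrow G_PHI per part, and remerge the parts
    // after the PHIs in the current block.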
1806 unsigned NumParts = SizeOp0 / NarrowSize;
1807 SmallVector<Register, 2> DstRegs(NumParts);
1808 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1809 Observer.changingInstr(MI);
1810 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1811 MachineBasicBlock &OpMBB = *MI.getOperand(i: i + 1).getMBB();
1812 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
1813 extractParts(Reg: MI.getOperand(i).getReg(), Ty: NarrowTy, NumParts,
1814 VRegs&: SrcRegs[i / 2], MIRBuilder, MRI);
1815 }
1816 MachineBasicBlock &MBB = *MI.getParent();
1817 MIRBuilder.setInsertPt(MBB, II: MI);
1818 for (unsigned i = 0; i < NumParts; ++i) {
1819 DstRegs[i] = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1820 MachineInstrBuilder MIB =
1821 MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI).addDef(RegNo: DstRegs[i]);
1822 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1823 MIB.addUse(RegNo: SrcRegs[j / 2][i]).add(MO: MI.getOperand(i: j + 1));
1824 }
1825 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
1826 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
1827 Observer.changedInstr(MI);
1828 MI.eraseFromParent();
1829 return Legalized;
1830 }
1831 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1832 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1833 if (TypeIdx != 2)
1834 return UnableToLegalize;
1835
1836 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1837 Observer.changingInstr(MI);
1838 narrowScalarSrc(MI, NarrowTy, OpIdx);
1839 Observer.changedInstr(MI);
1840 return Legalized;
1841 }
1842 case TargetOpcode::G_ICMP: {
1843 Register LHS = MI.getOperand(i: 2).getReg();
1844 LLT SrcTy = MRI.getType(Reg: LHS);
1845 CmpInst::Predicate Pred =
1846 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
1847
1848 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1849 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1850 if (!extractParts(Reg: LHS, RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy, VRegs&: LHSPartRegs,
1851 LeftoverVRegs&: LHSLeftoverRegs, MIRBuilder, MRI))
1852 return UnableToLegalize;
1853
1854 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1855 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1856 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy&: Unused,
1857 VRegs&: RHSPartRegs, LeftoverVRegs&: RHSLeftoverRegs, MIRBuilder, MRI))
1858 return UnableToLegalize;
1859
1860 // We now have the LHS and RHS of the compare split into narrow-type
1861 // registers, plus potentially some leftover type.
1862 Register Dst = MI.getOperand(i: 0).getReg();
1863 LLT ResTy = MRI.getType(Reg: Dst);
1864 if (ICmpInst::isEquality(P: Pred)) {
1865 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1866 // them together. For each equal part, the result should be all 0s. For
1867 // each non-equal part, we'll get at least one 1.
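      // For example, an s64 equality compare split with NarrowTy = s32 reduces
      // to (a sketch; register names are illustrative):
      //   %xlo:_(s32) = G_XOR %lhs_lo, %rhs_lo
      //   %xhi:_(s32) = G_XOR %lhs_hi, %rhs_hi
      //   %or:_(s32) = G_OR %xlo, %xhi
      //   %zero:_(s32) = G_CONSTANT i32 0
      //   %dst:_(s1) = G_ICMP intpred(eq), %or, %zero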
1868 auto Zero = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0);
1869 SmallVector<Register, 4> Xors;
1870 for (auto LHSAndRHS : zip(t&: LHSPartRegs, u&: RHSPartRegs)) {
1871 auto LHS = std::get<0>(t&: LHSAndRHS);
1872 auto RHS = std::get<1>(t&: LHSAndRHS);
1873 auto Xor = MIRBuilder.buildXor(Dst: NarrowTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1874 Xors.push_back(Elt: Xor);
1875 }
1876
1877 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1878 // to the desired narrow type so that we can OR them together later.
1879 SmallVector<Register, 4> WidenedXors;
1880 for (auto LHSAndRHS : zip(t&: LHSLeftoverRegs, u&: RHSLeftoverRegs)) {
1881 auto LHS = std::get<0>(t&: LHSAndRHS);
1882 auto RHS = std::get<1>(t&: LHSAndRHS);
1883 auto Xor = MIRBuilder.buildXor(Dst: LeftoverTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1884 LLT GCDTy = extractGCDType(Parts&: WidenedXors, DstTy: NarrowTy, NarrowTy: LeftoverTy, SrcReg: Xor);
1885 buildLCMMergePieces(DstTy: LeftoverTy, NarrowTy, GCDTy, VRegs&: WidenedXors,
1886 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1887 llvm::append_range(C&: Xors, R&: WidenedXors);
1888 }
1889
      // Now, for each part we broke up, we know if they are equal/not equal
      // based on the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
1893 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1894 auto Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Xors[0], Src1: Xors[1]);
1895 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1896 Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Or, Src1: Xors[I]);
1897 MIRBuilder.buildICmp(Pred, Res: Dst, Op0: Or, Op1: Zero);
1898 } else {
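      // Non-equality predicate: compare the pieces from least to most
      // significant, carrying the running result in CmpIn. Each more
      // significant piece decides the compare with an unsigned predicate,
      // unless it is equal, in which case a G_SELECT forwards the verdict
      // from the lower pieces. Only the most significant piece uses the
      // original predicate.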
1899 Register CmpIn;
1900 for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1901 Register CmpOut;
1902 CmpInst::Predicate PartPred;
1903
1904 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1905 PartPred = Pred;
1906 CmpOut = Dst;
1907 } else {
1908 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1909 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1910 }
1911
1912 if (!CmpIn) {
1913 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSPartRegs[I],
1914 Op1: RHSPartRegs[I]);
1915 } else {
1916 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSPartRegs[I],
1917 Op1: RHSPartRegs[I]);
1918 auto CmpEq = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1919 Op0: LHSPartRegs[I], Op1: RHSPartRegs[I]);
1920 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1921 }
1922
1923 CmpIn = CmpOut;
1924 }
1925
1926 for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1927 Register CmpOut;
1928 CmpInst::Predicate PartPred;
1929
1930 if (I == E - 1) {
1931 PartPred = Pred;
1932 CmpOut = Dst;
1933 } else {
1934 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1935 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1936 }
1937
1938 if (!CmpIn) {
1939 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSLeftoverRegs[I],
1940 Op1: RHSLeftoverRegs[I]);
1941 } else {
1942 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSLeftoverRegs[I],
1943 Op1: RHSLeftoverRegs[I]);
1944 auto CmpEq =
1945 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1946 Op0: LHSLeftoverRegs[I], Op1: RHSLeftoverRegs[I]);
1947 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1948 }
1949
1950 CmpIn = CmpOut;
1951 }
1952 }
1953 MI.eraseFromParent();
1954 return Legalized;
1955 }
1956 case TargetOpcode::G_FCMP:
1957 if (TypeIdx != 0)
1958 return UnableToLegalize;
1959
1960 Observer.changingInstr(MI);
1961 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1962 Observer.changedInstr(MI);
1963 return Legalized;
1964
1965 case TargetOpcode::G_SEXT_INREG: {
1966 if (TypeIdx != 0)
1967 return UnableToLegalize;
1968
1969 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
1970
    // So long as the new type has more bits than the bits we're extending, we
    // don't need to break it apart.
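    // For example, with NarrowTy = s32 (a sketch; register numbers are
    // illustrative):
    //   %0:_(s64) = G_SEXT_INREG %1:_(s64), 8
    // =>
    //   %2:_(s32) = G_TRUNC %1:_(s64)
    //   %3:_(s32) = G_SEXT_INREG %2:_(s32), 8
    //   %0:_(s64) = G_SEXT %3:_(s32)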
1973 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1974 Observer.changingInstr(MI);
1975 // We don't lose any non-extension bits by truncating the src and
1976 // sign-extending the dst.
1977 MachineOperand &MO1 = MI.getOperand(i: 1);
1978 auto TruncMIB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO1);
1979 MO1.setReg(TruncMIB.getReg(Idx: 0));
1980
1981 MachineOperand &MO2 = MI.getOperand(i: 0);
1982 Register DstExt = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1983 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1984 MIRBuilder.buildSExt(Res: MO2, Op: DstExt);
1985 MO2.setReg(DstExt);
1986 Observer.changedInstr(MI);
1987 return Legalized;
1988 }
1989
1990 // Break it apart. Components below the extension point are unmodified. The
1991 // component containing the extension point becomes a narrower SEXT_INREG.
1992 // Components above it are ashr'd from the component containing the
1993 // extension point.
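    // For example, %0:_(s96) = G_SEXT_INREG %1:_(s96), 40 with NarrowTy = s32
    // becomes (a sketch; register numbers are illustrative):
    //   %2:_(s32), %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %1:_(s96)
    //   %5:_(s32) = G_SEXT_INREG %3:_(s32), 8 ; 40 % 32 == 8
    //   %6:_(s32) = G_ASHR %5:_(s32), 31      ; replicate the sign bit
    //   %0:_(s96) = G_MERGE_VALUES %2:_(s32), %5:_(s32), %6:_(s32)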
1994 if (SizeOp0 % NarrowSize != 0)
1995 return UnableToLegalize;
1996 int NumParts = SizeOp0 / NarrowSize;
1997
1998 // List the registers where the destination will be scattered.
1999 SmallVector<Register, 2> DstRegs;
2000 // List the registers where the source will be split.
2001 SmallVector<Register, 2> SrcRegs;
2002
2003 // Create all the temporary registers.
2004 for (int i = 0; i < NumParts; ++i) {
2005 Register SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2006
2007 SrcRegs.push_back(Elt: SrcReg);
2008 }
2009
2010 // Explode the big arguments into smaller chunks.
2011 MIRBuilder.buildUnmerge(Res: SrcRegs, Op: MI.getOperand(i: 1));
2012
2013 Register AshrCstReg =
2014 MIRBuilder.buildConstant(Res: NarrowTy, Val: NarrowTy.getScalarSizeInBits() - 1)
2015 .getReg(Idx: 0);
2016 Register FullExtensionReg;
2017 Register PartialExtensionReg;
2018
2019 // Do the operation on each small part.
2020 for (int i = 0; i < NumParts; ++i) {
2021 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
2022 DstRegs.push_back(Elt: SrcRegs[i]);
2023 PartialExtensionReg = DstRegs.back();
2024 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
2025 assert(PartialExtensionReg &&
2026 "Expected to visit partial extension before full");
2027 if (FullExtensionReg) {
2028 DstRegs.push_back(Elt: FullExtensionReg);
2029 continue;
2030 }
2031 DstRegs.push_back(
2032 Elt: MIRBuilder.buildAShr(Dst: NarrowTy, Src0: PartialExtensionReg, Src1: AshrCstReg)
2033 .getReg(Idx: 0));
2034 FullExtensionReg = DstRegs.back();
2035 } else {
2036 DstRegs.push_back(
2037 Elt: MIRBuilder
2038 .buildInstr(
2039 Opc: TargetOpcode::G_SEXT_INREG, DstOps: {NarrowTy},
2040 SrcOps: {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
2041 .getReg(Idx: 0));
2042 PartialExtensionReg = DstRegs.back();
2043 }
2044 }
2045
2046 // Gather the destination registers into the final destination.
2047 Register DstReg = MI.getOperand(i: 0).getReg();
2048 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
2049 MI.eraseFromParent();
2050 return Legalized;
2051 }
2052 case TargetOpcode::G_BSWAP:
2053 case TargetOpcode::G_BITREVERSE: {
2054 if (SizeOp0 % NarrowSize != 0)
2055 return UnableToLegalize;
2056
2057 Observer.changingInstr(MI);
2058 SmallVector<Register, 2> SrcRegs, DstRegs;
2059 unsigned NumParts = SizeOp0 / NarrowSize;
2060 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
2061 MIRBuilder, MRI);
2062
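    // The narrowed pieces are emitted in reverse order. For example, a s64
    // G_BSWAP with NarrowTy = s32 becomes (a sketch; register names are
    // illustrative):
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src:_(s64)
    //   %a:_(s32) = G_BSWAP %hi
    //   %b:_(s32) = G_BSWAP %lo
    //   %dst:_(s64) = G_MERGE_VALUES %a:_(s32), %b:_(s32)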
2063 for (unsigned i = 0; i < NumParts; ++i) {
2064 auto DstPart = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
2065 SrcOps: {SrcRegs[NumParts - 1 - i]});
2066 DstRegs.push_back(Elt: DstPart.getReg(Idx: 0));
2067 }
2068
2069 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
2070
2071 Observer.changedInstr(MI);
2072 MI.eraseFromParent();
2073 return Legalized;
2074 }
2075 case TargetOpcode::G_PTR_ADD:
2076 case TargetOpcode::G_PTRMASK: {
2077 if (TypeIdx != 1)
2078 return UnableToLegalize;
2079 Observer.changingInstr(MI);
2080 narrowScalarSrc(MI, NarrowTy, OpIdx: 2);
2081 Observer.changedInstr(MI);
2082 return Legalized;
2083 }
2084 case TargetOpcode::G_FPTOUI:
2085 case TargetOpcode::G_FPTOSI:
2086 case TargetOpcode::G_FPTOUI_SAT:
2087 case TargetOpcode::G_FPTOSI_SAT:
2088 return narrowScalarFPTOI(MI, TypeIdx, Ty: NarrowTy);
2089 case TargetOpcode::G_FPEXT:
2090 if (TypeIdx != 0)
2091 return UnableToLegalize;
2092 Observer.changingInstr(MI);
2093 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_FPEXT);
2094 Observer.changedInstr(MI);
2095 return Legalized;
2096 case TargetOpcode::G_FLDEXP:
2097 case TargetOpcode::G_STRICT_FLDEXP:
2098 return narrowScalarFLDEXP(MI, TypeIdx, Ty: NarrowTy);
2099 case TargetOpcode::G_VSCALE: {
2100 Register Dst = MI.getOperand(i: 0).getReg();
2101 LLT Ty = MRI.getType(Reg: Dst);
2102
    // Assume VSCALE(1) fits into a legal integer.
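    // For example, narrowing %0:_(s64) = G_VSCALE 3 with NarrowTy = s32
    // (a sketch; register numbers are illustrative):
    //   %1:_(s32) = G_VSCALE 1
    //   %2:_(s64) = G_ZEXT %1:_(s32)
    //   %3:_(s64) = G_CONSTANT i64 3
    //   %0:_(s64) = G_MUL %2:_(s64), %3:_(s64)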
2104 const APInt One(NarrowTy.getSizeInBits(), 1);
2105 auto VScaleBase = MIRBuilder.buildVScale(Res: NarrowTy, MinElts: One);
2106 auto ZExt = MIRBuilder.buildZExt(Res: Ty, Op: VScaleBase);
2107 auto C = MIRBuilder.buildConstant(Res: Ty, Val: *MI.getOperand(i: 1).getCImm());
2108 MIRBuilder.buildMul(Dst, Src0: ZExt, Src1: C);
2109
2110 MI.eraseFromParent();
2111 return Legalized;
2112 }
2113 }
2114}
2115
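/// Coerce \p Val to a scalar of the same total bit width: pointers are cast
/// with G_PTRTOINT and vectors are bitcast (pointer elements are first cast
/// to integers). Returns an invalid register for pointers in non-integral
/// address spaces.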
2116Register LegalizerHelper::coerceToScalar(Register Val) {
2117 LLT Ty = MRI.getType(Reg: Val);
2118 if (Ty.isScalar())
2119 return Val;
2120
2121 const DataLayout &DL = MIRBuilder.getDataLayout();
2122 LLT NewTy = LLT::scalar(SizeInBits: Ty.getSizeInBits());
2123 if (Ty.isPointer()) {
2124 if (DL.isNonIntegralAddressSpace(AddrSpace: Ty.getAddressSpace()))
2125 return Register();
2126 return MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Val).getReg(Idx: 0);
2127 }
2128
2129 Register NewVal = Val;
2130
2131 assert(Ty.isVector());
2132 if (Ty.isPointerVector())
2133 NewVal = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2134 return MIRBuilder.buildBitcast(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2135}
2136
2137void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2138 unsigned OpIdx, unsigned ExtOpcode) {
2139 MachineOperand &MO = MI.getOperand(i: OpIdx);
2140 auto ExtB = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MO});
2141 MO.setReg(ExtB.getReg(Idx: 0));
2142}
2143
2144void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2145 unsigned OpIdx) {
2146 MachineOperand &MO = MI.getOperand(i: OpIdx);
2147 auto ExtB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO);
2148 MO.setReg(ExtB.getReg(Idx: 0));
2149}
2150
2151void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2152 unsigned OpIdx, unsigned TruncOpcode) {
2153 MachineOperand &MO = MI.getOperand(i: OpIdx);
2154 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2155 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2156 MIRBuilder.buildInstr(Opc: TruncOpcode, DstOps: {MO}, SrcOps: {DstExt});
2157 MO.setReg(DstExt);
2158}
2159
2160void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2161 unsigned OpIdx, unsigned ExtOpcode) {
2162 MachineOperand &MO = MI.getOperand(i: OpIdx);
2163 Register DstTrunc = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2164 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2165 MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {MO}, SrcOps: {DstTrunc});
2166 MO.setReg(DstTrunc);
2167}
2168
2169void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2170 unsigned OpIdx) {
2171 MachineOperand &MO = MI.getOperand(i: OpIdx);
2172 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2173 Register Dst = MO.getReg();
2174 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2175 MO.setReg(DstExt);
2176 MIRBuilder.buildDeleteTrailingVectorElements(Res: Dst, Op0: DstExt);
2177}
2178
2179void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2180 unsigned OpIdx) {
2181 MachineOperand &MO = MI.getOperand(i: OpIdx);
2182 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO).getReg(Idx: 0));
2183}
2184
2185void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2186 MachineOperand &Op = MI.getOperand(i: OpIdx);
2187 Op.setReg(MIRBuilder.buildBitcast(Dst: CastTy, Src: Op).getReg(Idx: 0));
2188}
2189
2190void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2191 MachineOperand &MO = MI.getOperand(i: OpIdx);
2192 Register CastDst = MRI.createGenericVirtualRegister(Ty: CastTy);
2193 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2194 MIRBuilder.buildBitcast(Dst: MO, Src: CastDst);
2195 MO.setReg(CastDst);
2196}
2197
2198LegalizerHelper::LegalizeResult
2199LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
2200 LLT WideTy) {
2201 if (TypeIdx != 1)
2202 return UnableToLegalize;
2203
2204 auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
2205 if (DstTy.isVector())
2206 return UnableToLegalize;
2207
2208 LLT SrcTy = MRI.getType(Reg: Src1Reg);
2209 const int DstSize = DstTy.getSizeInBits();
2210 const int SrcSize = SrcTy.getSizeInBits();
2211 const int WideSize = WideTy.getSizeInBits();
2212 const int NumMerge = (DstSize + WideSize - 1) / WideSize;
2213
2214 unsigned NumOps = MI.getNumOperands();
2215 unsigned NumSrc = MI.getNumOperands() - 1;
2216 unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
2217
2218 if (WideSize >= DstSize) {
2219 // Directly pack the bits in the target type.
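    // For example, %0:_(s32) = G_MERGE_VALUES %1:_(s16), %2:_(s16) widened to
    // s64 becomes (a sketch; register numbers are illustrative):
    //   %3:_(s64) = G_ZEXT %1:_(s16)
    //   %4:_(s64) = G_ZEXT %2:_(s16)
    //   %5:_(s64) = G_CONSTANT i64 16
    //   %6:_(s64) = G_SHL %4:_(s64), %5:_(s64)
    //   %7:_(s64) = G_OR %3:_(s64), %6:_(s64)
    //   %0:_(s32) = G_TRUNC %7:_(s64)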
2220 Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src1Reg).getReg(Idx: 0);
2221
2222 for (unsigned I = 2; I != NumOps; ++I) {
2223 const unsigned Offset = (I - 1) * PartSize;
2224
2225 Register SrcReg = MI.getOperand(i: I).getReg();
2226 assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
2227
2228 auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);
2229
2230 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
2231 MRI.createGenericVirtualRegister(Ty: WideTy);
2232
2233 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
2234 auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
2235 MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
2236 ResultReg = NextResult;
2237 }
2238
2239 if (WideSize > DstSize)
2240 MIRBuilder.buildTrunc(Res: DstReg, Op: ResultReg);
2241 else if (DstTy.isPointer())
2242 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
2243
2244 MI.eraseFromParent();
2245 return Legalized;
2246 }
2247
2248 // Unmerge the original values to the GCD type, and recombine to the next
2249 // multiple greater than the original type.
2250 //
2251 // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
2252 // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
2253 // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
2254 // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
2255 // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
2256 // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
2257 // %12:_(s12) = G_MERGE_VALUES %10, %11
2258 //
2259 // Padding with undef if necessary:
2260 //
2261 // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
2262 // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
2263 // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
2264 // %7:_(s2) = G_IMPLICIT_DEF
2265 // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
2266 // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
2267 // %10:_(s12) = G_MERGE_VALUES %8, %9
2268
2269 const int GCD = std::gcd(m: SrcSize, n: WideSize);
2270 LLT GCDTy = LLT::scalar(SizeInBits: GCD);
2271
2272 SmallVector<Register, 8> NewMergeRegs;
2273 SmallVector<Register, 8> Unmerges;
2274 LLT WideDstTy = LLT::scalar(SizeInBits: NumMerge * WideSize);
2275
2276 // Decompose the original operands if they don't evenly divide.
2277 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
2278 Register SrcReg = MO.getReg();
2279 if (GCD == SrcSize) {
2280 Unmerges.push_back(Elt: SrcReg);
2281 } else {
2282 auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
2283 for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
2284 Unmerges.push_back(Elt: Unmerge.getReg(Idx: J));
2285 }
2286 }
2287
2288 // Pad with undef to the next size that is a multiple of the requested size.
2289 if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
2290 Register UndefReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
2291 for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
2292 Unmerges.push_back(Elt: UndefReg);
2293 }
2294
2295 const int PartsPerGCD = WideSize / GCD;
2296
2297 // Build merges of each piece.
2298 ArrayRef<Register> Slicer(Unmerges);
2299 for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(N: PartsPerGCD)) {
2300 auto Merge =
2301 MIRBuilder.buildMergeLikeInstr(Res: WideTy, Ops: Slicer.take_front(N: PartsPerGCD));
2302 NewMergeRegs.push_back(Elt: Merge.getReg(Idx: 0));
2303 }
2304
2305 // A truncate may be necessary if the requested type doesn't evenly divide the
2306 // original result type.
2307 if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
2308 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NewMergeRegs);
2309 } else {
2310 auto FinalMerge = MIRBuilder.buildMergeLikeInstr(Res: WideDstTy, Ops: NewMergeRegs);
2311 MIRBuilder.buildTrunc(Res: DstReg, Op: FinalMerge.getReg(Idx: 0));
2312 }
2313
2314 MI.eraseFromParent();
2315 return Legalized;
2316}
2317
2318LegalizerHelper::LegalizeResult
2319LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
2320 LLT WideTy) {
2321 if (TypeIdx != 0)
2322 return UnableToLegalize;
2323
2324 int NumDst = MI.getNumOperands() - 1;
2325 Register SrcReg = MI.getOperand(i: NumDst).getReg();
2326 LLT SrcTy = MRI.getType(Reg: SrcReg);
2327 if (SrcTy.isVector())
2328 return UnableToLegalize;
2329
2330 Register Dst0Reg = MI.getOperand(i: 0).getReg();
2331 LLT DstTy = MRI.getType(Reg: Dst0Reg);
2332 if (!DstTy.isScalar())
2333 return UnableToLegalize;
2334
2335 if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
2336 if (SrcTy.isPointer()) {
2337 const DataLayout &DL = MIRBuilder.getDataLayout();
2338 if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace())) {
2339 LLVM_DEBUG(
2340 dbgs() << "Not casting non-integral address space integer\n");
2341 return UnableToLegalize;
2342 }
2343
2344 SrcTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
2345 SrcReg = MIRBuilder.buildPtrToInt(Dst: SrcTy, Src: SrcReg).getReg(Idx: 0);
2346 }
2347
2348 // Widen SrcTy to WideTy. This does not affect the result, but since the
2349 // user requested this size, it is probably better handled than SrcTy and
2350 // should reduce the total number of legalization artifacts.
2351 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2352 SrcTy = WideTy;
2353 SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
2354 }
2355
    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
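    // For example, %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0:_(s64) with
    // WideTy = s64 becomes (a sketch; register numbers are illustrative):
    //   %1:_(s32) = G_TRUNC %0:_(s64)
    //   %3:_(s64) = G_CONSTANT i64 32
    //   %4:_(s64) = G_LSHR %0:_(s64), %3:_(s64)
    //   %2:_(s32) = G_TRUNC %4:_(s64)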
2358 unsigned DstSize = DstTy.getSizeInBits();
2359
2360 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
2361 for (int I = 1; I != NumDst; ++I) {
2362 auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: DstSize * I);
2363 auto Shr = MIRBuilder.buildLShr(Dst: SrcTy, Src0: SrcReg, Src1: ShiftAmt);
2364 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shr);
2365 }
2366
2367 MI.eraseFromParent();
2368 return Legalized;
2369 }
2370
2371 // Extend the source to a wider type.
2372 LLT LCMTy = getLCMType(OrigTy: SrcTy, TargetTy: WideTy);
2373
2374 Register WideSrc = SrcReg;
2375 if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
2376 // TODO: If this is an integral address space, cast to integer and anyext.
2377 if (SrcTy.isPointer()) {
2378 LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
2379 return UnableToLegalize;
2380 }
2381
2382 WideSrc = MIRBuilder.buildAnyExt(Res: LCMTy, Op: WideSrc).getReg(Idx: 0);
2383 }
2384
2385 auto Unmerge = MIRBuilder.buildUnmerge(Res: WideTy, Op: WideSrc);
2386
2387 // Create a sequence of unmerges and merges to the original results. Since we
2388 // may have widened the source, we will need to pad the results with dead defs
2389 // to cover the source register.
2390 // e.g. widen s48 to s64:
2391 // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
2392 //
2393 // =>
2394 // %4:_(s192) = G_ANYEXT %0:_(s96)
2395 // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
2396 // ; unpack to GCD type, with extra dead defs
2397 // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
2398 // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  // dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
2400 // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination
2401 // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
2402 const LLT GCDTy = getGCDType(OrigTy: WideTy, TargetTy: DstTy);
2403 const int NumUnmerge = Unmerge->getNumOperands() - 1;
2404 const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
2405
  // Directly unmerge to the destination without going through a GCD type
  // if possible.
2408 if (PartsPerRemerge == 1) {
2409 const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
2410
2411 for (int I = 0; I != NumUnmerge; ++I) {
2412 auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);
2413
2414 for (int J = 0; J != PartsPerUnmerge; ++J) {
2415 int Idx = I * PartsPerUnmerge + J;
2416 if (Idx < NumDst)
2417 MIB.addDef(RegNo: MI.getOperand(i: Idx).getReg());
2418 else {
2419 // Create dead def for excess components.
2420 MIB.addDef(RegNo: MRI.createGenericVirtualRegister(Ty: DstTy));
2421 }
2422 }
2423
2424 MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
2425 }
2426 } else {
2427 SmallVector<Register, 16> Parts;
2428 for (int J = 0; J != NumUnmerge; ++J)
2429 extractGCDType(Parts, GCDTy, SrcReg: Unmerge.getReg(Idx: J));
2430
2431 SmallVector<Register, 8> RemergeParts;
2432 for (int I = 0; I != NumDst; ++I) {
2433 for (int J = 0; J < PartsPerRemerge; ++J) {
2434 const int Idx = I * PartsPerRemerge + J;
2435 RemergeParts.emplace_back(Args&: Parts[Idx]);
2436 }
2437
2438 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: I).getReg(), Ops: RemergeParts);
2439 RemergeParts.clear();
2440 }
2441 }
2442
2443 MI.eraseFromParent();
2444 return Legalized;
2445}
2446
2447LegalizerHelper::LegalizeResult
2448LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
2449 LLT WideTy) {
2450 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
2451 unsigned Offset = MI.getOperand(i: 2).getImm();
2452
2453 if (TypeIdx == 0) {
2454 if (SrcTy.isVector() || DstTy.isVector())
2455 return UnableToLegalize;
2456
2457 SrcOp Src(SrcReg);
2458 if (SrcTy.isPointer()) {
2459 // Extracts from pointers can be handled only if they are really just
2460 // simple integers.
2461 const DataLayout &DL = MIRBuilder.getDataLayout();
2462 if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace()))
2463 return UnableToLegalize;
2464
2465 LLT SrcAsIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
2466 Src = MIRBuilder.buildPtrToInt(Dst: SrcAsIntTy, Src);
2467 SrcTy = SrcAsIntTy;
2468 }
2469
2470 if (DstTy.isPointer())
2471 return UnableToLegalize;
2472
2473 if (Offset == 0) {
2474 // Avoid a shift in the degenerate case.
2475 MIRBuilder.buildTrunc(Res: DstReg,
2476 Op: MIRBuilder.buildAnyExtOrTrunc(Res: WideTy, Op: Src));
2477 MI.eraseFromParent();
2478 return Legalized;
2479 }
2480
2481 // Do a shift in the source type.
2482 LLT ShiftTy = SrcTy;
2483 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2484 Src = MIRBuilder.buildAnyExt(Res: WideTy, Op: Src);
2485 ShiftTy = WideTy;
2486 }
2487
2488 auto LShr = MIRBuilder.buildLShr(
2489 Dst: ShiftTy, Src0: Src, Src1: MIRBuilder.buildConstant(Res: ShiftTy, Val: Offset));
2490 MIRBuilder.buildTrunc(Res: DstReg, Op: LShr);
2491 MI.eraseFromParent();
2492 return Legalized;
2493 }
2494
2495 if (SrcTy.isScalar()) {
2496 Observer.changingInstr(MI);
2497 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2498 Observer.changedInstr(MI);
2499 return Legalized;
2500 }
2501
2502 if (!SrcTy.isVector())
2503 return UnableToLegalize;
2504
2505 if (DstTy != SrcTy.getElementType())
2506 return UnableToLegalize;
2507
2508 if (Offset % SrcTy.getScalarSizeInBits() != 0)
2509 return UnableToLegalize;
2510
2511 Observer.changingInstr(MI);
2512 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2513
2514 MI.getOperand(i: 2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
2515 Offset);
2516 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0);
2517 Observer.changedInstr(MI);
2518 return Legalized;
2519}
2520
2521LegalizerHelper::LegalizeResult
2522LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2523 LLT WideTy) {
2524 if (TypeIdx != 0 || WideTy.isVector())
2525 return UnableToLegalize;
2526 Observer.changingInstr(MI);
2527 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2528 widenScalarDst(MI, WideTy);
2529 Observer.changedInstr(MI);
2530 return Legalized;
2531}
2532
2533LegalizerHelper::LegalizeResult
2534LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2535 LLT WideTy) {
2536 unsigned Opcode;
2537 unsigned ExtOpcode;
2538 std::optional<Register> CarryIn;
2539 switch (MI.getOpcode()) {
2540 default:
2541 llvm_unreachable("Unexpected opcode!");
2542 case TargetOpcode::G_SADDO:
2543 Opcode = TargetOpcode::G_ADD;
2544 ExtOpcode = TargetOpcode::G_SEXT;
2545 break;
2546 case TargetOpcode::G_SSUBO:
2547 Opcode = TargetOpcode::G_SUB;
2548 ExtOpcode = TargetOpcode::G_SEXT;
2549 break;
2550 case TargetOpcode::G_UADDO:
2551 Opcode = TargetOpcode::G_ADD;
2552 ExtOpcode = TargetOpcode::G_ZEXT;
2553 break;
2554 case TargetOpcode::G_USUBO:
2555 Opcode = TargetOpcode::G_SUB;
2556 ExtOpcode = TargetOpcode::G_ZEXT;
2557 break;
2558 case TargetOpcode::G_SADDE:
2559 Opcode = TargetOpcode::G_UADDE;
2560 ExtOpcode = TargetOpcode::G_SEXT;
2561 CarryIn = MI.getOperand(i: 4).getReg();
2562 break;
2563 case TargetOpcode::G_SSUBE:
2564 Opcode = TargetOpcode::G_USUBE;
2565 ExtOpcode = TargetOpcode::G_SEXT;
2566 CarryIn = MI.getOperand(i: 4).getReg();
2567 break;
2568 case TargetOpcode::G_UADDE:
2569 Opcode = TargetOpcode::G_UADDE;
2570 ExtOpcode = TargetOpcode::G_ZEXT;
2571 CarryIn = MI.getOperand(i: 4).getReg();
2572 break;
2573 case TargetOpcode::G_USUBE:
2574 Opcode = TargetOpcode::G_USUBE;
2575 ExtOpcode = TargetOpcode::G_ZEXT;
2576 CarryIn = MI.getOperand(i: 4).getReg();
2577 break;
2578 }
2579
2580 if (TypeIdx == 1) {
2581 unsigned BoolExtOp = MIRBuilder.getBoolExtOp(IsVec: WideTy.isVector(), IsFP: false);
2582
2583 Observer.changingInstr(MI);
2584 if (CarryIn)
2585 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: BoolExtOp);
2586 widenScalarDst(MI, WideTy, OpIdx: 1);
2587
2588 Observer.changedInstr(MI);
2589 return Legalized;
2590 }
2591
2592 auto LHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
2593 auto RHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 3)});
2594 // Do the arithmetic in the larger type.
2595 Register NewOp;
2596 if (CarryIn) {
2597 LLT CarryOutTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
2598 NewOp = MIRBuilder
2599 .buildInstr(Opc: Opcode, DstOps: {WideTy, CarryOutTy},
2600 SrcOps: {LHSExt, RHSExt, *CarryIn})
2601 .getReg(Idx: 0);
2602 } else {
2603 NewOp = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {WideTy}, SrcOps: {LHSExt, RHSExt}).getReg(Idx: 0);
2604 }
2605 LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
2606 auto TruncOp = MIRBuilder.buildTrunc(Res: OrigTy, Op: NewOp);
2607 auto ExtOp = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {TruncOp});
2608 // There is no overflow if the ExtOp is the same as NewOp.
2609 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 1), Op0: NewOp, Op1: ExtOp);
2610 // Now trunc the NewOp to the original result.
2611 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0), Op: NewOp);
2612 MI.eraseFromParent();
2613 return Legalized;
2614}
2615
2616LegalizerHelper::LegalizeResult
2617LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2618 LLT WideTy) {
2619 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2620 MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2621 MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2622 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2623 MI.getOpcode() == TargetOpcode::G_USHLSAT;
2624 // We can convert this to:
2625 // 1. Any extend iN to iM
2626 // 2. SHL by M-N
2627 // 3. [US][ADD|SUB|SHL]SAT
2628 // 4. L/ASHR by M-N
2629 //
2630 // It may be more efficient to lower this to a min and a max operation in
2631 // the higher precision arithmetic if the promoted operation isn't legal,
2632 // but this decision is up to the target's lowering request.
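  // For example, an s8 G_SADDSAT widened to s32 becomes (a sketch; register
  // names are illustrative):
  //   %lhs:_(s32) = G_ANYEXT %0:_(s8)
  //   %rhs:_(s32) = G_ANYEXT %1:_(s8)
  //   %k:_(s32) = G_CONSTANT i32 24
  //   %a:_(s32) = G_SHL %lhs, %k
  //   %b:_(s32) = G_SHL %rhs, %k
  //   %sat:_(s32) = G_SADDSAT %a, %b
  //   %shr:_(s32) = G_ASHR %sat, %k
  //   %dst:_(s8) = G_TRUNC %shr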
2633 Register DstReg = MI.getOperand(i: 0).getReg();
2634
2635 unsigned NewBits = WideTy.getScalarSizeInBits();
2636 unsigned SHLAmount = NewBits - MRI.getType(Reg: DstReg).getScalarSizeInBits();
2637
2638 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2639 // must not left shift the RHS to preserve the shift amount.
2640 auto LHS = MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 1));
2641 auto RHS = IsShift ? MIRBuilder.buildZExt(Res: WideTy, Op: MI.getOperand(i: 2))
2642 : MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 2));
2643 auto ShiftK = MIRBuilder.buildConstant(Res: WideTy, Val: SHLAmount);
2644 auto ShiftL = MIRBuilder.buildShl(Dst: WideTy, Src0: LHS, Src1: ShiftK);
2645 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(Dst: WideTy, Src0: RHS, Src1: ShiftK);
2646
2647 auto WideInst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {WideTy},
2648 SrcOps: {ShiftL, ShiftR}, Flags: MI.getFlags());
2649
2650 // Use a shift that will preserve the number of sign bits when the trunc is
2651 // folded away.
2652 auto Result = IsSigned ? MIRBuilder.buildAShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK)
2653 : MIRBuilder.buildLShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK);
2654
2655 MIRBuilder.buildTrunc(Res: DstReg, Op: Result);
2656 MI.eraseFromParent();
2657 return Legalized;
2658}
2659
2660LegalizerHelper::LegalizeResult
2661LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2662 LLT WideTy) {
2663 if (TypeIdx == 1) {
2664 Observer.changingInstr(MI);
2665 widenScalarDst(MI, WideTy, OpIdx: 1);
2666 Observer.changedInstr(MI);
2667 return Legalized;
2668 }
2669
2670 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2671 auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2672 LLT SrcTy = MRI.getType(Reg: LHS);
2673 LLT OverflowTy = MRI.getType(Reg: OriginalOverflow);
2674 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2675
2676 // To determine if the result overflowed in the larger type, we extend the
2677 // input to the larger type, do the multiply (checking if it overflows),
2678 // then also check the high bits of the result to see if overflow happened
2679 // there.
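  // For example, an s8 G_UMULO widened to s16 becomes (a sketch; register
  // names are illustrative):
  //   %lhs:_(s16) = G_ZEXT %0:_(s8)
  //   %rhs:_(s16) = G_ZEXT %1:_(s8)
  //   %mul:_(s16) = G_MUL %lhs, %rhs     ; s16 holds any s8 * s8 exactly
  //   %res:_(s8) = G_TRUNC %mul
  //   %mask:_(s16) = G_CONSTANT i16 255
  //   %ext:_(s16) = G_AND %mul, %mask    ; zero-extend the low 8 bits in-reg
  //   %ovf:_(s1) = G_ICMP intpred(ne), %mul, %ext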
2680 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2681 auto LeftOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {LHS});
2682 auto RightOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {RHS});
2683
  // Multiplication cannot overflow if WideTy is >= 2 * the original width, so
  // we don't need to check the overflow result of the larger-type Mulo.
2686 bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2687
2688 unsigned MulOpc =
2689 WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2690
2691 MachineInstrBuilder Mulo;
2692 if (WideMulCanOverflow)
2693 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy, OverflowTy},
2694 SrcOps: {LeftOperand, RightOperand});
2695 else
2696 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy}, SrcOps: {LeftOperand, RightOperand});
2697
2698 auto Mul = Mulo->getOperand(i: 0);
2699 MIRBuilder.buildTrunc(Res: Result, Op: Mul);
2700
2701 MachineInstrBuilder ExtResult;
2702 // Overflow occurred if it occurred in the larger type, or if the high part
2703 // of the result does not zero/sign-extend the low part. Check this second
2704 // possibility first.
2705 if (IsSigned) {
2706 // For signed, overflow occurred when the high part does not sign-extend
2707 // the low part.
2708 ExtResult = MIRBuilder.buildSExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2709 } else {
2710 // Unsigned overflow occurred when the high part does not zero-extend the
2711 // low part.
2712 ExtResult = MIRBuilder.buildZExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2713 }
2714
2715 if (WideMulCanOverflow) {
2716 auto Overflow =
2717 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OverflowTy, Op0: Mul, Op1: ExtResult);
2718 // Finally check if the multiplication in the larger type itself overflowed.
2719 MIRBuilder.buildOr(Dst: OriginalOverflow, Src0: Mulo->getOperand(i: 1), Src1: Overflow);
2720 } else {
2721 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OriginalOverflow, Op0: Mul, Op1: ExtResult);
2722 }
2723 MI.eraseFromParent();
2724 return Legalized;
2725}
2726
2727LegalizerHelper::LegalizeResult
2728LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2729 unsigned Opcode = MI.getOpcode();
2730 switch (Opcode) {
2731 default:
2732 return UnableToLegalize;
2733 case TargetOpcode::G_ATOMICRMW_XCHG:
2734 case TargetOpcode::G_ATOMICRMW_ADD:
2735 case TargetOpcode::G_ATOMICRMW_SUB:
2736 case TargetOpcode::G_ATOMICRMW_AND:
2737 case TargetOpcode::G_ATOMICRMW_OR:
2738 case TargetOpcode::G_ATOMICRMW_XOR:
2739 case TargetOpcode::G_ATOMICRMW_MIN:
2740 case TargetOpcode::G_ATOMICRMW_MAX:
2741 case TargetOpcode::G_ATOMICRMW_UMIN:
2742 case TargetOpcode::G_ATOMICRMW_UMAX:
2743 assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2744 Observer.changingInstr(MI);
2745 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2746 widenScalarDst(MI, WideTy, OpIdx: 0);
2747 Observer.changedInstr(MI);
2748 return Legalized;
2749 case TargetOpcode::G_ATOMIC_CMPXCHG:
2750 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2751 Observer.changingInstr(MI);
2752 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2753 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2754 widenScalarDst(MI, WideTy, OpIdx: 0);
2755 Observer.changedInstr(MI);
2756 return Legalized;
2757 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2758 if (TypeIdx == 0) {
2759 Observer.changingInstr(MI);
2760 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2761 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: TargetOpcode::G_ANYEXT);
2762 widenScalarDst(MI, WideTy, OpIdx: 0);
2763 Observer.changedInstr(MI);
2764 return Legalized;
2765 }
2766 assert(TypeIdx == 1 &&
2767 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2768 Observer.changingInstr(MI);
2769 widenScalarDst(MI, WideTy, OpIdx: 1);
2770 Observer.changedInstr(MI);
2771 return Legalized;
2772 case TargetOpcode::G_EXTRACT:
2773 return widenScalarExtract(MI, TypeIdx, WideTy);
2774 case TargetOpcode::G_INSERT:
2775 return widenScalarInsert(MI, TypeIdx, WideTy);
2776 case TargetOpcode::G_MERGE_VALUES:
2777 return widenScalarMergeValues(MI, TypeIdx, WideTy);
2778 case TargetOpcode::G_UNMERGE_VALUES:
2779 return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2780 case TargetOpcode::G_SADDO:
2781 case TargetOpcode::G_SSUBO:
2782 case TargetOpcode::G_UADDO:
2783 case TargetOpcode::G_USUBO:
2784 case TargetOpcode::G_SADDE:
2785 case TargetOpcode::G_SSUBE:
2786 case TargetOpcode::G_UADDE:
2787 case TargetOpcode::G_USUBE:
2788 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2789 case TargetOpcode::G_UMULO:
2790 case TargetOpcode::G_SMULO:
2791 return widenScalarMulo(MI, TypeIdx, WideTy);
2792 case TargetOpcode::G_SADDSAT:
2793 case TargetOpcode::G_SSUBSAT:
2794 case TargetOpcode::G_SSHLSAT:
2795 case TargetOpcode::G_UADDSAT:
2796 case TargetOpcode::G_USUBSAT:
2797 case TargetOpcode::G_USHLSAT:
2798 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2799 case TargetOpcode::G_CTTZ:
2800 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2801 case TargetOpcode::G_CTLZ:
2802 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2803 case TargetOpcode::G_CTLS:
2804 case TargetOpcode::G_CTPOP: {
2805 if (TypeIdx == 0) {
2806 Observer.changingInstr(MI);
2807 widenScalarDst(MI, WideTy, OpIdx: 0);
2808 Observer.changedInstr(MI);
2809 return Legalized;
2810 }
2811
2812 Register SrcReg = MI.getOperand(i: 1).getReg();
2813
2814 // First extend the input.
2815 unsigned ExtOpc;
2816 switch (Opcode) {
2817 case TargetOpcode::G_CTTZ:
2818 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2819 ExtOpc = TargetOpcode::G_ANYEXT;
2820 break;
2821 case TargetOpcode::G_CTLS:
2822 ExtOpc = TargetOpcode::G_SEXT;
2823 break;
2824 default:
2825 ExtOpc = TargetOpcode::G_ZEXT;
2826 }
2827
2828 auto MIBSrc = MIRBuilder.buildInstr(Opc: ExtOpc, DstOps: {WideTy}, SrcOps: {SrcReg});
2829 LLT CurTy = MRI.getType(Reg: SrcReg);
2830 unsigned NewOpc = Opcode;
2831 if (NewOpc == TargetOpcode::G_CTTZ) {
2832 // The count is the same in the larger type except if the original
2833 // value was zero. This can be handled by setting the bit just off
2834 // the top of the original type.
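      // For example, for an s8 G_CTTZ widened to s32, OR-ing in bit 8 (0x100)
      // caps the count at 8 when the original value was zero (a sketch):
      //   %src:_(s32) = G_ANYEXT %0:_(s8)
      //   %k:_(s32) = G_CONSTANT i32 256
      //   %or:_(s32) = G_OR %src, %k
      //   %cnt:_(s32) = G_CTTZ_ZERO_UNDEF %or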
2835 auto TopBit =
2836 APInt::getOneBitSet(numBits: WideTy.getSizeInBits(), BitNo: CurTy.getSizeInBits());
2837 MIBSrc = MIRBuilder.buildOr(
2838 Dst: WideTy, Src0: MIBSrc, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: TopBit));
2839 // Now we know the operand is non-zero, use the more relaxed opcode.
2840 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2841 }
2842
2843 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2844
2845 if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // An optimization where the result is the CTLZ after the left shift by
      // the difference in bit widths of WideTy and CurTy, that is,
      // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
      // Result = ctlz MIBSrc
2850 MIBSrc = MIRBuilder.buildShl(Dst: WideTy, Src0: MIBSrc,
2851 Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2852 }
2853
2854 // Perform the operation at the larger size.
2855 auto MIBNewOp = MIRBuilder.buildInstr(Opc: NewOpc, DstOps: {WideTy}, SrcOps: {MIBSrc});
    // This is already the correct result for CTPOP and CTTZs.
2857 if (Opcode == TargetOpcode::G_CTLZ || Opcode == TargetOpcode::G_CTLS) {
      // The correct result is NewOp - (the difference in bit widths of WideTy
      // and CurTy).
2859 MIBNewOp = MIRBuilder.buildSub(
2860 Dst: WideTy, Src0: MIBNewOp, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2861 }
2862
2863 MIRBuilder.buildZExtOrTrunc(Res: MI.getOperand(i: 0), Op: MIBNewOp);
2864 MI.eraseFromParent();
2865 return Legalized;
2866 }
2867 case TargetOpcode::G_BSWAP: {
2868 Observer.changingInstr(MI);
2869 Register DstReg = MI.getOperand(i: 0).getReg();
2870
2871 Register ShrReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2872 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2873 Register ShiftAmtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2874 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2875
2876 MI.getOperand(i: 0).setReg(DstExt);
2877
2878 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2879
2880 LLT Ty = MRI.getType(Reg: DstReg);
2881 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2882 MIRBuilder.buildConstant(Res: ShiftAmtReg, Val: DiffBits);
2883 MIRBuilder.buildLShr(Dst: ShrReg, Src0: DstExt, Src1: ShiftAmtReg);
2884
2885 MIRBuilder.buildTrunc(Res: DstReg, Op: ShrReg);
2886 Observer.changedInstr(MI);
2887 return Legalized;
2888 }
2889 case TargetOpcode::G_BITREVERSE: {
2890 Observer.changingInstr(MI);
2891
2892 Register DstReg = MI.getOperand(i: 0).getReg();
2893 LLT Ty = MRI.getType(Reg: DstReg);
2894 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2895
2896 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2897 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2898 MI.getOperand(i: 0).setReg(DstExt);
2899 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2900
2901 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: DiffBits);
2902 auto Shift = MIRBuilder.buildLShr(Dst: WideTy, Src0: DstExt, Src1: ShiftAmt);
2903 MIRBuilder.buildTrunc(Res: DstReg, Op: Shift);
2904 Observer.changedInstr(MI);
2905 return Legalized;
2906 }
2907 case TargetOpcode::G_FREEZE:
2908 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
2909 Observer.changingInstr(MI);
2910 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2911 widenScalarDst(MI, WideTy);
2912 Observer.changedInstr(MI);
2913 return Legalized;
2914
2915 case TargetOpcode::G_ABS:
2916 Observer.changingInstr(MI);
2917 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2918 widenScalarDst(MI, WideTy);
2919 Observer.changedInstr(MI);
2920 return Legalized;
2921
2922 case TargetOpcode::G_ADD:
2923 case TargetOpcode::G_AND:
2924 case TargetOpcode::G_MUL:
2925 case TargetOpcode::G_OR:
2926 case TargetOpcode::G_XOR:
2927 case TargetOpcode::G_SUB:
2928 case TargetOpcode::G_SHUFFLE_VECTOR:
    // Perform the operation at the larger width (any extension is fine here,
    // high bits don't affect the result) and then truncate the result back to
    // the original type.
2932 Observer.changingInstr(MI);
2933 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2934 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2935 widenScalarDst(MI, WideTy);
2936 Observer.changedInstr(MI);
2937 return Legalized;
2938
2939 case TargetOpcode::G_SBFX:
2940 case TargetOpcode::G_UBFX:
2941 Observer.changingInstr(MI);
2942
2943 if (TypeIdx == 0) {
2944 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2945 widenScalarDst(MI, WideTy);
2946 } else {
2947 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2948 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2949 }
2950
2951 Observer.changedInstr(MI);
2952 return Legalized;
2953
2954 case TargetOpcode::G_SHL:
2955 Observer.changingInstr(MI);
2956
2957 if (TypeIdx == 0) {
2958 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2959 widenScalarDst(MI, WideTy);
2960 } else {
2961 assert(TypeIdx == 1);
2962 // The "number of bits to shift" operand must preserve its value as an
2963 // unsigned integer:
2964 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2965 }
2966
2967 Observer.changedInstr(MI);
2968 return Legalized;
2969
2970 case TargetOpcode::G_ROTR:
2971 case TargetOpcode::G_ROTL:
2972 if (TypeIdx != 1)
2973 return UnableToLegalize;
2974
2975 Observer.changingInstr(MI);
2976 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2977 Observer.changedInstr(MI);
2978 return Legalized;
2979
2980 case TargetOpcode::G_SDIV:
2981 case TargetOpcode::G_SREM:
2982 case TargetOpcode::G_SMIN:
2983 case TargetOpcode::G_SMAX:
2984 case TargetOpcode::G_ABDS:
2985 Observer.changingInstr(MI);
2986 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2987 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2988 widenScalarDst(MI, WideTy);
2989 Observer.changedInstr(MI);
2990 return Legalized;
2991
2992 case TargetOpcode::G_SDIVREM:
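// Both defs are truncated back after the widened instruction. A sketch of
// the resulting MIR, assuming a hypothetical s8 -> s32 widening:
//   %lhs:_(s32) = G_SEXT %a:_(s8)
//   %rhs:_(s32) = G_SEXT %b:_(s8)
//   %q:_(s32), %r:_(s32) = G_SDIVREM %lhs, %rhs
//   %rem:_(s8) = G_TRUNC %r
//   %quot:_(s8) = G_TRUNC %q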
2993 Observer.changingInstr(MI);
2994 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2995 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
2996 widenScalarDst(MI, WideTy);
2997 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
2998 widenScalarDst(MI, WideTy, OpIdx: 1);
2999 Observer.changedInstr(MI);
3000 return Legalized;
3001
3002 case TargetOpcode::G_ASHR:
3003 case TargetOpcode::G_LSHR:
3004 Observer.changingInstr(MI);
3005
3006 if (TypeIdx == 0) {
3007 unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
3008 : TargetOpcode::G_ZEXT;
3009
3010 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: CvtOp);
3011 widenScalarDst(MI, WideTy);
3012 } else {
3013 assert(TypeIdx == 1);
3014 // The "number of bits to shift" operand must preserve its value as an
3015 // unsigned integer:
3016 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3017 }
3018
3019 Observer.changedInstr(MI);
3020 return Legalized;
3021 case TargetOpcode::G_UDIV:
3022 case TargetOpcode::G_UREM:
3023 case TargetOpcode::G_ABDU:
3024 Observer.changingInstr(MI);
3025 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3026 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3027 widenScalarDst(MI, WideTy);
3028 Observer.changedInstr(MI);
3029 return Legalized;
3030 case TargetOpcode::G_UDIVREM:
3031 Observer.changingInstr(MI);
3032 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3033 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
3034 widenScalarDst(MI, WideTy);
3035 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3036 widenScalarDst(MI, WideTy, OpIdx: 1);
3037 Observer.changedInstr(MI);
3038 return Legalized;
3039 case TargetOpcode::G_UMIN:
3040 case TargetOpcode::G_UMAX: {
3041 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3042
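// Either extension is correct here: zero- and sign-extension both preserve
// the unsigned ordering of the operands, so let the target pick whichever
// it considers cheaper (e.g. sign-extension is typically free on RV64).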
3043 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3044 unsigned ExtOpc =
3045 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty, Ctx),
3046 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx))
3047 ? TargetOpcode::G_SEXT
3048 : TargetOpcode::G_ZEXT;
3049
3050 Observer.changingInstr(MI);
3051 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: ExtOpc);
3052 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: ExtOpc);
3053 widenScalarDst(MI, WideTy);
3054 Observer.changedInstr(MI);
3055 return Legalized;
3056 }
3057
3058 case TargetOpcode::G_SELECT:
3059 Observer.changingInstr(MI);
3060 if (TypeIdx == 0) {
3061 // Perform operation at larger width (any extension is fine here, high
3062 // bits don't affect the result) and then truncate the result back to the
3063 // original type.
3064 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3065 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
3066 widenScalarDst(MI, WideTy);
3067 } else {
3068 bool IsVec = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector();
3069 // Explicit extension is required here since high bits affect the result.
3070 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec, IsFP: false));
3071 }
3072 Observer.changedInstr(MI);
3073 return Legalized;
3074
3075 case TargetOpcode::G_FPEXT:
3076 if (TypeIdx != 1)
3077 return UnableToLegalize;
3078
3079 Observer.changingInstr(MI);
3080 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3081 Observer.changedInstr(MI);
3082 return Legalized;
3083 case TargetOpcode::G_FPTOSI:
3084 case TargetOpcode::G_FPTOUI:
3085 case TargetOpcode::G_INTRINSIC_LRINT:
3086 case TargetOpcode::G_INTRINSIC_LLRINT:
3087 case TargetOpcode::G_IS_FPCLASS:
3088 Observer.changingInstr(MI);
3089
3090 if (TypeIdx == 0)
3091 widenScalarDst(MI, WideTy);
3092 else
3093 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3094
3095 Observer.changedInstr(MI);
3096 return Legalized;
3097 case TargetOpcode::G_SITOFP:
3098 Observer.changingInstr(MI);
3099
3100 if (TypeIdx == 0)
3101 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3102 else
3103 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
3104
3105 Observer.changedInstr(MI);
3106 return Legalized;
3107 case TargetOpcode::G_UITOFP:
3108 Observer.changingInstr(MI);
3109
3110 if (TypeIdx == 0)
3111 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3112 else
3113 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3114
3115 Observer.changedInstr(MI);
3116 return Legalized;
3117 case TargetOpcode::G_FPTOSI_SAT:
3118 case TargetOpcode::G_FPTOUI_SAT:
3119 Observer.changingInstr(MI);
3120
3121 if (TypeIdx == 0) {
3122 Register OldDst = MI.getOperand(i: 0).getReg();
3123 LLT Ty = MRI.getType(Reg: OldDst);
3124 Register ExtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
3125 Register NewDst;
3126 MI.getOperand(i: 0).setReg(ExtReg);
3127 uint64_t ShortBits = Ty.getScalarSizeInBits();
3128 uint64_t WideBits = WideTy.getScalarSizeInBits();
3129 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
3130 if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
3131 // z = i16 fptosi_sat(a)
3132 // ->
3133 // x = i32 fptosi_sat(a)
3134 // y = smin(x, 32767)
3135 // z = smax(y, -32768)
3136 auto MaxVal = MIRBuilder.buildConstant(
3137 Res: WideTy, Val: APInt::getSignedMaxValue(numBits: ShortBits).sext(width: WideBits));
3138 auto MinVal = MIRBuilder.buildConstant(
3139 Res: WideTy, Val: APInt::getSignedMinValue(numBits: ShortBits).sext(width: WideBits));
3140 Register MidReg =
3141 MIRBuilder.buildSMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3142 NewDst = MIRBuilder.buildSMax(Dst: WideTy, Src0: MidReg, Src1: MinVal).getReg(Idx: 0);
3143 } else {
3144 // z = i16 fptoui_sat(a)
3145 // ->
3146 // x = i32 fptoui_sat(a)
3147 // y = umin(x, 65535)
3148 auto MaxVal = MIRBuilder.buildConstant(
3149 Res: WideTy, Val: APInt::getAllOnes(numBits: ShortBits).zext(width: WideBits));
3150 NewDst = MIRBuilder.buildUMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3151 }
3152 MIRBuilder.buildTrunc(Res: OldDst, Op: NewDst);
3153 } else
3154 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3155
3156 Observer.changedInstr(MI);
3157 return Legalized;
3158 case TargetOpcode::G_LOAD:
3159 case TargetOpcode::G_SEXTLOAD:
3160 case TargetOpcode::G_ZEXTLOAD:
3161 Observer.changingInstr(MI);
3162 widenScalarDst(MI, WideTy);
3163 Observer.changedInstr(MI);
3164 return Legalized;
3165
3166 case TargetOpcode::G_STORE: {
3167 if (TypeIdx != 0)
3168 return UnableToLegalize;
3169
3170 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3171 assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
3172 if (!Ty.isScalar()) {
3173 // We need to widen the vector element type.
3174 Observer.changingInstr(MI);
3175 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ANYEXT);
3176 // We also need to adjust the MMO to turn this into a truncating store.
3177 MachineMemOperand &MMO = **MI.memoperands_begin();
3178 MachineFunction &MF = MIRBuilder.getMF();
3179 auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty);
3180 MI.setMemRefs(MF, MemRefs: {NewMMO});
3181 Observer.changedInstr(MI);
3182 return Legalized;
3183 }
3184
3185 Observer.changingInstr(MI);
3186
3187 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
3188 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
3189 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: ExtType);
3190
3191 Observer.changedInstr(MI);
3192 return Legalized;
3193 }
3194 case TargetOpcode::G_CONSTANT: {
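// Widen the immediate itself and truncate the def. A sketch, assuming a
// hypothetical s1 -> s32 widening on a target that prefers zero-extension:
//   %c:_(s1) = G_CONSTANT i1 true
// becomes
//   %c32:_(s32) = G_CONSTANT i32 1
//   %c:_(s1) = G_TRUNC %c32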
3195 MachineOperand &SrcMO = MI.getOperand(i: 1);
3196 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3197 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
3198 SmallTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
3199 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
3200 ExtOpc == TargetOpcode::G_ANYEXT) &&
3201 "Illegal Extend");
3202 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3203 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
3204 ? SrcVal.sext(width: WideTy.getSizeInBits())
3205 : SrcVal.zext(width: WideTy.getSizeInBits());
3206 Observer.changingInstr(MI);
3207 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3208
3209 widenScalarDst(MI, WideTy);
3210 Observer.changedInstr(MI);
3211 return Legalized;
3212 }
3213 case TargetOpcode::G_FCONSTANT: {
3214 // To avoid changing the bits of the constant due to extension to a larger
3215 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
3216 MachineOperand &SrcMO = MI.getOperand(i: 1);
3217 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
3218 MIRBuilder.setInstrAndDebugLoc(MI);
3219 auto IntCst = MIRBuilder.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val);
3220 widenScalarDst(MI&: *IntCst, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3221 MI.eraseFromParent();
3222 return Legalized;
3223 }
3224 case TargetOpcode::G_IMPLICIT_DEF: {
3225 Observer.changingInstr(MI);
3226 widenScalarDst(MI, WideTy);
3227 Observer.changedInstr(MI);
3228 return Legalized;
3229 }
3230 case TargetOpcode::G_BRCOND:
3231 Observer.changingInstr(MI);
3232 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec: false, IsFP: false));
3233 Observer.changedInstr(MI);
3234 return Legalized;
3235
3236 case TargetOpcode::G_FCMP:
3237 Observer.changingInstr(MI);
3238 if (TypeIdx == 0)
3239 widenScalarDst(MI, WideTy);
3240 else {
3241 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3242 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_FPEXT);
3243 }
3244 Observer.changedInstr(MI);
3245 return Legalized;
3246
3247 case TargetOpcode::G_ICMP:
3248 Observer.changingInstr(MI);
3249 if (TypeIdx == 0)
3250 widenScalarDst(MI, WideTy);
3251 else {
3252 LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 2).getReg());
3253 CmpInst::Predicate Pred =
3254 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
3255
3256 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3257 unsigned ExtOpcode =
3258 (CmpInst::isSigned(predicate: Pred) ||
3259 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty: SrcTy, Ctx),
3260 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx)))
3261 ? TargetOpcode::G_SEXT
3262 : TargetOpcode::G_ZEXT;
3263 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode);
3264 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode);
3265 }
3266 Observer.changedInstr(MI);
3267 return Legalized;
3268
3269 case TargetOpcode::G_PTR_ADD:
3270 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
3271 Observer.changingInstr(MI);
3272 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3273 Observer.changedInstr(MI);
3274 return Legalized;
3275
3276 case TargetOpcode::G_PHI: {
3277 assert(TypeIdx == 0 && "Expecting only Idx 0");
3278
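// Each incoming value is extended before the terminators of its predecessor
// block, and the result is truncated after the PHI group. A sketch, assuming
// a hypothetical s8 -> s32 widening:
//   bb.2:
//     %w:_(s32) = G_PHI %a32(s32), %bb.0, %b32(s32), %bb.1
//     %phi:_(s8) = G_TRUNC %w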
3279 Observer.changingInstr(MI);
3280 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
3281 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
3282 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
3283 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3284 }
3285
3286 MachineBasicBlock &MBB = *MI.getParent();
3287 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
3288 widenScalarDst(MI, WideTy);
3289 Observer.changedInstr(MI);
3290 return Legalized;
3291 }
3292 case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
3293 if (TypeIdx == 0) {
3294 Register VecReg = MI.getOperand(i: 1).getReg();
3295 LLT VecTy = MRI.getType(Reg: VecReg);
3296 Observer.changingInstr(MI);
3297
3298 widenScalarSrc(
3299 MI,
3300 WideTy: VecTy.changeVectorElementType(NewEltTy: LLT::scalar(SizeInBits: WideTy.getSizeInBits())), OpIdx: 1,
3301 ExtOpcode: TargetOpcode::G_ANYEXT);
3302
3303 widenScalarDst(MI, WideTy, OpIdx: 0);
3304 Observer.changedInstr(MI);
3305 return Legalized;
3306 }
3307
3308 if (TypeIdx != 2)
3309 return UnableToLegalize;
3310 Observer.changingInstr(MI);
3311 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3312 Observer.changedInstr(MI);
3313 return Legalized;
3314 }
3315 case TargetOpcode::G_INSERT_VECTOR_ELT: {
3316 if (TypeIdx == 0) {
3317 Observer.changingInstr(MI);
3318 const LLT WideEltTy = WideTy.getElementType();
3319
3320 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3321 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3322 widenScalarDst(MI, WideTy, OpIdx: 0);
3323 Observer.changedInstr(MI);
3324 return Legalized;
3325 }
3326
3327 if (TypeIdx == 1) {
3328 Observer.changingInstr(MI);
3329
3330 Register VecReg = MI.getOperand(i: 1).getReg();
3331 LLT VecTy = MRI.getType(Reg: VecReg);
3332 LLT WideVecTy = VecTy.changeVectorElementType(NewEltTy: WideTy);
3333
3334 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3335 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3336 widenScalarDst(MI, WideTy: WideVecTy, OpIdx: 0);
3337 Observer.changedInstr(MI);
3338 return Legalized;
3339 }
3340
3341 if (TypeIdx == 2) {
3342 Observer.changingInstr(MI);
3343 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
3344 Observer.changedInstr(MI);
3345 return Legalized;
3346 }
3347
3348 return UnableToLegalize;
3349 }
3350 case TargetOpcode::G_FADD:
3351 case TargetOpcode::G_FMUL:
3352 case TargetOpcode::G_FSUB:
3353 case TargetOpcode::G_FMA:
3354 case TargetOpcode::G_FMAD:
3355 case TargetOpcode::G_FNEG:
3356 case TargetOpcode::G_FABS:
3357 case TargetOpcode::G_FCANONICALIZE:
3358 case TargetOpcode::G_FMINNUM:
3359 case TargetOpcode::G_FMAXNUM:
3360 case TargetOpcode::G_FMINNUM_IEEE:
3361 case TargetOpcode::G_FMAXNUM_IEEE:
3362 case TargetOpcode::G_FMINIMUM:
3363 case TargetOpcode::G_FMAXIMUM:
3364 case TargetOpcode::G_FMINIMUMNUM:
3365 case TargetOpcode::G_FMAXIMUMNUM:
3366 case TargetOpcode::G_FDIV:
3367 case TargetOpcode::G_FREM:
3368 case TargetOpcode::G_FCEIL:
3369 case TargetOpcode::G_FFLOOR:
3370 case TargetOpcode::G_FCOS:
3371 case TargetOpcode::G_FSIN:
3372 case TargetOpcode::G_FTAN:
3373 case TargetOpcode::G_FACOS:
3374 case TargetOpcode::G_FASIN:
3375 case TargetOpcode::G_FATAN:
3376 case TargetOpcode::G_FATAN2:
3377 case TargetOpcode::G_FCOSH:
3378 case TargetOpcode::G_FSINH:
3379 case TargetOpcode::G_FTANH:
3380 case TargetOpcode::G_FLOG10:
3381 case TargetOpcode::G_FLOG:
3382 case TargetOpcode::G_FLOG2:
3383 case TargetOpcode::G_FRINT:
3384 case TargetOpcode::G_FNEARBYINT:
3385 case TargetOpcode::G_FSQRT:
3386 case TargetOpcode::G_FEXP:
3387 case TargetOpcode::G_FEXP2:
3388 case TargetOpcode::G_FEXP10:
3389 case TargetOpcode::G_FPOW:
3390 case TargetOpcode::G_INTRINSIC_TRUNC:
3391 case TargetOpcode::G_INTRINSIC_ROUND:
3392 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
3393 assert(TypeIdx == 0);
3394 Observer.changingInstr(MI);
3395
3396 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3397 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_FPEXT);
3398
3399 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3400 Observer.changedInstr(MI);
3401 return Legalized;
3402 case TargetOpcode::G_FMODF: {
3403 Observer.changingInstr(MI);
3404 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3405
3406 widenScalarDst(MI, WideTy, OpIdx: 1, TruncOpcode: TargetOpcode::G_FPTRUNC);
3407 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3408 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3409 Observer.changedInstr(MI);
3410 return Legalized;
3411 }
3412 case TargetOpcode::G_FPOWI:
3413 case TargetOpcode::G_FLDEXP:
3414 case TargetOpcode::G_STRICT_FLDEXP: {
3415 if (TypeIdx == 0) {
3416 if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
3417 return UnableToLegalize;
3418
3419 Observer.changingInstr(MI);
3420 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3421 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3422 Observer.changedInstr(MI);
3423 return Legalized;
3424 }
3425
3426 if (TypeIdx == 1) {
3427 // For some reason SelectionDAG tries to promote to a libcall without
3428 // actually changing the integer type for promotion.
3429 Observer.changingInstr(MI);
3430 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3431 Observer.changedInstr(MI);
3432 return Legalized;
3433 }
3434
3435 return UnableToLegalize;
3436 }
3437 case TargetOpcode::G_FFREXP: {
3438 Observer.changingInstr(MI);
3439
3440 if (TypeIdx == 0) {
3441 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3442 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3443 } else {
3444 widenScalarDst(MI, WideTy, OpIdx: 1);
3445 }
3446
3447 Observer.changedInstr(MI);
3448 return Legalized;
3449 }
3450 case TargetOpcode::G_LROUND:
3451 case TargetOpcode::G_LLROUND:
3452 Observer.changingInstr(MI);
3453
3454 if (TypeIdx == 0)
3455 widenScalarDst(MI, WideTy);
3456 else
3457 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3458
3459 Observer.changedInstr(MI);
3460 return Legalized;
3461
3462 case TargetOpcode::G_INTTOPTR:
3463 if (TypeIdx != 1)
3464 return UnableToLegalize;
3465
3466 Observer.changingInstr(MI);
3467 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3468 Observer.changedInstr(MI);
3469 return Legalized;
3470 case TargetOpcode::G_PTRTOINT:
3471 if (TypeIdx != 0)
3472 return UnableToLegalize;
3473
3474 Observer.changingInstr(MI);
3475 widenScalarDst(MI, WideTy, OpIdx: 0);
3476 Observer.changedInstr(MI);
3477 return Legalized;
3478 case TargetOpcode::G_BUILD_VECTOR: {
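// When only the source element type is widened (TypeIdx == 1), the result
// type must stay the same, so the opcode is switched to
// G_BUILD_VECTOR_TRUNC. A sketch, assuming hypothetical s8 elements widened
// to s32:
//   %v:_(<4 x s8>) = G_BUILD_VECTOR_TRUNC %a32:_(s32), %b32, %c32, %d32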
3479 Observer.changingInstr(MI);
3480
3481 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3482 for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3483 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3484
3485 // Avoid changing the result vector type if the source element type was
3486 // requested.
3487 if (TypeIdx == 1) {
3488 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::G_BUILD_VECTOR_TRUNC));
3489 } else {
3490 widenScalarDst(MI, WideTy, OpIdx: 0);
3491 }
3492
3493 Observer.changedInstr(MI);
3494 return Legalized;
3495 }
3496 case TargetOpcode::G_SEXT_INREG:
3497 if (TypeIdx != 0)
3498 return UnableToLegalize;
3499
3500 Observer.changingInstr(MI);
3501 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3502 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3503 Observer.changedInstr(MI);
3504 return Legalized;
3505 case TargetOpcode::G_PTRMASK: {
3506 if (TypeIdx != 1)
3507 return UnableToLegalize;
3508 Observer.changingInstr(MI);
3509 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3510 Observer.changedInstr(MI);
3511 return Legalized;
3512 }
3513 case TargetOpcode::G_VECREDUCE_ADD: {
3514 if (TypeIdx != 1)
3515 return UnableToLegalize;
3516 Observer.changingInstr(MI);
3517 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3518 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3519 Observer.changedInstr(MI);
3520 return Legalized;
3521 }
3522 case TargetOpcode::G_VECREDUCE_FADD:
3523 case TargetOpcode::G_VECREDUCE_FMUL:
3524 case TargetOpcode::G_VECREDUCE_FMIN:
3525 case TargetOpcode::G_VECREDUCE_FMAX:
3526 case TargetOpcode::G_VECREDUCE_FMINIMUM:
3527 case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
3528 if (TypeIdx != 0)
3529 return UnableToLegalize;
3530 Observer.changingInstr(MI);
3531 Register VecReg = MI.getOperand(i: 1).getReg();
3532 LLT VecTy = MRI.getType(Reg: VecReg);
3533 LLT WideVecTy = VecTy.changeElementType(NewEltTy: WideTy);
3534 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3535 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3536 Observer.changedInstr(MI);
3537 return Legalized;
3538 }
3539 case TargetOpcode::G_VSCALE: {
3540 MachineOperand &SrcMO = MI.getOperand(i: 1);
3541 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3542 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3543 // The CImm is always a signed value.
3544 const APInt Val = SrcVal.sext(width: WideTy.getSizeInBits());
3545 Observer.changingInstr(MI);
3546 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3547 widenScalarDst(MI, WideTy);
3548 Observer.changedInstr(MI);
3549 return Legalized;
3550 }
3551 case TargetOpcode::G_SPLAT_VECTOR: {
3552 if (TypeIdx != 1)
3553 return UnableToLegalize;
3554
3555 Observer.changingInstr(MI);
3556 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3557 Observer.changedInstr(MI);
3558 return Legalized;
3559 }
3560 case TargetOpcode::G_INSERT_SUBVECTOR: {
3561 if (TypeIdx != 0)
3562 return UnableToLegalize;
3563
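// Widen by zero-extending both vectors, inserting in the wide type, and
// comparing against zero to recover the narrow (e.g. s1) elements. A
// sketch, assuming hypothetical types:
//   %wbig:_(<vscale x 4 x s8>) = G_ZEXT %big:_(<vscale x 4 x s1>)
//   %wsub:_(<vscale x 2 x s8>) = G_ZEXT %sub:_(<vscale x 2 x s1>)
//   %ins:_(<vscale x 4 x s8>) = G_INSERT_SUBVECTOR %wbig, %wsub, idx
//   %dst:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), %ins, %zero_splat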
3564 GInsertSubvector &IS = cast<GInsertSubvector>(Val&: MI);
3565 Register BigVec = IS.getBigVec();
3566 Register SubVec = IS.getSubVec();
3567
3568 LLT SubVecTy = MRI.getType(Reg: SubVec);
3569 LLT SubVecWideTy = SubVecTy.changeElementType(NewEltTy: WideTy.getElementType());
3570
3571 // Widen the G_INSERT_SUBVECTOR
3572 auto BigZExt = MIRBuilder.buildZExt(Res: WideTy, Op: BigVec);
3573 auto SubZExt = MIRBuilder.buildZExt(Res: SubVecWideTy, Op: SubVec);
3574 auto WideInsert = MIRBuilder.buildInsertSubvector(Res: WideTy, Src0: BigZExt, Src1: SubZExt,
3575 Index: IS.getIndexImm());
3576
3577 // Truncate back down
3578 auto SplatZero = MIRBuilder.buildSplatVector(
3579 Res: WideTy, Val: MIRBuilder.buildConstant(Res: WideTy.getElementType(), Val: 0));
3580 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: IS.getReg(Idx: 0), Op0: WideInsert,
3581 Op1: SplatZero);
3582
3583 MI.eraseFromParent();
3584
3585 return Legalized;
3586 }
3587 }
3588}
3589
3590static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3591 MachineIRBuilder &B, Register Src, LLT Ty) {
3592 auto Unmerge = B.buildUnmerge(Res: Ty, Op: Src);
3593 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3594 Pieces.push_back(Elt: Unmerge.getReg(Idx: I));
3595}
3596
3597static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3598 MachineIRBuilder &MIRBuilder) {
3599 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3600 MachineFunction &MF = MIRBuilder.getMF();
3601 const DataLayout &DL = MIRBuilder.getDataLayout();
3602 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3603 LLT AddrPtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
3604 LLT DstLLT = MRI.getType(Reg: DstReg);
3605
3606 Align Alignment(DL.getABITypeAlign(Ty: ConstVal->getType()));
3607
3608 auto Addr = MIRBuilder.buildConstantPool(
3609 Res: AddrPtrTy,
3610 Idx: MF.getConstantPool()->getConstantPoolIndex(C: ConstVal, Alignment));
3611
3612 MachineMemOperand *MMO =
3613 MF.getMachineMemOperand(PtrInfo: MachinePointerInfo::getConstantPool(MF),
3614 f: MachineMemOperand::MOLoad, MemTy: DstLLT, base_alignment: Alignment);
3615
3616 MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: DstReg, Addr, MMO&: *MMO);
3617}
3618
3619LegalizerHelper::LegalizeResult
3620LegalizerHelper::lowerConstant(MachineInstr &MI) {
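// Materialize the immediate with a constant-pool load. A sketch of the
// resulting MIR, assuming a hypothetical s64 constant:
//   %cpi:_(p0) = G_CONSTANT_POOL %const.0
//   %dst:_(s64) = G_LOAD %cpi :: (load (s64) from constant-pool)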
3621 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3622 const Constant *ConstantVal = ConstOperand.getCImm();
3623
3624 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3625 MI.eraseFromParent();
3626
3627 return Legalized;
3628}
3629
3630LegalizerHelper::LegalizeResult
3631LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3632 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3633 const Constant *ConstantVal = ConstOperand.getFPImm();
3634
3635 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3636 MI.eraseFromParent();
3637
3638 return Legalized;
3639}
3640
3641LegalizerHelper::LegalizeResult
3642LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3643 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3644 if (SrcTy.isVector()) {
3645 LLT SrcEltTy = SrcTy.getElementType();
3646 SmallVector<Register, 8> SrcRegs;
3647
3648 if (DstTy.isVector()) {
3649 int NumDstElt = DstTy.getNumElements();
3650 int NumSrcElt = SrcTy.getNumElements();
3651
3652 LLT DstEltTy = DstTy.getElementType();
3653 LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3654 LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3655
3656 // If there's an element size mismatch, insert intermediate casts to match
3657 // the result element type.
3658 if (NumSrcElt < NumDstElt) { // Source element type is larger.
3659 // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3660 //
3661 // =>
3662 //
3663 // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3664 // %4:_(<2 x s8>) = G_BITCAST %2
3665 // %5:_(<2 x s8>) = G_BITCAST %3
3666 // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
3667 DstCastTy = DstTy.changeVectorElementCount(
3668 EC: ElementCount::getFixed(MinVal: NumDstElt / NumSrcElt));
3669 SrcPartTy = SrcEltTy;
3670 } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3671 //
3672 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3673 //
3674 // =>
3675 //
3676 // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3677 // %4:_(s16) = G_BITCAST %2
3678 // %5:_(s16) = G_BITCAST %3
3679 // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
3680 SrcPartTy = SrcTy.changeVectorElementCount(
3681 EC: ElementCount::getFixed(MinVal: NumSrcElt / NumDstElt));
3682 DstCastTy = DstEltTy;
3683 }
3684
3685 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcPartTy);
3686 for (Register &SrcReg : SrcRegs)
3687 SrcReg = MIRBuilder.buildBitcast(Dst: DstCastTy, Src: SrcReg).getReg(Idx: 0);
3688 } else
3689 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcEltTy);
3690
3691 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3692 MI.eraseFromParent();
3693 return Legalized;
3694 }
3695
3696 if (DstTy.isVector()) {
3697 SmallVector<Register, 8> SrcRegs;
3698 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: DstTy.getElementType());
3699 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3700 MI.eraseFromParent();
3701 return Legalized;
3702 }
3703
3704 return UnableToLegalize;
3705}
3706
3707/// Figure out the bit offset into a register when coercing a vector index for
3708 /// the wide element type. This is only for the case when promoting a vector to
3709/// one with larger elements.
3710//
3711///
3712/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3713/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
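///
/// For example, with SrcEltSize = 8 and DstEltSize = 32 (a hypothetical
/// s8 -> s32 coercion), this computes (%idx & 3) * 8.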
3714static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3715 Register Idx,
3716 unsigned NewEltSize,
3717 unsigned OldEltSize) {
3718 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3719 LLT IdxTy = B.getMRI()->getType(Reg: Idx);
3720
3721 // Now figure out the amount we need to shift to get the target bits.
3722 auto OffsetMask = B.buildConstant(
3723 Res: IdxTy, Val: ~(APInt::getAllOnes(numBits: IdxTy.getSizeInBits()) << Log2EltRatio));
3724 auto OffsetIdx = B.buildAnd(Dst: IdxTy, Src0: Idx, Src1: OffsetMask);
3725 return B.buildShl(Dst: IdxTy, Src0: OffsetIdx,
3726 Src1: B.buildConstant(Res: IdxTy, Val: Log2_32(Value: OldEltSize))).getReg(Idx: 0);
3727}
3728
3729/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3730/// is casting to a vector with a smaller element size, perform multiple element
3731/// extracts and merge the results. If this is coercing to a vector with larger
3732/// elements, index the bitcasted vector and extract the target element with bit
3733/// operations. This is intended to force the indexing in the native register
3734/// size for architectures that can dynamically index the register file.
3735LegalizerHelper::LegalizeResult
3736LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3737 LLT CastTy) {
3738 if (TypeIdx != 1)
3739 return UnableToLegalize;
3740
3741 auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3742
3743 LLT SrcEltTy = SrcVecTy.getElementType();
3744 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3745 unsigned OldNumElts = SrcVecTy.getNumElements();
3746
3747 LLT NewEltTy = CastTy.getScalarType();
3748 Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
3749
3750 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3751 const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3752 if (NewNumElts > OldNumElts) {
3753 // Decreasing the vector element size
3754 //
3755 // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3756 // =>
3757 // v4i32:castx = bitcast x:v2i64
3758 //
3759 // i64 = bitcast
3760 // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3761 // (i32 (extract_vector_elt castx, (2 * y + 1))))
3762 //
3763 if (NewNumElts % OldNumElts != 0)
3764 return UnableToLegalize;
3765
3766 // Type of the intermediate result vector.
3767 const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3768 LLT MidTy =
3769 CastTy.changeElementCount(EC: ElementCount::getFixed(MinVal: NewEltsPerOldElt));
3770
3771 auto NewEltsPerOldEltK = MIRBuilder.buildConstant(Res: IdxTy, Val: NewEltsPerOldElt);
3772
3773 SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3774 auto NewBaseIdx = MIRBuilder.buildMul(Dst: IdxTy, Src0: Idx, Src1: NewEltsPerOldEltK);
3775
3776 for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3777 auto IdxOffset = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
3778 auto TmpIdx = MIRBuilder.buildAdd(Dst: IdxTy, Src0: NewBaseIdx, Src1: IdxOffset);
3779 auto Elt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec, Idx: TmpIdx);
3780 NewOps[I] = Elt.getReg(Idx: 0);
3781 }
3782
3783 auto NewVec = MIRBuilder.buildBuildVector(Res: MidTy, Ops: NewOps);
3784 MIRBuilder.buildBitcast(Dst, Src: NewVec);
3785 MI.eraseFromParent();
3786 return Legalized;
3787 }
3788
3789 if (NewNumElts < OldNumElts) {
3790 if (NewEltSize % OldEltSize != 0)
3791 return UnableToLegalize;
3792
3793 // This only depends on powers of 2 because we use bit tricks to figure out
3794 // the bit offset we need to shift to get the target element. A general
3795 // expansion could emit division/multiply.
3796 if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
3797 return UnableToLegalize;
3798
3799 // Increasing the vector element size.
3800 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3801 //
3802 // =>
3803 //
3804 // %cast = G_BITCAST %vec
3805 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3806 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3807 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3808 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3809 // %elt_bits = G_LSHR %wide_elt, %offset_bits
3810 // %elt = G_TRUNC %elt_bits
3811
3812 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3813 auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);
3814
3815 // Divide to get the index in the wider element type.
3816 auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);
3817
3818 Register WideElt = CastVec;
3819 if (CastTy.isVector()) {
3820 WideElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
3821 Idx: ScaledIdx).getReg(Idx: 0);
3822 }
3823
3824 // Compute the bit offset into the register of the target element.
3825 Register OffsetBits = getBitcastWiderVectorElementOffset(
3826 B&: MIRBuilder, Idx, NewEltSize, OldEltSize);
3827
3828 // Shift the wide element to get the target element.
3829 auto ExtractedBits = MIRBuilder.buildLShr(Dst: NewEltTy, Src0: WideElt, Src1: OffsetBits);
3830 MIRBuilder.buildTrunc(Res: Dst, Op: ExtractedBits);
3831 MI.eraseFromParent();
3832 return Legalized;
3833 }
3834
3835 return UnableToLegalize;
3836}
3837
3838 /// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, while
3839 /// preserving the other bits in \p TargetReg.
3840 ///
3841 /// (ZExt(InsertReg) << OffsetBits) | (TargetReg & ~(((1 << InsertReg.size()) - 1) << OffsetBits))
3842static Register buildBitFieldInsert(MachineIRBuilder &B,
3843 Register TargetReg, Register InsertReg,
3844 Register OffsetBits) {
3845 LLT TargetTy = B.getMRI()->getType(Reg: TargetReg);
3846 LLT InsertTy = B.getMRI()->getType(Reg: InsertReg);
3847 auto ZextVal = B.buildZExt(Res: TargetTy, Op: InsertReg);
3848 auto ShiftedInsertVal = B.buildShl(Dst: TargetTy, Src0: ZextVal, Src1: OffsetBits);
3849
3850 // Produce a bitmask of the value to insert
3851 auto EltMask = B.buildConstant(
3852 Res: TargetTy, Val: APInt::getLowBitsSet(numBits: TargetTy.getSizeInBits(),
3853 loBitsSet: InsertTy.getSizeInBits()));
3854 // Shift it into position
3855 auto ShiftedMask = B.buildShl(Dst: TargetTy, Src0: EltMask, Src1: OffsetBits);
3856 auto InvShiftedMask = B.buildNot(Dst: TargetTy, Src0: ShiftedMask);
3857
3858 // Clear out the bits in the wide element
3859 auto MaskedOldElt = B.buildAnd(Dst: TargetTy, Src0: TargetReg, Src1: InvShiftedMask);
3860
3861 // The zero-extended value to insert is already zero in every other bit
3862 // position, so OR it into the masked wide element.
3863 return B.buildOr(Dst: TargetTy, Src0: MaskedOldElt, Src1: ShiftedInsertVal).getReg(Idx: 0);
3864}
3865
3866/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3867/// is increasing the element size, perform the indexing in the target element
3868/// type, and use bit operations to insert at the element position. This is
3869/// intended for architectures that can dynamically index the register file and
3870/// want to force indexing in the native register size.
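///
/// A sketch for a hypothetical <8 x s8> insert performed as <2 x s32>:
///   %scaled_idx = G_LSHR %idx, 2             ; index of the wide element
///   %wide = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
///   %patched = bit-field insert of %val at bit ((%idx & 3) * 8) of %wide
///   %res = G_INSERT_VECTOR_ELT %cast, %patched, %scaled_idx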
3871LegalizerHelper::LegalizeResult
3872LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3873 LLT CastTy) {
3874 if (TypeIdx != 0)
3875 return UnableToLegalize;
3876
3877 auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3878 MI.getFirst4RegLLTs();
3879 LLT VecTy = DstTy;
3880
3881 LLT VecEltTy = VecTy.getElementType();
3882 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3883 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3884 const unsigned OldEltSize = VecEltTy.getSizeInBits();
3885
3886 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3887 unsigned OldNumElts = VecTy.getNumElements();
3888
3889 Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
3890 if (NewNumElts < OldNumElts) {
3891 if (NewEltSize % OldEltSize != 0)
3892 return UnableToLegalize;
3893
3894 // This only depends on powers of 2 because we use bit tricks to figure out
3895 // the bit offset we need to shift to get the target element. A general
3896 // expansion could emit division/multiply.
3897 if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
3898 return UnableToLegalize;
3899
3900 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3901 auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);
3902
3903 // Divide to get the index in the wider element type.
3904 auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);
3905
3906 Register ExtractedElt = CastVec;
3907 if (CastTy.isVector()) {
3908 ExtractedElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
3909 Idx: ScaledIdx).getReg(Idx: 0);
3910 }
3911
3912 // Compute the bit offset into the register of the target element.
3913 Register OffsetBits = getBitcastWiderVectorElementOffset(
3914 B&: MIRBuilder, Idx, NewEltSize, OldEltSize);
3915
3916 Register InsertedElt = buildBitFieldInsert(B&: MIRBuilder, TargetReg: ExtractedElt,
3917 InsertReg: Val, OffsetBits);
3918 if (CastTy.isVector()) {
3919 InsertedElt = MIRBuilder.buildInsertVectorElement(
3920 Res: CastTy, Val: CastVec, Elt: InsertedElt, Idx: ScaledIdx).getReg(Idx: 0);
3921 }
3922
3923 MIRBuilder.buildBitcast(Dst, Src: InsertedElt);
3924 MI.eraseFromParent();
3925 return Legalized;
3926 }
3927
3928 return UnableToLegalize;
3929}
3930
3931// This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly
3932// those that have smaller than legal operands.
3933//
3934// <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3935//
3936// ===>
3937//
3938// s32 = G_BITCAST <4 x s8>
3939// s32 = G_BITCAST <4 x s8>
3940// s32 = G_BITCAST <4 x s8>
3941// s32 = G_BITCAST <4 x s8>
3942// <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3943// <16 x s8> = G_BITCAST <4 x s32>
3944LegalizerHelper::LegalizeResult
3945LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3946 LLT CastTy) {
3947 // This lowering only applies to G_CONCAT_VECTORS.
3948 auto ConcatMI = dyn_cast<GConcatVectors>(Val: &MI);
3949 if (!ConcatMI) {
3950 return UnableToLegalize;
3951 }
3952
3953 // Compute the scalar type each source will be bitcast to.
3954 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3955 LLT SrcScalTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
3956
3957 // Bail out unless the resulting G_BUILD_VECTOR is legal.
3958 if (!LI.isLegal(Query: {TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3959 return UnableToLegalize;
3960 }
3961
3962 // Bitcast the sources
3963 SmallVector<Register> BitcastRegs;
3964 for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3965 BitcastRegs.push_back(
3966 Elt: MIRBuilder.buildBitcast(Dst: SrcScalTy, Src: ConcatMI->getSourceReg(I: i))
3967 .getReg(Idx: 0));
3968 }
3969
3970 // Build the scalar values into a vector
3971 Register BuildReg =
3972 MIRBuilder.buildBuildVector(Res: CastTy, Ops: BitcastRegs).getReg(Idx: 0);
3973 MIRBuilder.buildBitcast(Dst: DstReg, Src: BuildReg);
3974
3975 MI.eraseFromParent();
3976 return Legalized;
3977}
3978
3979 // This bitcasts a shuffle vector to a different type, currently only one of
3980 // the same element size. Mostly used to legalize ptr vectors, where
3981 // ptrtoint/inttoptr will be used instead.
3982 //
3983 // <16 x p0> = G_SHUFFLE_VECTOR <4 x p0>, <4 x p0>, mask
3984 // ===>
3985 // <4 x s64> = G_PTRTOINT <4 x p0>
3986 // <4 x s64> = G_PTRTOINT <4 x p0>
3987 // <16 x s64> = G_SHUFFLE_VECTOR <4 x s64>, <4 x s64>, mask
3988// <16 x p0> = G_INTTOPTR <16 x s64>
3989LegalizerHelper::LegalizeResult
3990LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3991 LLT CastTy) {
3992 auto ShuffleMI = cast<GShuffleVector>(Val: &MI);
3993 LLT DstTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 0));
3994 LLT SrcTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 1));
3995
3996 // We currently only handle vectors of the same size.
3997 if (TypeIdx != 0 ||
3998 CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
3999 CastTy.getElementCount() != DstTy.getElementCount())
4000 return UnableToLegalize;
4001
4002 LLT NewSrcTy = SrcTy.changeElementType(NewEltTy: CastTy.getScalarType());
4003
4004 auto Inp1 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 1));
4005 auto Inp2 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 2));
4006 auto Shuf =
4007 MIRBuilder.buildShuffleVector(Res: CastTy, Src1: Inp1, Src2: Inp2, Mask: ShuffleMI->getMask());
4008 MIRBuilder.buildCast(Dst: ShuffleMI->getReg(Idx: 0), Src: Shuf);
4009
4010 MI.eraseFromParent();
4011 return Legalized;
4012}
4013
4014/// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
4015///
4016/// <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
4017///
4018/// ===>
4019///
4020 /// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
4021 /// <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i8>, N / 8
4022/// <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
4023LegalizerHelper::LegalizeResult
4024LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
4025 LLT CastTy) {
4026 auto ES = cast<GExtractSubvector>(Val: &MI);
4027
4028 if (!CastTy.isVector())
4029 return UnableToLegalize;
4030
4031 if (TypeIdx != 0)
4032 return UnableToLegalize;
4033
4034 Register Dst = ES->getReg(Idx: 0);
4035 Register Src = ES->getSrcVec();
4036 uint64_t Idx = ES->getIndexImm();
4037
4038 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4039
4040 LLT DstTy = MRI.getType(Reg: Dst);
4041 LLT SrcTy = MRI.getType(Reg: Src);
4042 ElementCount DstTyEC = DstTy.getElementCount();
4043 ElementCount SrcTyEC = SrcTy.getElementCount();
4044 auto DstTyMinElts = DstTyEC.getKnownMinValue();
4045 auto SrcTyMinElts = SrcTyEC.getKnownMinValue();
4046
4047 if (DstTy == CastTy)
4048 return Legalized;
4049
4050 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
4051 return UnableToLegalize;
4052
4053 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
4054 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
4055 if (CastEltSize < DstEltSize)
4056 return UnableToLegalize;
4057
4058 auto AdjustAmt = CastEltSize / DstEltSize;
4059 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
4060 SrcTyMinElts % AdjustAmt != 0)
4061 return UnableToLegalize;
4062
4063 Idx /= AdjustAmt;
4064 SrcTy = LLT::vector(EC: SrcTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
4065 auto CastVec = MIRBuilder.buildBitcast(Dst: SrcTy, Src);
4066 auto PromotedES = MIRBuilder.buildExtractSubvector(Res: CastTy, Src: CastVec, Index: Idx);
4067 MIRBuilder.buildBitcast(Dst, Src: PromotedES);
4068
4069 ES->eraseFromParent();
4070 return Legalized;
4071}
4072
4073/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
4074///
4075/// <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
4076/// <vscale x 8 x i1>,
4077/// N
4078///
4079/// ===>
4080///
4081/// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
4082/// <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
4083/// <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
4084/// <vscale x 1 x i8>, N / 8
4085/// <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
4086LegalizerHelper::LegalizeResult
4087LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
4088 LLT CastTy) {
4089 auto ES = cast<GInsertSubvector>(Val: &MI);
4090
4091 if (!CastTy.isVector())
4092 return UnableToLegalize;
4093
4094 if (TypeIdx != 0)
4095 return UnableToLegalize;
4096
4097 Register Dst = ES->getReg(Idx: 0);
4098 Register BigVec = ES->getBigVec();
4099 Register SubVec = ES->getSubVec();
4100 uint64_t Idx = ES->getIndexImm();
4101
4102 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4103
4104 LLT DstTy = MRI.getType(Reg: Dst);
4105 LLT BigVecTy = MRI.getType(Reg: BigVec);
4106 LLT SubVecTy = MRI.getType(Reg: SubVec);
4107
4108 if (DstTy == CastTy)
4109 return Legalized;
4110
4111 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
4112 return UnableToLegalize;
4113
4114 ElementCount DstTyEC = DstTy.getElementCount();
4115 ElementCount BigVecTyEC = BigVecTy.getElementCount();
4116 ElementCount SubVecTyEC = SubVecTy.getElementCount();
4117 auto DstTyMinElts = DstTyEC.getKnownMinValue();
4118 auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
4119 auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();
4120
4121 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
4122 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
4123 if (CastEltSize < DstEltSize)
4124 return UnableToLegalize;
4125
4126 auto AdjustAmt = CastEltSize / DstEltSize;
4127 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
4128 BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
4129 return UnableToLegalize;
4130
4131 Idx /= AdjustAmt;
4132 BigVecTy = LLT::vector(EC: BigVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
4133 SubVecTy = LLT::vector(EC: SubVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
4134 auto CastBigVec = MIRBuilder.buildBitcast(Dst: BigVecTy, Src: BigVec);
4135 auto CastSubVec = MIRBuilder.buildBitcast(Dst: SubVecTy, Src: SubVec);
4136 auto PromotedIS =
4137 MIRBuilder.buildInsertSubvector(Res: CastTy, Src0: CastBigVec, Src1: CastSubVec, Index: Idx);
4138 MIRBuilder.buildBitcast(Dst, Src: PromotedIS);
4139
4140 ES->eraseFromParent();
4141 return Legalized;
4142}
4143
4144LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
4145 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
4146 Register DstReg = LoadMI.getDstReg();
4147 Register PtrReg = LoadMI.getPointerReg();
4148 LLT DstTy = MRI.getType(Reg: DstReg);
4149 MachineMemOperand &MMO = LoadMI.getMMO();
4150 LLT MemTy = MMO.getMemoryType();
4151 MachineFunction &MF = MIRBuilder.getMF();
4152
4153 unsigned MemSizeInBits = MemTy.getSizeInBits();
4154 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
4155
4156 if (MemSizeInBits != MemStoreSizeInBits) {
4157 if (MemTy.isVector())
4158 return UnableToLegalize;
4159
4160 // Promote to a byte-sized load if not loading an integral number of
4161 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
4162 LLT WideMemTy = LLT::scalar(SizeInBits: MemStoreSizeInBits);
4163 MachineMemOperand *NewMMO =
4164 MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideMemTy);
4165
4166 Register LoadReg = DstReg;
4167 LLT LoadTy = DstTy;
4168
4169 // If this wasn't already an extending load, we need to widen the result
4170 // register to avoid creating a load with a narrower result than the source.
4171 if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
4172 LoadTy = WideMemTy;
4173 LoadReg = MRI.createGenericVirtualRegister(Ty: WideMemTy);
4174 }
4175
4176 if (isa<GSExtLoad>(Val: LoadMI)) {
4177 auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
4178 MIRBuilder.buildSExtInReg(Res: LoadReg, Op: NewLoad, ImmOp: MemSizeInBits);
4179 } else if (isa<GZExtLoad>(Val: LoadMI) || WideMemTy == LoadTy) {
4180 auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
4181 // The extra bits are guaranteed to be zero, since we stored them that
4182 // way. A zext load from Wide thus automatically gives zext from MemVT.
4183 MIRBuilder.buildAssertZExt(Res: LoadReg, Op: NewLoad, Size: MemSizeInBits);
4184 } else {
4185 MIRBuilder.buildLoad(Res: LoadReg, Addr: PtrReg, MMO&: *NewMMO);
4186 }
4187
4188 if (DstTy != LoadTy)
4189 MIRBuilder.buildTrunc(Res: DstReg, Op: LoadReg);
4190
4191 LoadMI.eraseFromParent();
4192 return Legalized;
4193 }
4194
4195 // Big endian lowering not implemented.
4196 if (MIRBuilder.getDataLayout().isBigEndian())
4197 return UnableToLegalize;
4198
4199 // This load needs splitting into power of 2 sized loads.
4200 //
4201 // Our strategy here is to generate anyextending loads for the smaller
4202 // types up to next power-2 result type, and then combine the two larger
4203 // result values together, before truncating back down to the non-pow-2
4204 // type.
4205 // E.g. v1 = i24 load =>
4206 // v2 = i32 zextload (2 byte)
4207 // v3 = i32 load (1 byte)
4208 // v4 = i32 shl v3, 16
4209 // v5 = i32 or v4, v2
4210 // v1 = i24 trunc v5
4211 // By doing this we generate the correct truncate which should get
4212 // combined away as an artifact with a matching extend.
4213
4214 uint64_t LargeSplitSize, SmallSplitSize;
4215
4216 if (!isPowerOf2_32(Value: MemSizeInBits)) {
4217 // This load needs splitting into power of 2 sized loads.
4218 LargeSplitSize = llvm::bit_floor(Value: MemSizeInBits);
4219 SmallSplitSize = MemSizeInBits - LargeSplitSize;
4220 } else {
4221 // This is already a power of 2, but we still need to split this in half.
4222 //
4223 // Assume we're being asked to decompose an unaligned load.
4224 // TODO: If this requires multiple splits, handle them all at once.
4225 auto &Ctx = MF.getFunction().getContext();
4226 if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
4227 return UnableToLegalize;
4228
4229 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
4230 }
4231
4232 if (MemTy.isVector()) {
4233 // TODO: Handle vector extloads
4234 if (MemTy != DstTy)
4235 return UnableToLegalize;
4236
4237 Align Alignment = LoadMI.getAlign();
4238 // Given an alignment larger than the size of the memory, we can increase
4239 // the size of the load without needing to scalarize it.
4240 if (Alignment.value() * 8 > MemSizeInBits &&
4241 isPowerOf2_64(Value: DstTy.getScalarSizeInBits())) {
4242 LLT MoreTy = DstTy.changeVectorElementCount(
4243 EC: ElementCount::getFixed(MinVal: NextPowerOf2(A: DstTy.getNumElements())));
4244 MachineMemOperand *NewMMO = MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: MoreTy);
4245 auto NewLoad = MIRBuilder.buildLoad(Res: MoreTy, Addr: PtrReg, MMO&: *NewMMO);
4246 MIRBuilder.buildDeleteTrailingVectorElements(Res: LoadMI.getReg(Idx: 0),
4247 Op0: NewLoad.getReg(Idx: 0));
4248 LoadMI.eraseFromParent();
4249 return Legalized;
4250 }
4251
4252 // TODO: We can do better than scalarizing the vector and at least split it
4253 // in half.
4254 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx: 0, NarrowTy: DstTy.getElementType());
4255 }
4256
4257 MachineMemOperand *LargeMMO =
4258 MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
4259 MachineMemOperand *SmallMMO =
4260 MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
4261
4262 LLT PtrTy = MRI.getType(Reg: PtrReg);
4263 unsigned AnyExtSize = PowerOf2Ceil(A: DstTy.getSizeInBits());
4264 LLT AnyExtTy = LLT::scalar(SizeInBits: AnyExtSize);
4265 auto LargeLoad = MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_ZEXTLOAD, Res: AnyExtTy,
4266 Addr: PtrReg, MMO&: *LargeMMO);
4267
4268 auto OffsetCst = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()),
4269 Val: LargeSplitSize / 8);
4270 Register PtrAddReg = MRI.createGenericVirtualRegister(Ty: PtrTy);
4271 auto SmallPtr = MIRBuilder.buildObjectPtrOffset(Res: PtrAddReg, Op0: PtrReg, Op1: OffsetCst);
4272 auto SmallLoad = MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: AnyExtTy,
4273 Addr: SmallPtr, MMO&: *SmallMMO);
4274
4275 auto ShiftAmt = MIRBuilder.buildConstant(Res: AnyExtTy, Val: LargeSplitSize);
4276 auto Shift = MIRBuilder.buildShl(Dst: AnyExtTy, Src0: SmallLoad, Src1: ShiftAmt);
4277
4278 if (AnyExtTy == DstTy)
4279 MIRBuilder.buildOr(Dst: DstReg, Src0: Shift, Src1: LargeLoad);
4280 else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
4281 auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
4282 MIRBuilder.buildTrunc(Res: DstReg, Op: {Or});
4283 } else {
4284 assert(DstTy.isPointer() && "expected pointer");
4285 auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
4286
4287 // FIXME: We currently consider this to be illegal for non-integral address
4288 // spaces, but we still need a way to reinterpret the bits.
4289 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
4290 }
4291
4292 LoadMI.eraseFromParent();
4293 return Legalized;
4294}
4295
4296LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
4297 // Lower a non-power of 2 store into multiple pow-2 stores.
4298 // E.g. split an i24 store into an i16 store + i8 store.
4299 // We do this by first extending the stored value to the next largest power
4300 // of 2 type, and then using truncating stores to store the components.
4301 // As with G_LOAD, this generates an extend that can be artifact-combined
4302 // away instead of leaving behind extracts.
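// A sketch of the resulting MIR for a hypothetical s24 store:
//   %ext:_(s32) = G_ANYEXT %val:_(s24)
//   G_STORE %ext(s32), %ptr :: (store (s16))
//   %hi:_(s32) = G_LSHR %ext, 16
//   %ptr2:_(p0) = G_PTR_ADD %ptr, 2
//   G_STORE %hi(s32), %ptr2 :: (store (s8))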
4303 Register SrcReg = StoreMI.getValueReg();
4304 Register PtrReg = StoreMI.getPointerReg();
4305 LLT SrcTy = MRI.getType(Reg: SrcReg);
4306 MachineFunction &MF = MIRBuilder.getMF();
4307 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4308 LLT MemTy = MMO.getMemoryType();
4309
4310 unsigned StoreWidth = MemTy.getSizeInBits();
4311 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
4312
4313 if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
4314 // Promote to a byte-sized store with upper bits zero if not
4315 // storing an integral number of bytes. For example, promote
4316 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
4317 LLT WideTy = LLT::scalar(SizeInBits: StoreSizeInBits);
4318
4319 if (StoreSizeInBits > SrcTy.getSizeInBits()) {
4320 // Avoid creating a store with a narrower source than result.
4321 SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
4322 SrcTy = WideTy;
4323 }
4324
4325 auto ZextInReg = MIRBuilder.buildZExtInReg(Res: SrcTy, Op: SrcReg, ImmOp: StoreWidth);
4326
4327 MachineMemOperand *NewMMO =
4328 MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideTy);
4329 MIRBuilder.buildStore(Val: ZextInReg, Addr: PtrReg, MMO&: *NewMMO);
4330 StoreMI.eraseFromParent();
4331 return Legalized;
4332 }
4333
4334 if (MemTy.isVector()) {
4335 if (MemTy != SrcTy)
4336 return scalarizeVectorBooleanStore(MI&: StoreMI);
4337
4338 // TODO: We can do better than scalarizing the vector and at least split it
4339 // in half.
4340 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy: SrcTy.getElementType());
4341 }
4342
4343 unsigned MemSizeInBits = MemTy.getSizeInBits();
4344 uint64_t LargeSplitSize, SmallSplitSize;
4345
4346 if (!isPowerOf2_32(Value: MemSizeInBits)) {
4347 LargeSplitSize = llvm::bit_floor<uint64_t>(Value: MemTy.getSizeInBits());
4348 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
4349 } else {
4350 auto &Ctx = MF.getFunction().getContext();
4351 if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
4352 return UnableToLegalize; // Don't know what we're being asked to do.
4353
4354 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
4355 }
4356
4357 // Extend to the next pow-2. If this store was itself the result of lowering,
4358 // e.g. an s56 store being broken into s32 + s24, we might have a stored type
4359 // that's wider than the stored size.
4360 unsigned AnyExtSize = PowerOf2Ceil(A: MemTy.getSizeInBits());
4361 const LLT NewSrcTy = LLT::scalar(SizeInBits: AnyExtSize);
4362
4363 if (SrcTy.isPointer()) {
4364 const LLT IntPtrTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
4365 SrcReg = MIRBuilder.buildPtrToInt(Dst: IntPtrTy, Src: SrcReg).getReg(Idx: 0);
4366 }
4367
4368 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(Res: NewSrcTy, Op: SrcReg);
4369
4370 // Obtain the smaller value by shifting away the larger value.
4371 auto ShiftAmt = MIRBuilder.buildConstant(Res: NewSrcTy, Val: LargeSplitSize);
4372 auto SmallVal = MIRBuilder.buildLShr(Dst: NewSrcTy, Src0: ExtVal, Src1: ShiftAmt);
4373
4374 // Generate the PtrAdd and truncating stores.
4375 LLT PtrTy = MRI.getType(Reg: PtrReg);
4376 auto OffsetCst = MIRBuilder.buildConstant(
4377 Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: LargeSplitSize / 8);
4378 auto SmallPtr = MIRBuilder.buildObjectPtrOffset(Res: PtrTy, Op0: PtrReg, Op1: OffsetCst);
4379
4380 MachineMemOperand *LargeMMO =
4381 MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
4382 MachineMemOperand *SmallMMO =
4383 MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
4384 MIRBuilder.buildStore(Val: ExtVal, Addr: PtrReg, MMO&: *LargeMMO);
4385 MIRBuilder.buildStore(Val: SmallVal, Addr: SmallPtr, MMO&: *SmallMMO);
4386 StoreMI.eraseFromParent();
4387 return Legalized;
4388}
4389
4390LegalizerHelper::LegalizeResult
4391LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
4392 Register SrcReg = StoreMI.getValueReg();
4393 Register PtrReg = StoreMI.getPointerReg();
4394 LLT SrcTy = MRI.getType(Reg: SrcReg);
4395 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4396 LLT MemTy = MMO.getMemoryType();
4397 LLT MemScalarTy = MemTy.getElementType();
4398 MachineFunction &MF = MIRBuilder.getMF();
4399
4400 assert(SrcTy.isVector() && "Expect a vector store type");
4401
4402 if (!MemScalarTy.isByteSized()) {
4403 // We need to build an integer scalar of the vector bit pattern.
4404 // It's not legal for us to add padding when storing a vector.
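    // Illustrative sketch (little-endian): a <4 x s8> value stored as
    // <4 x s1> in memory is packed into a single s4 integer, roughly:
    //   %e:_(s8)   = G_EXTRACT_VECTOR_ELT %src, I
    //   %t:_(s1)   = G_TRUNC %e
    //   %z:_(s4)   = G_ZEXT %t
    //   %s:_(s4)   = G_SHL %z, I      ; big-endian reverses the index
    //   %acc:_(s4) = G_OR %acc, %s    ; accumulated over all 4 elements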
4405 unsigned NumBits = MemTy.getSizeInBits();
4406 LLT IntTy = LLT::scalar(SizeInBits: NumBits);
4407 auto CurrVal = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
4408 LLT IdxTy = TLI.getVectorIdxLLT(DL: MF.getDataLayout());
4409
4410 for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
4411 auto Elt = MIRBuilder.buildExtractVectorElement(
4412 Res: SrcTy.getElementType(), Val: SrcReg, Idx: MIRBuilder.buildConstant(Res: IdxTy, Val: I));
4413 auto Trunc = MIRBuilder.buildTrunc(Res: MemScalarTy, Op: Elt);
4414 auto ZExt = MIRBuilder.buildZExt(Res: IntTy, Op: Trunc);
4415 unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
4416 ? (MemTy.getNumElements() - 1) - I
4417 : I;
4418 auto ShiftAmt = MIRBuilder.buildConstant(
4419 Res: IntTy, Val: ShiftIntoIdx * MemScalarTy.getSizeInBits());
4420 auto Shifted = MIRBuilder.buildShl(Dst: IntTy, Src0: ZExt, Src1: ShiftAmt);
4421 CurrVal = MIRBuilder.buildOr(Dst: IntTy, Src0: CurrVal, Src1: Shifted);
4422 }
4423 auto PtrInfo = MMO.getPointerInfo();
4424 auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo, Ty: IntTy);
4425 MIRBuilder.buildStore(Val: CurrVal, Addr: PtrReg, MMO&: *NewMMO);
4426 StoreMI.eraseFromParent();
4427 return Legalized;
4428 }
4429
4430 // TODO: implement simple scalarization.
4431 return UnableToLegalize;
4432}
4433
4434LegalizerHelper::LegalizeResult
4435LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
4436 switch (MI.getOpcode()) {
4437 case TargetOpcode::G_LOAD: {
4438 if (TypeIdx != 0)
4439 return UnableToLegalize;
4440 MachineMemOperand &MMO = **MI.memoperands_begin();
4441
4442 // Not sure how to interpret a bitcast of an extending load.
4443 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4444 return UnableToLegalize;
4445
4446 Observer.changingInstr(MI);
4447 bitcastDst(MI, CastTy, OpIdx: 0);
4448 MMO.setType(CastTy);
4449 // The range metadata is no longer valid when reinterpreted as a different
4450 // type.
4451 MMO.clearRanges();
4452 Observer.changedInstr(MI);
4453 return Legalized;
4454 }
4455 case TargetOpcode::G_STORE: {
4456 if (TypeIdx != 0)
4457 return UnableToLegalize;
4458
4459 MachineMemOperand &MMO = **MI.memoperands_begin();
4460
4461 // Not sure how to interpret a bitcast of a truncating store.
4462 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4463 return UnableToLegalize;
4464
4465 Observer.changingInstr(MI);
4466 bitcastSrc(MI, CastTy, OpIdx: 0);
4467 MMO.setType(CastTy);
4468 Observer.changedInstr(MI);
4469 return Legalized;
4470 }
4471 case TargetOpcode::G_SELECT: {
4472 if (TypeIdx != 0)
4473 return UnableToLegalize;
4474
4475 if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector()) {
4476 LLVM_DEBUG(
4477 dbgs() << "bitcast action not implemented for vector select\n");
4478 return UnableToLegalize;
4479 }
4480
4481 Observer.changingInstr(MI);
4482 bitcastSrc(MI, CastTy, OpIdx: 2);
4483 bitcastSrc(MI, CastTy, OpIdx: 3);
4484 bitcastDst(MI, CastTy, OpIdx: 0);
4485 Observer.changedInstr(MI);
4486 return Legalized;
4487 }
4488 case TargetOpcode::G_AND:
4489 case TargetOpcode::G_OR:
4490 case TargetOpcode::G_XOR: {
4491 Observer.changingInstr(MI);
4492 bitcastSrc(MI, CastTy, OpIdx: 1);
4493 bitcastSrc(MI, CastTy, OpIdx: 2);
4494 bitcastDst(MI, CastTy, OpIdx: 0);
4495 Observer.changedInstr(MI);
4496 return Legalized;
4497 }
4498 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4499 return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
4500 case TargetOpcode::G_INSERT_VECTOR_ELT:
4501 return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
4502 case TargetOpcode::G_CONCAT_VECTORS:
4503 return bitcastConcatVector(MI, TypeIdx, CastTy);
4504 case TargetOpcode::G_SHUFFLE_VECTOR:
4505 return bitcastShuffleVector(MI, TypeIdx, CastTy);
4506 case TargetOpcode::G_EXTRACT_SUBVECTOR:
4507 return bitcastExtractSubvector(MI, TypeIdx, CastTy);
4508 case TargetOpcode::G_INSERT_SUBVECTOR:
4509 return bitcastInsertSubvector(MI, TypeIdx, CastTy);
4510 default:
4511 return UnableToLegalize;
4512 }
4513}
4514
4515// Legalize an instruction by changing the opcode in place.
4516void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
4517 Observer.changingInstr(MI);
4518 MI.setDesc(MIRBuilder.getTII().get(Opcode: NewOpcode));
4519 Observer.changedInstr(MI);
4520}
4521
4522LegalizerHelper::LegalizeResult
4523LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
4524 using namespace TargetOpcode;
4525
4526 switch(MI.getOpcode()) {
4527 default:
4528 return UnableToLegalize;
4529 case TargetOpcode::G_FCONSTANT:
4530 return lowerFConstant(MI);
4531 case TargetOpcode::G_BITCAST:
4532 return lowerBitcast(MI);
4533 case TargetOpcode::G_SREM:
4534 case TargetOpcode::G_UREM: {
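    // Lower to a div-mul-sub sequence via the identity x % y == x - (x/y)*y,
    // e.g. (illustrative, s32, unsigned):
    //   %q:_(s32) = G_UDIV %x, %y
    //   %p:_(s32) = G_MUL %q, %y
    //   %r:_(s32) = G_SUB %x, %p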
4535 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4536 auto Quot =
4537 MIRBuilder.buildInstr(Opc: MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, DstOps: {Ty},
4538 SrcOps: {MI.getOperand(i: 1), MI.getOperand(i: 2)});
4539
4540 auto Prod = MIRBuilder.buildMul(Dst: Ty, Src0: Quot, Src1: MI.getOperand(i: 2));
4541 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: Prod);
4542 MI.eraseFromParent();
4543 return Legalized;
4544 }
4545 case TargetOpcode::G_SADDO:
4546 case TargetOpcode::G_SSUBO:
4547 return lowerSADDO_SSUBO(MI);
4548 case TargetOpcode::G_SADDE:
4549 return lowerSADDE(MI);
4550 case TargetOpcode::G_SSUBE:
4551 return lowerSSUBE(MI);
4552 case TargetOpcode::G_UMULH:
4553 case TargetOpcode::G_SMULH:
4554 return lowerSMULH_UMULH(MI);
4555 case TargetOpcode::G_SMULO:
4556 case TargetOpcode::G_UMULO: {
4557 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
4558 // result.
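    // e.g. (illustrative) G_SMULO on s32 expands to roughly:
    //   %res:_(s32) = G_MUL %lhs, %rhs
    //   %hi:_(s32) = G_SMULH %lhs, %rhs
    //   %sign:_(s32) = G_ASHR %res, 31
    //   %ov:_(s1) = G_ICMP intpred(ne), %hi, %sign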
4559 auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
4560 LLT Ty = MRI.getType(Reg: Res);
4561
4562 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
4563 ? TargetOpcode::G_SMULH
4564 : TargetOpcode::G_UMULH;
4565
4566 Observer.changingInstr(MI);
4567 const auto &TII = MIRBuilder.getTII();
4568 MI.setDesc(TII.get(Opcode: TargetOpcode::G_MUL));
4569 MI.removeOperand(OpNo: 1);
4570 Observer.changedInstr(MI);
4571
4572 auto HiPart = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {Ty}, SrcOps: {LHS, RHS});
4573 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
4574
4575 // Move insert point forward so we can use the Res register if needed.
4576 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
4577
4578 // For *signed* multiply, overflow is detected by checking:
4579 // (hi != (lo >> bitwidth-1))
4580 if (Opcode == TargetOpcode::G_SMULH) {
4581 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: Ty.getSizeInBits() - 1);
4582 auto Shifted = MIRBuilder.buildAShr(Dst: Ty, Src0: Res, Src1: ShiftAmt);
4583 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Shifted);
4584 } else {
4585 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Zero);
4586 }
4587 return Legalized;
4588 }
4589 case TargetOpcode::G_FNEG: {
4590 auto [Res, SubByReg] = MI.getFirst2Regs();
4591 LLT Ty = MRI.getType(Reg: Res);
4592
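    // fneg(x) == x ^ sign-mask, e.g. (illustrative, s32):
    //   %mask:_(s32) = G_CONSTANT i32 0x80000000
    //   %res:_(s32) = G_XOR %x, %mask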
4593 auto SignMask = MIRBuilder.buildConstant(
4594 Res: Ty, Val: APInt::getSignMask(BitWidth: Ty.getScalarSizeInBits()));
4595 MIRBuilder.buildXor(Dst: Res, Src0: SubByReg, Src1: SignMask);
4596 MI.eraseFromParent();
4597 return Legalized;
4598 }
4599 case TargetOpcode::G_FSUB:
4600 case TargetOpcode::G_STRICT_FSUB: {
4601 auto [Res, LHS, RHS] = MI.getFirst3Regs();
4602 LLT Ty = MRI.getType(Reg: Res);
4603
4604 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
4605 auto Neg = MIRBuilder.buildFNeg(Dst: Ty, Src0: RHS);
4606
4607 if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
4608 MIRBuilder.buildStrictFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
4609 else
4610 MIRBuilder.buildFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
4611
4612 MI.eraseFromParent();
4613 return Legalized;
4614 }
4615 case TargetOpcode::G_FMAD:
4616 return lowerFMad(MI);
4617 case TargetOpcode::G_FFLOOR:
4618 return lowerFFloor(MI);
4619 case TargetOpcode::G_LROUND:
4620 case TargetOpcode::G_LLROUND: {
4621 Register DstReg = MI.getOperand(i: 0).getReg();
4622 Register SrcReg = MI.getOperand(i: 1).getReg();
4623 LLT SrcTy = MRI.getType(Reg: SrcReg);
4624 auto Round = MIRBuilder.buildInstr(Opc: TargetOpcode::G_INTRINSIC_ROUND, DstOps: {SrcTy},
4625 SrcOps: {SrcReg});
4626 MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
4627 MI.eraseFromParent();
4628 return Legalized;
4629 }
4630 case TargetOpcode::G_INTRINSIC_ROUND:
4631 return lowerIntrinsicRound(MI);
4632 case TargetOpcode::G_FRINT: {
4633 // Since round even is the assumed rounding mode for unconstrained FP
4634 // operations, rint and roundeven are the same operation.
4635 changeOpcode(MI, NewOpcode: TargetOpcode::G_INTRINSIC_ROUNDEVEN);
4636 return Legalized;
4637 }
4638 case TargetOpcode::G_INTRINSIC_LRINT:
4639 case TargetOpcode::G_INTRINSIC_LLRINT: {
4640 Register DstReg = MI.getOperand(i: 0).getReg();
4641 Register SrcReg = MI.getOperand(i: 1).getReg();
4642 LLT SrcTy = MRI.getType(Reg: SrcReg);
4643 auto Round =
4644 MIRBuilder.buildInstr(Opc: TargetOpcode::G_FRINT, DstOps: {SrcTy}, SrcOps: {SrcReg});
4645 MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
4646 MI.eraseFromParent();
4647 return Legalized;
4648 }
4649 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
4650 auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
4651 Register NewOldValRes = MRI.cloneVirtualRegister(VReg: OldValRes);
4652 MIRBuilder.buildAtomicCmpXchg(OldValRes: NewOldValRes, Addr, CmpVal, NewVal,
4653 MMO&: **MI.memoperands_begin());
4654 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: SuccessRes, Op0: NewOldValRes, Op1: CmpVal);
4655 MIRBuilder.buildCopy(Res: OldValRes, Op: NewOldValRes);
4656 MI.eraseFromParent();
4657 return Legalized;
4658 }
4659 case TargetOpcode::G_LOAD:
4660 case TargetOpcode::G_SEXTLOAD:
4661 case TargetOpcode::G_ZEXTLOAD:
4662 return lowerLoad(LoadMI&: cast<GAnyLoad>(Val&: MI));
4663 case TargetOpcode::G_STORE:
4664 return lowerStore(StoreMI&: cast<GStore>(Val&: MI));
4665 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
4666 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
4667 case TargetOpcode::G_CTLZ:
4668 case TargetOpcode::G_CTTZ:
4669 case TargetOpcode::G_CTPOP:
4670 case TargetOpcode::G_CTLS:
4671 return lowerBitCount(MI);
4672 case G_UADDO: {
4673 auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
4674
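    // An unsigned add wraps iff the result is smaller than an operand;
    // comparing against RHS is sufficient, e.g. (illustrative, s32):
    //   %res:_(s32) = G_ADD %lhs, %rhs
    //   %carry:_(s1) = G_ICMP intpred(ult), %res, %rhs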
4675 Register NewRes = MRI.cloneVirtualRegister(VReg: Res);
4676
4677 MIRBuilder.buildAdd(Dst: NewRes, Src0: LHS, Src1: RHS);
4678 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CarryOut, Op0: NewRes, Op1: RHS);
4679
4680 MIRBuilder.buildCopy(Res, Op: NewRes);
4681
4682 MI.eraseFromParent();
4683 return Legalized;
4684 }
4685 case G_UADDE: {
4686 auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
4687 const LLT CondTy = MRI.getType(Reg: CarryOut);
4688 const LLT Ty = MRI.getType(Reg: Res);
4689
4690 Register NewRes = MRI.cloneVirtualRegister(VReg: Res);
4691
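    // Illustrative overall expansion (s32):
    //   %t:_(s32) = G_ADD %lhs, %rhs
    //   %c1:_(s1) = G_ICMP intpred(ult), %t, %lhs
    //   %zcin:_(s32) = G_ZEXT %cin
    //   %res:_(s32) = G_ADD %t, %zcin
    //   %eq0:_(s1) = G_ICMP intpred(eq), %res, 0
    //   %c2:_(s1) = G_AND %eq0, %cin
    //   %cout:_(s1) = G_OR %c1, %c2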
4692 // Initial add of the two operands.
4693 auto TmpRes = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);
4694
4695 // Initial check for carry.
4696 auto Carry = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CondTy, Op0: TmpRes, Op1: LHS);
4697
4698 // Add the sum and the carry.
4699 auto ZExtCarryIn = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
4700 MIRBuilder.buildAdd(Dst: NewRes, Src0: TmpRes, Src1: ZExtCarryIn);
4701
4702 // Second check for carry. We can only carry if the initial sum is all 1s
4703 // and the carry is set, resulting in a new sum of 0.
4704 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
4705 auto ResEqZero =
4706 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: NewRes, Op1: Zero);
4707 auto Carry2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: ResEqZero, Src1: CarryIn);
4708 MIRBuilder.buildOr(Dst: CarryOut, Src0: Carry, Src1: Carry2);
4709
4710 MIRBuilder.buildCopy(Res, Op: NewRes);
4711
4712 MI.eraseFromParent();
4713 return Legalized;
4714 }
4715 case G_USUBO: {
4716 auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
4717
4718 MIRBuilder.buildSub(Dst: Res, Src0: LHS, Src1: RHS);
4719 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: BorrowOut, Op0: LHS, Op1: RHS);
4720
4721 MI.eraseFromParent();
4722 return Legalized;
4723 }
4724 case G_USUBE: {
4725 auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
4726 const LLT CondTy = MRI.getType(Reg: BorrowOut);
4727 const LLT Ty = MRI.getType(Reg: Res);
4728
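    // Mirrors the G_UADDE expansion above: subtract, check for wrap, subtract
    // the zero-extended borrow, then recompute the borrow-out roughly as
    //   %bout:_(s1) = %b1 | ((%t == 0) & %bin)   (illustrative)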
4729 // Initial subtract of the two operands.
4730 auto TmpRes = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS);
4731
4732 // Initial check for borrow.
4733 auto Borrow = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: CondTy, Op0: TmpRes, Op1: LHS);
4734
4735 // Subtract the borrow from the first subtract.
4736 auto ZExtBorrowIn = MIRBuilder.buildZExt(Res: Ty, Op: BorrowIn);
4737 MIRBuilder.buildSub(Dst: Res, Src0: TmpRes, Src1: ZExtBorrowIn);
4738
4739 // Second check for borrow. We can only borrow if the initial difference is
4740 // 0 and the borrow is set, resulting in a new difference of all 1s.
4741 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
4742 auto TmpResEqZero =
4743 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: TmpRes, Op1: Zero);
4744 auto Borrow2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: TmpResEqZero, Src1: BorrowIn);
4745 MIRBuilder.buildOr(Dst: BorrowOut, Src0: Borrow, Src1: Borrow2);
4746
4747 MI.eraseFromParent();
4748 return Legalized;
4749 }
4750 case G_UITOFP:
4751 return lowerUITOFP(MI);
4752 case G_SITOFP:
4753 return lowerSITOFP(MI);
4754 case G_FPTOUI:
4755 return lowerFPTOUI(MI);
4756 case G_FPTOSI:
4757 return lowerFPTOSI(MI);
4758 case G_FPTOUI_SAT:
4759 case G_FPTOSI_SAT:
4760 return lowerFPTOINT_SAT(MI);
4761 case G_FPTRUNC:
4762 return lowerFPTRUNC(MI);
4763 case G_FPOWI:
4764 return lowerFPOWI(MI);
4765 case G_SMIN:
4766 case G_SMAX:
4767 case G_UMIN:
4768 case G_UMAX:
4769 return lowerMinMax(MI);
4770 case G_SCMP:
4771 case G_UCMP:
4772 return lowerThreewayCompare(MI);
4773 case G_FCOPYSIGN:
4774 return lowerFCopySign(MI);
4775 case G_FMINNUM:
4776 case G_FMAXNUM:
4777 case G_FMINIMUMNUM:
4778 case G_FMAXIMUMNUM:
4779 return lowerFMinNumMaxNum(MI);
4780 case G_FMINIMUM:
4781 case G_FMAXIMUM:
4782 return lowerFMinimumMaximum(MI);
4783 case G_MERGE_VALUES:
4784 return lowerMergeValues(MI);
4785 case G_UNMERGE_VALUES:
4786 return lowerUnmergeValues(MI);
4787 case TargetOpcode::G_SEXT_INREG: {
4788 assert(MI.getOperand(2).isImm() && "Expected immediate");
4789 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
4790
4791 auto [DstReg, SrcReg] = MI.getFirst2Regs();
4792 LLT DstTy = MRI.getType(Reg: DstReg);
4793 Register TmpRes = MRI.createGenericVirtualRegister(Ty: DstTy);
4794
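    // Shift the field to the top and arithmetic-shift it back down, e.g.
    // (illustrative) G_SEXT_INREG %src(s32), 8 becomes:
    //   %shl:_(s32) = G_SHL %src, 24
    //   %dst:_(s32) = G_ASHR %shl, 24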
4795 auto MIBSz = MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - SizeInBits);
4796 MIRBuilder.buildShl(Dst: TmpRes, Src0: SrcReg, Src1: MIBSz->getOperand(i: 0));
4797 MIRBuilder.buildAShr(Dst: DstReg, Src0: TmpRes, Src1: MIBSz->getOperand(i: 0));
4798 MI.eraseFromParent();
4799 return Legalized;
4800 }
4801 case G_EXTRACT_VECTOR_ELT:
4802 case G_INSERT_VECTOR_ELT:
4803 return lowerExtractInsertVectorElt(MI);
4804 case G_SHUFFLE_VECTOR:
4805 return lowerShuffleVector(MI);
4806 case G_VECTOR_COMPRESS:
4807 return lowerVECTOR_COMPRESS(MI);
4808 case G_DYN_STACKALLOC:
4809 return lowerDynStackAlloc(MI);
4810 case G_STACKSAVE:
4811 return lowerStackSave(MI);
4812 case G_STACKRESTORE:
4813 return lowerStackRestore(MI);
4814 case G_EXTRACT:
4815 return lowerExtract(MI);
4816 case G_INSERT:
4817 return lowerInsert(MI);
4818 case G_BSWAP:
4819 return lowerBswap(MI);
4820 case G_BITREVERSE:
4821 return lowerBitreverse(MI);
4822 case G_READ_REGISTER:
4823 case G_WRITE_REGISTER:
4824 return lowerReadWriteRegister(MI);
4825 case G_UADDSAT:
4826 case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this by requesting custom lowering and calling the
    // implementation functions directly.
4830 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4831 if (LI.isLegalOrCustom(Query: {G_UMIN, Ty}))
4832 return lowerAddSubSatToMinMax(MI);
4833 return lowerAddSubSatToAddoSubo(MI);
4834 }
4835 case G_SADDSAT:
4836 case G_SSUBSAT: {
4837 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4838
4839 // FIXME: It would probably make more sense to see if G_SADDO is preferred,
4840 // since it's a shorter expansion. However, we would need to figure out the
4841 // preferred boolean type for the carry out for the query.
4842 if (LI.isLegalOrCustom(Query: {G_SMIN, Ty}) && LI.isLegalOrCustom(Query: {G_SMAX, Ty}))
4843 return lowerAddSubSatToMinMax(MI);
4844 return lowerAddSubSatToAddoSubo(MI);
4845 }
4846 case G_SSHLSAT:
4847 case G_USHLSAT:
4848 return lowerShlSat(MI);
4849 case G_ABS:
4850 return lowerAbsToAddXor(MI);
4851 case G_ABDS:
4852 case G_ABDU: {
4853 bool IsSigned = MI.getOpcode() == G_ABDS;
4854 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4855 if ((IsSigned && LI.isLegal(Query: {G_SMIN, Ty}) && LI.isLegal(Query: {G_SMAX, Ty})) ||
4856 (!IsSigned && LI.isLegal(Query: {G_UMIN, Ty}) && LI.isLegal(Query: {G_UMAX, Ty}))) {
4857 return lowerAbsDiffToMinMax(MI);
4858 }
4859 return lowerAbsDiffToSelect(MI);
4860 }
4861 case G_FABS:
4862 return lowerFAbs(MI);
4863 case G_SELECT:
4864 return lowerSelect(MI);
4865 case G_IS_FPCLASS:
4866 return lowerISFPCLASS(MI);
4867 case G_SDIVREM:
4868 case G_UDIVREM:
4869 return lowerDIVREM(MI);
4870 case G_FSHL:
4871 case G_FSHR:
4872 return lowerFunnelShift(MI);
4873 case G_ROTL:
4874 case G_ROTR:
4875 return lowerRotate(MI);
4876 case G_MEMSET:
4877 case G_MEMCPY:
4878 case G_MEMMOVE:
4879 return lowerMemCpyFamily(MI);
4880 case G_MEMCPY_INLINE:
4881 return lowerMemcpyInline(MI);
4882 case G_ZEXT:
4883 case G_SEXT:
4884 case G_ANYEXT:
4885 return lowerEXT(MI);
4886 case G_TRUNC:
4887 return lowerTRUNC(MI);
4888 GISEL_VECREDUCE_CASES_NONSEQ
4889 return lowerVectorReduction(MI);
4890 case G_VAARG:
4891 return lowerVAArg(MI);
4892 case G_ATOMICRMW_SUB: {
4893 auto [Ret, Mem, Val] = MI.getFirst3Regs();
4894 const LLT ValTy = MRI.getType(Reg: Val);
4895 MachineMemOperand *MMO = *MI.memoperands_begin();
4896
4897 auto VNeg = MIRBuilder.buildNeg(Dst: ValTy, Src0: Val);
4898 MIRBuilder.buildAtomicRMW(Opcode: G_ATOMICRMW_ADD, OldValRes: Ret, Addr: Mem, Val: VNeg, MMO&: *MMO);
4899 MI.eraseFromParent();
4900 return Legalized;
4901 }
4902 }
4903}
4904
4905Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4906 Align MinAlign) const {
4907 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4908 // datalayout for the preferred alignment. Also there should be a target hook
4909 // for this to allow targets to reduce the alignment and ignore the
4910 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4911 // the type.
4912 return std::max(a: Align(PowerOf2Ceil(A: Ty.getSizeInBytes())), b: MinAlign);
4913}
4914
4915MachineInstrBuilder
4916LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4917 MachinePointerInfo &PtrInfo) {
4918 MachineFunction &MF = MIRBuilder.getMF();
4919 const DataLayout &DL = MIRBuilder.getDataLayout();
4920 int FrameIdx = MF.getFrameInfo().CreateStackObject(Size: Bytes, Alignment, isSpillSlot: false);
4921
4922 unsigned AddrSpace = DL.getAllocaAddrSpace();
4923 LLT FramePtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
4924
4925 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI: FrameIdx);
4926 return MIRBuilder.buildFrameIndex(Res: FramePtrTy, Idx: FrameIdx);
4927}
4928
4929MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4930 const SrcOp &Val) {
4931 LLT SrcTy = Val.getLLTTy(MRI);
4932 Align StackTypeAlign =
4933 std::max(a: getStackTemporaryAlignment(Ty: SrcTy),
4934 b: getStackTemporaryAlignment(Ty: Res.getLLTTy(MRI)));
4935 MachinePointerInfo PtrInfo;
4936 auto StackTemp =
4937 createStackTemporary(Bytes: SrcTy.getSizeInBytes(), Alignment: StackTypeAlign, PtrInfo);
4938
4939 MIRBuilder.buildStore(Val, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4940 return MIRBuilder.buildLoad(Res, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4941}
4942
4943static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4944 LLT VecTy) {
4945 LLT IdxTy = B.getMRI()->getType(Reg: IdxReg);
4946 unsigned NElts = VecTy.getNumElements();
4947
4948 int64_t IdxVal;
4949 if (mi_match(R: IdxReg, MRI: *B.getMRI(), P: m_ICst(Cst&: IdxVal))) {
4950 if (IdxVal < VecTy.getNumElements())
4951 return IdxReg;
4952 // If a constant index would be out of bounds, clamp it as well.
4953 }
4954
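  // A power-of-2 element count lets the clamp be a cheap mask; otherwise fall
  // back to umin, e.g. (illustrative) for 8 elements and an s32 index:
  //   %clamped:_(s32) = G_AND %idx, 7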
4955 if (isPowerOf2_32(Value: NElts)) {
4956 APInt Imm = APInt::getLowBitsSet(numBits: IdxTy.getSizeInBits(), loBitsSet: Log2_32(Value: NElts));
4957 return B.buildAnd(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: Imm)).getReg(Idx: 0);
4958 }
4959
4960 return B.buildUMin(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: NElts - 1))
4961 .getReg(Idx: 0);
4962}
4963
4964Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
4965 Register Index) {
4966 LLT EltTy = VecTy.getElementType();
4967
4968 // Calculate the element offset and add it to the pointer.
4969 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
4970 assert(EltSize * 8 == EltTy.getSizeInBits() &&
4971 "Converting bits to bytes lost precision");
4972
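  // The element address is computed as ptr + clamped-index * element-size,
  // e.g. (illustrative, s32 elements, p0 pointers, s64 index):
  //   %ofs:_(s64) = G_MUL %clampedidx, 4
  //   %eltptr:_(p0) = G_PTR_ADD %vecptr, %ofs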
4973 Index = clampVectorIndex(B&: MIRBuilder, IdxReg: Index, VecTy);
4974
4975 // Convert index to the correct size for the address space.
4976 const DataLayout &DL = MIRBuilder.getDataLayout();
4977 unsigned AS = MRI.getType(Reg: VecPtr).getAddressSpace();
4978 unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
4979 LLT IdxTy = MRI.getType(Reg: Index).changeElementSize(NewEltSize: IndexSizeInBits);
4980 if (IdxTy != MRI.getType(Reg: Index))
4981 Index = MIRBuilder.buildSExtOrTrunc(Res: IdxTy, Op: Index).getReg(Idx: 0);
4982
4983 auto Mul = MIRBuilder.buildMul(Dst: IdxTy, Src0: Index,
4984 Src1: MIRBuilder.buildConstant(Res: IdxTy, Val: EltSize));
4985
4986 LLT PtrTy = MRI.getType(Reg: VecPtr);
4987 return MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VecPtr, Op1: Mul).getReg(Idx: 0);
4988}
4989
4990#ifndef NDEBUG
/// Check that all vector operands have the same number of elements. Other
/// operands should be listed in \p NonVecOpIndices.
4993static bool hasSameNumEltsOnAllVectorOperands(
4994 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4995 std::initializer_list<unsigned> NonVecOpIndices) {
4996 if (MI.getNumMemOperands() != 0)
4997 return false;
4998
4999 LLT VecTy = MRI.getType(MI.getReg(0));
5000 if (!VecTy.isVector())
5001 return false;
5002 unsigned NumElts = VecTy.getNumElements();
5003
5004 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
5005 MachineOperand &Op = MI.getOperand(OpIdx);
5006 if (!Op.isReg()) {
5007 if (!is_contained(NonVecOpIndices, OpIdx))
5008 return false;
5009 continue;
5010 }
5011
5012 LLT Ty = MRI.getType(Op.getReg());
5013 if (!Ty.isVector()) {
5014 if (!is_contained(NonVecOpIndices, OpIdx))
5015 return false;
5016 continue;
5017 }
5018
5019 if (Ty.getNumElements() != NumElts)
5020 return false;
5021 }
5022
5023 return true;
5024}
5025#endif
5026
/// Fill \p DstOps with DstOps that have the same total number of elements as
/// \p Ty. These DstOps have either scalar type when \p NumElts = 1 or are
/// vectors with \p NumElts elements. When Ty.getNumElements() is not a
/// multiple of \p NumElts, the last DstOp (leftover) has fewer than \p NumElts
/// elements.
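/// e.g. (illustrative) Ty = <7 x s8> with NumElts = 4 yields one <4 x s8>
/// DstOp followed by a <3 x s8> leftover DstOp.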
5031static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
5032 unsigned NumElts) {
5033 LLT LeftoverTy;
5034 assert(Ty.isVector() && "Expected vector type");
5035 LLT NarrowTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NumElts));
5036 int NumParts, NumLeftover;
5037 std::tie(args&: NumParts, args&: NumLeftover) =
5038 getNarrowTypeBreakDown(OrigTy: Ty, NarrowTy, LeftoverTy);
5039
5040 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
5041 for (int i = 0; i < NumParts; ++i) {
5042 DstOps.push_back(Elt: NarrowTy);
5043 }
5044
5045 if (LeftoverTy.isValid()) {
5046 assert(NumLeftover == 1 && "expected exactly one leftover");
5047 DstOps.push_back(Elt: LeftoverTy);
5048 }
5049}
5050
/// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N
/// SrcOps made from \p Op depending on the operand type.
5053static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
5054 MachineOperand &Op) {
5055 for (unsigned i = 0; i < N; ++i) {
5056 if (Op.isReg())
5057 Ops.push_back(Elt: Op.getReg());
5058 else if (Op.isImm())
5059 Ops.push_back(Elt: Op.getImm());
5060 else if (Op.isPredicate())
5061 Ops.push_back(Elt: static_cast<CmpInst::Predicate>(Op.getPredicate()));
5062 else
5063 llvm_unreachable("Unsupported type");
5064 }
5065}
5066
5067// Handle splitting vector operations which need to have the same number of
5068// elements in each type index, but each type index may have a different element
5069// type.
5070//
5071// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
5072// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
5073// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
5074//
5075// Also handles some irregular breakdown cases, e.g.
5076// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
5077// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
5078// s64 = G_SHL s64, s32
5079LegalizerHelper::LegalizeResult
5080LegalizerHelper::fewerElementsVectorMultiEltType(
5081 GenericMachineInstr &MI, unsigned NumElts,
5082 std::initializer_list<unsigned> NonVecOpIndices) {
5083 assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
5084 "Non-compatible opcode or not specified non-vector operands");
5085 unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();
5086
5087 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
5088 unsigned NumDefs = MI.getNumDefs();
5089
5090 // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps so that an instruction found by CSE can be
  // used directly. CSE copies the found instruction into the given vreg when
  // building with a vreg destination.
5093 SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
5094 // Output registers will be taken from created instructions.
5095 SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
5096 for (unsigned i = 0; i < NumDefs; ++i) {
5097 makeDstOps(DstOps&: OutputOpsPieces[i], Ty: MRI.getType(Reg: MI.getReg(Idx: i)), NumElts);
5098 }
5099
5100 // Split vector input operands into sub-vectors with NumElts elts + Leftover.
5101 // Operands listed in NonVecOpIndices will be used as is without splitting;
5102 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
5103 // scalar condition (op 1), immediate in sext_inreg (op 2).
5104 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
5105 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
5106 ++UseIdx, ++UseNo) {
5107 if (is_contained(Set: NonVecOpIndices, Element: UseIdx)) {
5108 broadcastSrcOp(Ops&: InputOpsPieces[UseNo], N: OutputOpsPieces[0].size(),
5109 Op&: MI.getOperand(i: UseIdx));
5110 } else {
5111 SmallVector<Register, 8> SplitPieces;
5112 extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: SplitPieces, MIRBuilder,
5113 MRI);
5114 llvm::append_range(C&: InputOpsPieces[UseNo], R&: SplitPieces);
5115 }
5116 }
5117
5118 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
5119
5120 // Take i-th piece of each input operand split and build sub-vector/scalar
5121 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
5122 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
5123 SmallVector<DstOp, 2> Defs;
5124 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
5125 Defs.push_back(Elt: OutputOpsPieces[DstNo][i]);
5126
5127 SmallVector<SrcOp, 3> Uses;
5128 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
5129 Uses.push_back(Elt: InputOpsPieces[InputNo][i]);
5130
5131 auto I = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: Defs, SrcOps: Uses, Flags: MI.getFlags());
5132 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
5133 OutputRegs[DstNo].push_back(Elt: I.getReg(Idx: DstNo));
5134 }
5135
5136 // Merge small outputs into MI's output for each def operand.
5137 if (NumLeftovers) {
5138 for (unsigned i = 0; i < NumDefs; ++i)
5139 mergeMixedSubvectors(DstReg: MI.getReg(Idx: i), PartRegs: OutputRegs[i]);
5140 } else {
5141 for (unsigned i = 0; i < NumDefs; ++i)
5142 MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: i), Ops: OutputRegs[i]);
5143 }
5144
5145 MI.eraseFromParent();
5146 return Legalized;
5147}
5148
5149LegalizerHelper::LegalizeResult
5150LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
5151 unsigned NumElts) {
5152 unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();
5153
5154 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
5155 unsigned NumDefs = MI.getNumDefs();
5156
5157 SmallVector<DstOp, 8> OutputOpsPieces;
5158 SmallVector<Register, 8> OutputRegs;
5159 makeDstOps(DstOps&: OutputOpsPieces, Ty: MRI.getType(Reg: MI.getReg(Idx: 0)), NumElts);
5160
  // Instructions that perform the register split will be inserted in the
  // basic block where the register is defined (the basic block is given in
  // the next operand).
5163 SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
5164 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
5165 UseIdx += 2, ++UseNo) {
5166 MachineBasicBlock &OpMBB = *MI.getOperand(i: UseIdx + 1).getMBB();
5167 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
5168 extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: InputOpsPieces[UseNo],
5169 MIRBuilder, MRI);
5170 }
5171
5172 // Build PHIs with fewer elements.
5173 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
5174 MIRBuilder.setInsertPt(MBB&: *MI.getParent(), II: MI);
5175 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
5176 auto Phi = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI);
5177 Phi.addDef(
5178 RegNo: MRI.createGenericVirtualRegister(Ty: OutputOpsPieces[i].getLLTTy(MRI)));
5179 OutputRegs.push_back(Elt: Phi.getReg(Idx: 0));
5180
5181 for (unsigned j = 0; j < NumInputs / 2; ++j) {
5182 Phi.addUse(RegNo: InputOpsPieces[j][i]);
5183 Phi.add(MO: MI.getOperand(i: 1 + j * 2 + 1));
5184 }
5185 }
5186
5187 // Set the insert point after the existing PHIs
5188 MachineBasicBlock &MBB = *MI.getParent();
5189 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
5190
5191 // Merge small outputs into MI's def.
5192 if (NumLeftovers) {
5193 mergeMixedSubvectors(DstReg: MI.getReg(Idx: 0), PartRegs: OutputRegs);
5194 } else {
5195 MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: 0), Ops: OutputRegs);
5196 }
5197
5198 MI.eraseFromParent();
5199 return Legalized;
5200}
5201
5202LegalizerHelper::LegalizeResult
5203LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
5204 unsigned TypeIdx,
5205 LLT NarrowTy) {
5206 const int NumDst = MI.getNumOperands() - 1;
5207 const Register SrcReg = MI.getOperand(i: NumDst).getReg();
5208 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5209 LLT SrcTy = MRI.getType(Reg: SrcReg);
5210
5211 if (TypeIdx != 1 || NarrowTy == DstTy)
5212 return UnableToLegalize;
5213
  // Requires compatible types. Otherwise SrcReg should have been defined by a
  // merge-like instruction that would get artifact-combined. Most likely the
  // instruction that defines SrcReg has to perform a more/fewer-elements
  // legalization compatible with NarrowTy.
5218 assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5219 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5220
5221 if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5222 (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
5223 return UnableToLegalize;
5224
  // This is most likely DstTy (smaller than register size) packed in SrcTy
  // (larger than register size), and since the unmerge was not combined it
  // will be lowered to bit sequence extracts from a register. Unpack SrcTy to
  // NarrowTy (register size) pieces first, then unpack each NarrowTy piece to
  // DstTy.
5229
5230 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
5231 //
5232 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
5233 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
5234 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
5235 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: SrcReg);
5236 const int NumUnmerge = Unmerge->getNumOperands() - 1;
5237 const int PartsPerUnmerge = NumDst / NumUnmerge;
5238
5239 for (int I = 0; I != NumUnmerge; ++I) {
5240 auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);
5241
5242 for (int J = 0; J != PartsPerUnmerge; ++J)
5243 MIB.addDef(RegNo: MI.getOperand(i: I * PartsPerUnmerge + J).getReg());
5244 MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
5245 }
5246
5247 MI.eraseFromParent();
5248 return Legalized;
5249}
5250
5251LegalizerHelper::LegalizeResult
5252LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
5253 LLT NarrowTy) {
5254 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise the user of DstReg did not perform
  // the unmerge that should have been artifact-combined. Most likely the
  // instruction that uses DstReg has to do a more/fewer-elements legalization
  // compatible with NarrowTy.
5258 assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5259 assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5260 if (NarrowTy == SrcTy)
5261 return UnableToLegalize;
5262
  // This attempts to lower part of an LCMTy merge/unmerge sequence. The
  // intended use is for old MIR tests. Since the changes to more/fewer
  // elements, it should no longer be possible to generate MIR like this when
  // starting from LLVM IR, because the LCMTy approach was replaced with
  // merge/unmerge to vector elements.
5267 if (TypeIdx == 1) {
5268 assert(SrcTy.isVector() && "Expected vector types");
5269 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5270 if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5271 (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
5272 return UnableToLegalize;
5273 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
5274 //
5275 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
5276 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
5277 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
5278 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
5279 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
5280 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
5281
5282 SmallVector<Register, 8> Elts;
5283 LLT EltTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getScalarType();
5284 for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
5285 auto Unmerge = MIRBuilder.buildUnmerge(Res: EltTy, Op: MI.getOperand(i).getReg());
5286 for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
5287 Elts.push_back(Elt: Unmerge.getReg(Idx: j));
5288 }
5289
5290 SmallVector<Register, 8> NarrowTyElts;
5291 unsigned NumNarrowTyElts = NarrowTy.getNumElements();
5292 unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
5293 for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
5294 ++i, Offset += NumNarrowTyElts) {
5295 ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
5296 NarrowTyElts.push_back(
5297 Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Pieces).getReg(Idx: 0));
5298 }
5299
5300 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
5301 MI.eraseFromParent();
5302 return Legalized;
5303 }
5304
5305 assert(TypeIdx == 0 && "Bad type index");
5306 if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
5307 (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
5308 return UnableToLegalize;
5309
  // This is most likely SrcTy (smaller than register size) packed in DstTy
  // (larger than register size), and since the merge was not combined it will
  // be lowered to bit sequence packing into a register. Merge SrcTy to
  // NarrowTy (register size) pieces first, then merge each NarrowTy piece to
  // DstTy.
5314
5315 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
5316 //
5317 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
5318 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
5319 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
5320 SmallVector<Register, 8> NarrowTyElts;
5321 unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
5322 unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
5323 unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
5324 for (unsigned i = 0; i < NumParts; ++i) {
5325 SmallVector<Register, 8> Sources;
5326 for (unsigned j = 0; j < NumElts; ++j)
5327 Sources.push_back(Elt: MI.getOperand(i: 1 + i * NumElts + j).getReg());
5328 NarrowTyElts.push_back(
5329 Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Sources).getReg(Idx: 0));
5330 }
5331
5332 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
5333 MI.eraseFromParent();
5334 return Legalized;
5335}
5336
5337LegalizerHelper::LegalizeResult
5338LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
5339 unsigned TypeIdx,
5340 LLT NarrowVecTy) {
5341 auto [DstReg, SrcVec] = MI.getFirst2Regs();
5342 Register InsertVal;
5343 bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
5344
5345 assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
5346 if (IsInsert)
5347 InsertVal = MI.getOperand(i: 2).getReg();
5348
5349 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
5350 LLT VecTy = MRI.getType(Reg: SrcVec);
5351
5352 // If the index is a constant, we can really break this down as you would
5353 // expect, and index into the target size pieces.
5354 auto MaybeCst = getIConstantVRegValWithLookThrough(VReg: Idx, MRI);
5355 if (MaybeCst) {
5356 uint64_t IdxVal = MaybeCst->Value.getZExtValue();
5357 // Avoid out of bounds indexing the pieces.
5358 if (IdxVal >= VecTy.getNumElements()) {
5359 MIRBuilder.buildUndef(Res: DstReg);
5360 MI.eraseFromParent();
5361 return Legalized;
5362 }
5363
5364 if (!NarrowVecTy.isVector()) {
5365 SmallVector<Register, 8> SplitPieces;
5366 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowVecTy,
5367 NumParts: VecTy.getNumElements(), VRegs&: SplitPieces, MIRBuilder, MRI);
5368 if (IsInsert) {
5369 SplitPieces[IdxVal] = InsertVal;
5370 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: SplitPieces);
5371 } else {
5372 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: SplitPieces[IdxVal]);
5373 }
5374 } else {
5375 SmallVector<Register, 8> VecParts;
5376 LLT GCDTy = extractGCDType(Parts&: VecParts, DstTy: VecTy, NarrowTy: NarrowVecTy, SrcReg: SrcVec);
5377
5378 // Build a sequence of NarrowTy pieces in VecParts for this operand.
5379 LLT LCMTy = buildLCMMergePieces(DstTy: VecTy, NarrowTy: NarrowVecTy, GCDTy, VRegs&: VecParts,
5380 PadStrategy: TargetOpcode::G_ANYEXT);
5381
5382 unsigned NewNumElts = NarrowVecTy.getNumElements();
5383
5384 LLT IdxTy = MRI.getType(Reg: Idx);
5385 int64_t PartIdx = IdxVal / NewNumElts;
5386 auto NewIdx =
5387 MIRBuilder.buildConstant(Res: IdxTy, Val: IdxVal - NewNumElts * PartIdx);
5388
5389 if (IsInsert) {
5390 LLT PartTy = MRI.getType(Reg: VecParts[PartIdx]);
5391
5392 // Use the adjusted index to insert into one of the subvectors.
5393 auto InsertPart = MIRBuilder.buildInsertVectorElement(
5394 Res: PartTy, Val: VecParts[PartIdx], Elt: InsertVal, Idx: NewIdx);
5395 VecParts[PartIdx] = InsertPart.getReg(Idx: 0);
5396
5397 // Recombine the inserted subvector with the others to reform the result
5398 // vector.
5399 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: VecParts);
5400 } else {
5401 MIRBuilder.buildExtractVectorElement(Res: DstReg, Val: VecParts[PartIdx], Idx: NewIdx);
5402 }
5403 }
5404
5405 MI.eraseFromParent();
5406 return Legalized;
5407 }
5408
5409 // With a variable index, we can't perform the operation in a smaller type, so
5410 // we're forced to expand this.
5411 //
5412 // TODO: We could emit a chain of compare/select to figure out which piece to
5413 // index.
5414 return lowerExtractInsertVectorElt(MI);
5415}
5416
5417LegalizerHelper::LegalizeResult
5418LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
5419 LLT NarrowTy) {
5420 // FIXME: Don't know how to handle secondary types yet.
5421 if (TypeIdx != 0)
5422 return UnableToLegalize;
5423
5424 if (!NarrowTy.isByteSized()) {
5425 LLVM_DEBUG(dbgs() << "Can't narrow load/store to non-byte-sized type\n");
5426 return UnableToLegalize;
5427 }
5428
5429 // This implementation doesn't work for atomics. Give up instead of doing
5430 // something invalid.
5431 if (LdStMI.isAtomic())
5432 return UnableToLegalize;
5433
5434 bool IsLoad = isa<GLoad>(Val: LdStMI);
5435 Register ValReg = LdStMI.getReg(Idx: 0);
5436 Register AddrReg = LdStMI.getPointerReg();
5437 LLT ValTy = MRI.getType(Reg: ValReg);
5438
5439 // FIXME: Do we need a distinct NarrowMemory legalize action?
5440 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
5441 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
5442 return UnableToLegalize;
5443 }
5444
5445 int NumParts = -1;
5446 int NumLeftover = -1;
5447 LLT LeftoverTy;
5448 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
5449 if (IsLoad) {
5450 std::tie(args&: NumParts, args&: NumLeftover) = getNarrowTypeBreakDown(OrigTy: ValTy, NarrowTy, LeftoverTy);
5451 } else {
5452 if (extractParts(Reg: ValReg, RegTy: ValTy, MainTy: NarrowTy, LeftoverTy, VRegs&: NarrowRegs,
5453 LeftoverVRegs&: NarrowLeftoverRegs, MIRBuilder, MRI)) {
5454 NumParts = NarrowRegs.size();
5455 NumLeftover = NarrowLeftoverRegs.size();
5456 }
5457 }
5458
5459 if (NumParts == -1)
5460 return UnableToLegalize;
5461
5462 LLT PtrTy = MRI.getType(Reg: AddrReg);
5463 const LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
5464
5465 unsigned TotalSize = ValTy.getSizeInBits();
5466
  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each element
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
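  // e.g. (illustrative, little-endian) narrowing an s96 load with NarrowTy =
  // s64 emits an s64 load at byte offset 0 and an s32 leftover load at byte
  // offset 8.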
5471 bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
5472 auto MMO = LdStMI.getMMO();
5473 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
5474 unsigned NumParts, unsigned Offset) -> unsigned {
5475 MachineFunction &MF = MIRBuilder.getMF();
5476 unsigned PartSize = PartTy.getSizeInBits();
5477 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
5478 ++Idx) {
5479 unsigned ByteOffset = Offset / 8;
5480 Register NewAddrReg;
5481
5482 MIRBuilder.materializeObjectPtrOffset(Res&: NewAddrReg, Op0: AddrReg, ValueTy: OffsetTy,
5483 Value: ByteOffset);
5484
5485 MachineMemOperand *NewMMO =
5486 MF.getMachineMemOperand(MMO: &MMO, Offset: ByteOffset, Ty: PartTy);
5487
5488 if (IsLoad) {
5489 Register Dst = MRI.createGenericVirtualRegister(Ty: PartTy);
5490 ValRegs.push_back(Elt: Dst);
5491 MIRBuilder.buildLoad(Res: Dst, Addr: NewAddrReg, MMO&: *NewMMO);
5492 } else {
5493 MIRBuilder.buildStore(Val: ValRegs[Idx], Addr: NewAddrReg, MMO&: *NewMMO);
5494 }
5495 Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
5496 }
5497
5498 return Offset;
5499 };
5500
5501 unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
5502 unsigned HandledOffset =
5503 splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
5504
5505 // Handle the rest of the register if this isn't an even type breakdown.
5506 if (LeftoverTy.isValid())
5507 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
5508
5509 if (IsLoad) {
5510 insertParts(DstReg: ValReg, ResultTy: ValTy, PartTy: NarrowTy, PartRegs: NarrowRegs,
5511 LeftoverTy, LeftoverRegs: NarrowLeftoverRegs);
5512 }
5513
5514 LdStMI.eraseFromParent();
5515 return Legalized;
5516}
5517
5518LegalizerHelper::LegalizeResult
5519LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
5520 LLT NarrowTy) {
5521 using namespace TargetOpcode;
5522 GenericMachineInstr &GMI = cast<GenericMachineInstr>(Val&: MI);
5523 unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
5524
5525 switch (MI.getOpcode()) {
5526 case G_IMPLICIT_DEF:
5527 case G_TRUNC:
5528 case G_AND:
5529 case G_OR:
5530 case G_XOR:
5531 case G_ADD:
5532 case G_SUB:
5533 case G_MUL:
5534 case G_PTR_ADD:
5535 case G_SMULH:
5536 case G_UMULH:
5537 case G_FADD:
5538 case G_FMUL:
5539 case G_FSUB:
5540 case G_FNEG:
5541 case G_FABS:
5542 case G_FCANONICALIZE:
5543 case G_FDIV:
5544 case G_FREM:
5545 case G_FMA:
5546 case G_FMAD:
5547 case G_FPOW:
5548 case G_FEXP:
5549 case G_FEXP2:
5550 case G_FEXP10:
5551 case G_FLOG:
5552 case G_FLOG2:
5553 case G_FLOG10:
5554 case G_FLDEXP:
5555 case G_FNEARBYINT:
5556 case G_FCEIL:
5557 case G_FFLOOR:
5558 case G_FRINT:
5559 case G_INTRINSIC_LRINT:
5560 case G_INTRINSIC_LLRINT:
5561 case G_INTRINSIC_ROUND:
5562 case G_INTRINSIC_ROUNDEVEN:
5563 case G_LROUND:
5564 case G_LLROUND:
5565 case G_INTRINSIC_TRUNC:
5566 case G_FMODF:
5567 case G_FCOS:
5568 case G_FSIN:
5569 case G_FTAN:
5570 case G_FACOS:
5571 case G_FASIN:
5572 case G_FATAN:
5573 case G_FATAN2:
5574 case G_FCOSH:
5575 case G_FSINH:
5576 case G_FTANH:
5577 case G_FSQRT:
5578 case G_BSWAP:
5579 case G_BITREVERSE:
5580 case G_SDIV:
5581 case G_UDIV:
5582 case G_SREM:
5583 case G_UREM:
5584 case G_SDIVREM:
5585 case G_UDIVREM:
5586 case G_SMIN:
5587 case G_SMAX:
5588 case G_UMIN:
5589 case G_UMAX:
5590 case G_ABS:
5591 case G_FMINNUM:
5592 case G_FMAXNUM:
5593 case G_FMINNUM_IEEE:
5594 case G_FMAXNUM_IEEE:
5595 case G_FMINIMUM:
5596 case G_FMAXIMUM:
5597 case G_FMINIMUMNUM:
5598 case G_FMAXIMUMNUM:
5599 case G_FSHL:
5600 case G_FSHR:
5601 case G_ROTL:
5602 case G_ROTR:
5603 case G_FREEZE:
5604 case G_SADDSAT:
5605 case G_SSUBSAT:
5606 case G_UADDSAT:
5607 case G_USUBSAT:
5608 case G_UMULO:
5609 case G_SMULO:
5610 case G_SHL:
5611 case G_LSHR:
5612 case G_ASHR:
5613 case G_SSHLSAT:
5614 case G_USHLSAT:
5615 case G_CTLZ:
5616 case G_CTLZ_ZERO_UNDEF:
5617 case G_CTTZ:
5618 case G_CTTZ_ZERO_UNDEF:
5619 case G_CTPOP:
5620 case G_FCOPYSIGN:
5621 case G_ZEXT:
5622 case G_SEXT:
5623 case G_ANYEXT:
5624 case G_FPEXT:
5625 case G_FPTRUNC:
5626 case G_SITOFP:
5627 case G_UITOFP:
5628 case G_FPTOSI:
5629 case G_FPTOUI:
5630 case G_FPTOSI_SAT:
5631 case G_FPTOUI_SAT:
5632 case G_INTTOPTR:
5633 case G_PTRTOINT:
5634 case G_ADDRSPACE_CAST:
5635 case G_UADDO:
5636 case G_USUBO:
5637 case G_UADDE:
5638 case G_USUBE:
5639 case G_SADDO:
5640 case G_SSUBO:
5641 case G_SADDE:
5642 case G_SSUBE:
5643 case G_STRICT_FADD:
5644 case G_STRICT_FSUB:
5645 case G_STRICT_FMUL:
5646 case G_STRICT_FMA:
5647 case G_STRICT_FLDEXP:
5648 case G_FFREXP:
5649 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
5650 case G_ICMP:
5651 case G_FCMP:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*cmp predicate*/});
5653 case G_IS_FPCLASS:
5654 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2, 3 /*mask,fpsem*/});
5655 case G_SELECT:
5656 if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector())
5657 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
5658 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*scalar cond*/});
5659 case G_PHI:
5660 return fewerElementsVectorPhi(MI&: GMI, NumElts);
5661 case G_UNMERGE_VALUES:
5662 return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
5663 case G_BUILD_VECTOR:
5664 assert(TypeIdx == 0 && "not a vector type index");
5665 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
5666 case G_CONCAT_VECTORS:
5667 if (TypeIdx != 1) // TODO: This probably does work as expected already.
5668 return UnableToLegalize;
5669 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
5670 case G_EXTRACT_VECTOR_ELT:
5671 case G_INSERT_VECTOR_ELT:
5672 return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowVecTy: NarrowTy);
5673 case G_LOAD:
5674 case G_STORE:
5675 return reduceLoadStoreWidth(LdStMI&: cast<GLoadStore>(Val&: MI), TypeIdx, NarrowTy);
5676 case G_SEXT_INREG:
5677 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*imm*/});
5678 GISEL_VECREDUCE_CASES_NONSEQ
5679 return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
5680 case TargetOpcode::G_VECREDUCE_SEQ_FADD:
5681 case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
5682 return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
5683 case G_SHUFFLE_VECTOR:
5684 return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
5685 case G_FPOWI:
5686 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*pow*/});
5687 case G_BITCAST:
5688 return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
5689 case G_INTRINSIC_FPTRUNC_ROUND:
5690 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2});
5691 default:
5692 return UnableToLegalize;
5693 }
5694}
5695
5696LegalizerHelper::LegalizeResult
5697LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
5698 LLT NarrowTy) {
5699 assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
5700 "Not a bitcast operation");
5701
5702 if (TypeIdx != 0)
5703 return UnableToLegalize;
5704
5705 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5706
5707 unsigned NewElemCount =
5708 NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
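  // e.g. (illustrative) narrowing <8 x s16> = G_BITCAST <4 x s32> with
  // NarrowTy = <4 x s16>: NewElemCount = 64 / 32 = 2, so the source is split
  // into <2 x s32> pieces and each piece is bitcast to a <4 x s16> part.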
5709 SmallVector<Register> SrcVRegs, BitcastVRegs;
5710 if (NewElemCount == 1) {
5711 LLT SrcNarrowTy = SrcTy.getElementType();
5712
5713 auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcNarrowTy, Op: SrcReg);
5714 getUnmergeResults(Regs&: SrcVRegs, MI: *Unmerge);
5715 } else {
5716 LLT SrcNarrowTy =
5717 SrcTy.changeVectorElementCount(EC: ElementCount::getFixed(MinVal: NewElemCount));
5718
5719 // Split the Src and Dst Reg into smaller registers
5720 if (extractGCDType(Parts&: SrcVRegs, DstTy, NarrowTy: SrcNarrowTy, SrcReg) != SrcNarrowTy)
5721 return UnableToLegalize;
5722 }
5723
  // Build new smaller bitcast instructions.
  // Leftover types are not supported for now, but will need to be eventually.
5726 for (Register Reg : SrcVRegs)
5727 BitcastVRegs.push_back(Elt: MIRBuilder.buildBitcast(Dst: NarrowTy, Src: Reg).getReg(Idx: 0));
5728
5729 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: BitcastVRegs);
5730 MI.eraseFromParent();
5731 return Legalized;
5732}
5733
5734LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
5735 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5736 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
5737 if (TypeIdx != 0)
5738 return UnableToLegalize;
5739
5740 auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
5741 MI.getFirst3RegLLTs();
5742 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
5743 // The shuffle should be canonicalized by now.
5744 if (DstTy != Src1Ty)
5745 return UnableToLegalize;
5746 if (DstTy != Src2Ty)
5747 return UnableToLegalize;
5748
5749 if (!isPowerOf2_32(Value: DstTy.getNumElements()))
5750 return UnableToLegalize;
5751
  // We only support splitting a shuffle into 2, so adjust NarrowTy
  // accordingly. Further legalization attempts will be needed to split
  // further.
5754 NarrowTy =
5755 DstTy.changeElementCount(EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
5756 unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
5757
5758 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
5759 extractParts(Reg: Src1Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc1Regs, MIRBuilder, MRI);
5760 extractParts(Reg: Src2Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc2Regs, MIRBuilder, MRI);
5761 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
5762 SplitSrc2Regs[1]};
5763
5764 Register Hi, Lo;
5765
5766 // If Lo or Hi uses elements from at most two of the four input vectors, then
5767 // express it as a vector shuffle of those two inputs. Otherwise extract the
5768 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
5769 SmallVector<int, 16> Ops;
5770 for (unsigned High = 0; High < 2; ++High) {
5771 Register &Output = High ? Hi : Lo;
5772
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
5777 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
5778 unsigned FirstMaskIdx = High * NewElts;
5779 bool UseBuildVector = false;
5780 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
5781 // The mask element. This indexes into the input.
5782 int Idx = Mask[FirstMaskIdx + MaskOffset];
5783
5784 // The input vector this mask element indexes into.
5785 unsigned Input = (unsigned)Idx / NewElts;
5786
5787 if (Input >= std::size(Inputs)) {
5788 // The mask element does not index into any input vector.
5789 Ops.push_back(Elt: -1);
5790 continue;
5791 }
5792
5793 // Turn the index into an offset from the start of the input vector.
5794 Idx -= Input * NewElts;
5795
5796 // Find or create a shuffle vector operand to hold this input.
5797 unsigned OpNo;
5798 for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
5799 if (InputUsed[OpNo] == Input) {
5800 // This input vector is already an operand.
5801 break;
5802 } else if (InputUsed[OpNo] == -1U) {
5803 // Create a new operand for this input vector.
5804 InputUsed[OpNo] = Input;
5805 break;
5806 }
5807 }
5808
5809 if (OpNo >= std::size(InputUsed)) {
5810 // More than two input vectors used! Give up on trying to create a
5811 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
5812 UseBuildVector = true;
5813 break;
5814 }
5815
5816 // Add the mask index for the new shuffle vector.
5817 Ops.push_back(Elt: Idx + OpNo * NewElts);
5818 }
5819
5820 if (UseBuildVector) {
5821 LLT EltTy = NarrowTy.getElementType();
5822 SmallVector<Register, 16> SVOps;
5823
5824 // Extract the input elements by hand.
5825 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
5826 // The mask element. This indexes into the input.
5827 int Idx = Mask[FirstMaskIdx + MaskOffset];
5828
5829 // The input vector this mask element indexes into.
5830 unsigned Input = (unsigned)Idx / NewElts;
5831
5832 if (Input >= std::size(Inputs)) {
5833 // The mask element is "undef" or indexes off the end of the input.
5834 SVOps.push_back(Elt: MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0));
5835 continue;
5836 }
5837
5838 // Turn the index into an offset from the start of the input vector.
5839 Idx -= Input * NewElts;
5840
5841 // Extract the vector element by hand.
5842 SVOps.push_back(Elt: MIRBuilder
5843 .buildExtractVectorElement(
5844 Res: EltTy, Val: Inputs[Input],
5845 Idx: MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: Idx))
5846 .getReg(Idx: 0));
5847 }
5848
5849 // Construct the Lo/Hi output using a G_BUILD_VECTOR.
5850 Output = MIRBuilder.buildBuildVector(Res: NarrowTy, Ops: SVOps).getReg(Idx: 0);
5851 } else if (InputUsed[0] == -1U) {
5852 // No input vectors were used! The result is undefined.
5853 Output = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
5854 } else if (NewElts == 1) {
5855 Output = MIRBuilder.buildCopy(Res: NarrowTy, Op: Inputs[InputUsed[0]]).getReg(Idx: 0);
5856 } else {
5857 Register Op0 = Inputs[InputUsed[0]];
5858 // If only one input was used, use an undefined vector for the other.
5859 Register Op1 = InputUsed[1] == -1U
5860 ? MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0)
5861 : Inputs[InputUsed[1]];
5862 // At least one input vector was used. Create a new shuffle vector.
5863 Output = MIRBuilder.buildShuffleVector(Res: NarrowTy, Src1: Op0, Src2: Op1, Mask: Ops).getReg(Idx: 0);
5864 }
5865
5866 Ops.clear();
5867 }
5868
5869 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: {Lo, Hi});
5870 MI.eraseFromParent();
5871 return Legalized;
5872}
5873
5874LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
5875 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5876 auto &RdxMI = cast<GVecReduce>(Val&: MI);
5877
5878 if (TypeIdx != 1)
5879 return UnableToLegalize;
5880
5881 // The semantics of the normal non-sequential reductions allow us to freely
5882 // re-associate the operation.
5883 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
5884
5885 if (NarrowTy.isVector() &&
5886 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
5887 return UnableToLegalize;
5888
5889 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
5890 SmallVector<Register> SplitSrcs;
5891 // If NarrowTy is a scalar then we're being asked to scalarize.
5892 const unsigned NumParts =
5893 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
5894 : SrcTy.getNumElements();
5895
5896 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5897 if (NarrowTy.isScalar()) {
5898 if (DstTy != NarrowTy)
5899 return UnableToLegalize; // FIXME: handle implicit extensions.
5900
5901 if (isPowerOf2_32(Value: NumParts)) {
5902 // Generate a tree of scalar operations to reduce the critical path.
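// e.g. NumParts = 8 finishes in log2(8) = 3 levels of pairwise ops
// rather than a serial chain of 7 dependent ops.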
5903 SmallVector<Register> PartialResults;
5904 unsigned NumPartsLeft = NumParts;
5905 while (NumPartsLeft > 1) {
5906 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
5907 PartialResults.emplace_back(
5908 Args: MIRBuilder
5909 .buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy},
5910 SrcOps: {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
5911 .getReg(Idx: 0));
5912 }
5913 SplitSrcs = PartialResults;
5914 PartialResults.clear();
5915 NumPartsLeft = SplitSrcs.size();
5916 }
5917 assert(SplitSrcs.size() == 1);
5918 MIRBuilder.buildCopy(Res: DstReg, Op: SplitSrcs[0]);
5919 MI.eraseFromParent();
5920 return Legalized;
5921 }
5922 // If we can't generate a tree, then just do sequential operations.
5923 Register Acc = SplitSrcs[0];
5924 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
5925 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[Idx]})
5926 .getReg(Idx: 0);
5927 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5928 MI.eraseFromParent();
5929 return Legalized;
5930 }
5931 SmallVector<Register> PartialReductions;
5932 for (unsigned Part = 0; Part < NumParts; ++Part) {
5933 PartialReductions.push_back(
5934 Elt: MIRBuilder.buildInstr(Opc: RdxMI.getOpcode(), DstOps: {DstTy}, SrcOps: {SplitSrcs[Part]})
5935 .getReg(Idx: 0));
5936 }
5937
5938 // If the types involved are powers of 2, we can generate intermediate vector
5939 // ops, before generating a final reduction operation.
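// e.g. for a G_VECREDUCE_ADD of <8 x s32> with NarrowTy <2 x s32>, the
// four <2 x s32> pieces are combined pairwise with vector adds, leaving a
// single <2 x s32> to feed the final reduction.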
5940 if (isPowerOf2_32(Value: SrcTy.getNumElements()) &&
5941 isPowerOf2_32(Value: NarrowTy.getNumElements())) {
5942 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
5943 }
5944
5945 Register Acc = PartialReductions[0];
5946 for (unsigned Part = 1; Part < NumParts; ++Part) {
5947 if (Part == NumParts - 1) {
5948 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {DstReg},
5949 SrcOps: {Acc, PartialReductions[Part]});
5950 } else {
5951 Acc = MIRBuilder
5952 .buildInstr(Opc: ScalarOpc, DstOps: {DstTy}, SrcOps: {Acc, PartialReductions[Part]})
5953 .getReg(Idx: 0);
5954 }
5955 }
5956 MI.eraseFromParent();
5957 return Legalized;
5958}
5959
5960LegalizerHelper::LegalizeResult
5961LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
5962 unsigned int TypeIdx,
5963 LLT NarrowTy) {
5964 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
5965 MI.getFirst3RegLLTs();
5966 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
5967 DstTy != NarrowTy)
5968 return UnableToLegalize;
5969
5970 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
5971 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
5972 "Unexpected vecreduce opcode");
5973 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
5974 ? TargetOpcode::G_FADD
5975 : TargetOpcode::G_FMUL;
5976
5977 SmallVector<Register> SplitSrcs;
5978 unsigned NumParts = SrcTy.getNumElements();
5979 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5980 Register Acc = ScalarReg;
5981 for (unsigned i = 0; i < NumParts; i++)
5982 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[i]})
5983 .getReg(Idx: 0);
5984
5985 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5986 MI.eraseFromParent();
5987 return Legalized;
5988}
5989
5990LegalizerHelper::LegalizeResult
5991LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
5992 LLT SrcTy, LLT NarrowTy,
5993 unsigned ScalarOpc) {
5994 SmallVector<Register> SplitSrcs;
5995 // Split the sources into NarrowTy size pieces.
5996 extractParts(Reg: SrcReg, Ty: NarrowTy,
5997 NumParts: SrcTy.getNumElements() / NarrowTy.getNumElements(), VRegs&: SplitSrcs,
5998 MIRBuilder, MRI);
5999 // We're going to do a tree reduction using vector operations until we have
6000 // one NarrowTy size value left.
6001 while (SplitSrcs.size() > 1) {
6002 SmallVector<Register> PartialRdxs;
6003 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
6004 Register LHS = SplitSrcs[Idx];
6005 Register RHS = SplitSrcs[Idx + 1];
6006 // Create the intermediate vector op.
6007 Register Res =
6008 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {LHS, RHS}).getReg(Idx: 0);
6009 PartialRdxs.push_back(Elt: Res);
6010 }
6011 SplitSrcs = std::move(PartialRdxs);
6012 }
6013 // Finally generate the requested NarrowTy-based reduction.
6014 Observer.changingInstr(MI);
6015 MI.getOperand(i: 1).setReg(SplitSrcs[0]);
6016 Observer.changedInstr(MI);
6017 return Legalized;
6018}
6019
6020LegalizerHelper::LegalizeResult
6021LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
6022 const LLT HalfTy, const LLT AmtTy) {
6023
6024 Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
6025 Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
6026 MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));
6027
6028 if (Amt.isZero()) {
6029 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {InL, InH});
6030 MI.eraseFromParent();
6031 return Legalized;
6032 }
6033
6034 LLT NVT = HalfTy;
6035 unsigned NVTBits = HalfTy.getSizeInBits();
6036 unsigned VTBits = 2 * NVTBits;
6037
6038 SrcOp Lo(Register(0)), Hi(Register(0));
6039 if (MI.getOpcode() == TargetOpcode::G_SHL) {
6040 if (Amt.ugt(RHS: VTBits)) {
6041 Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
6042 } else if (Amt.ugt(RHS: NVTBits)) {
6043 Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
6044 Hi = MIRBuilder.buildShl(Dst: NVT, Src0: InL,
6045 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
6046 } else if (Amt == NVTBits) {
6047 Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
6048 Hi = InL;
6049 } else {
6050 Lo = MIRBuilder.buildShl(Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
6051 auto OrLHS =
6052 MIRBuilder.buildShl(Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
6053 auto OrRHS = MIRBuilder.buildLShr(
6054 Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
6055 Hi = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
6056 }
6057 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
6058 if (Amt.ugt(RHS: VTBits)) {
6059 Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
6060 } else if (Amt.ugt(RHS: NVTBits)) {
6061 Lo = MIRBuilder.buildLShr(Dst: NVT, Src0: InH,
6062 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
6063 Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
6064 } else if (Amt == NVTBits) {
6065 Lo = InH;
6066 Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
6067 } else {
6068 auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);
6069
6070 auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
6071 auto OrRHS = MIRBuilder.buildShl(
6072 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
6073
6074 Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
6075 Hi = MIRBuilder.buildLShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
6076 }
6077 } else {
6078 if (Amt.ugt(RHS: VTBits)) {
6079 Hi = Lo = MIRBuilder.buildAShr(
6080 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
6081 } else if (Amt.ugt(RHS: NVTBits)) {
6082 Lo = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
6083 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
6084 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
6085 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
6086 } else if (Amt == NVTBits) {
6087 Lo = InH;
6088 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
6089 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
6090 } else {
6091 auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);
6092
6093 auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
6094 auto OrRHS = MIRBuilder.buildShl(
6095 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
6096
6097 Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
6098 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
6099 }
6100 }
6101
6102 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {Lo, Hi});
6103 MI.eraseFromParent();
6104
6105 return Legalized;
6106}
6107
6108LegalizerHelper::LegalizeResult
6109LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
6110 LLT RequestedTy) {
6111 if (TypeIdx == 1) {
6112 Observer.changingInstr(MI);
6113 narrowScalarSrc(MI, NarrowTy: RequestedTy, OpIdx: 2);
6114 Observer.changedInstr(MI);
6115 return Legalized;
6116 }
6117
6118 Register DstReg = MI.getOperand(i: 0).getReg();
6119 LLT DstTy = MRI.getType(Reg: DstReg);
6120 if (DstTy.isVector())
6121 return UnableToLegalize;
6122
6123 Register Amt = MI.getOperand(i: 2).getReg();
6124 LLT ShiftAmtTy = MRI.getType(Reg: Amt);
6125 const unsigned DstEltSize = DstTy.getScalarSizeInBits();
6126 if (DstEltSize % 2 != 0)
6127 return UnableToLegalize;
6128
6129 // Check if we should use multi-way splitting instead of recursive binary
6130 // splitting.
6131 //
6132 // Multi-way splitting directly decomposes wide shifts (e.g., 128-bit ->
6133 // 4×32-bit) in a single legalization step, avoiding the recursive overhead
6134 // and dependency chains created by the usual binary splitting approach
6135 // (128->64->32).
6136 //
6137 // The >= 8 parts threshold ensures we only use this optimization when binary
6138 // splitting would require multiple recursive passes, avoiding overhead for
6139 // simple 2-way splits where the binary approach is sufficient.
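// e.g. narrowing a 128-bit shift to 16-bit parts gives NumParts = 8, so
// one multiway decomposition replaces three levels of halving
// (128 -> 64 -> 32 -> 16).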
6140 if (RequestedTy.isValid() && RequestedTy.isScalar() &&
6141 DstEltSize % RequestedTy.getSizeInBits() == 0) {
6142 const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits();
6143 // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive
6144 // steps).
6145 if (NumParts >= 8)
6146 return narrowScalarShiftMultiway(MI, TargetTy: RequestedTy);
6147 }
6148
6149 // Fall back to binary splitting:
6150 // Ignore the input type. We can only go to exactly half the size of the
6151 // input. If that isn't small enough, the resulting pieces will be further
6152 // legalized.
6153 const unsigned NewBitSize = DstEltSize / 2;
6154 const LLT HalfTy = LLT::scalar(SizeInBits: NewBitSize);
6155 const LLT CondTy = LLT::scalar(SizeInBits: 1);
6156
6157 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: Amt, MRI)) {
6158 return narrowScalarShiftByConstant(MI, Amt: VRegAndVal->Value, HalfTy,
6159 AmtTy: ShiftAmtTy);
6160 }
6161
6162 // TODO: Expand with known bits.
6163
6164 // Handle the fully general expansion by an unknown amount.
6165 auto NewBits = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize);
6166
6167 Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
6168 Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
6169 MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));
6170
6171 auto AmtExcess = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: Amt, Src1: NewBits);
6172 auto AmtLack = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: NewBits, Src1: Amt);
6173
6174 auto Zero = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6175 auto IsShort = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: CondTy, Op0: Amt, Op1: NewBits);
6176 auto IsZero = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: CondTy, Op0: Amt, Op1: Zero);
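// IsShort picks the funnel-shift style combination of the two halves;
// IsZero guards Amt == 0, where AmtLack would equal NewBitSize and make
// the cross-half shift poison, so the original half is passed through.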
6177
6178 Register ResultRegs[2];
6179 switch (MI.getOpcode()) {
6180 case TargetOpcode::G_SHL: {
6181 // Short: ShAmt < NewBitSize
6182 auto LoS = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: Amt);
6183
6184 auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: AmtLack);
6185 auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: Amt);
6186 auto HiS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);
6187
6188 // Long: ShAmt >= NewBitSize
6189 auto LoL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Lo part is zero.
6190 auto HiL = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: AmtExcess); // Hi from Lo part.
6191
6192 auto Lo = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL);
6193 auto Hi = MIRBuilder.buildSelect(
6194 Res: HalfTy, Tst: IsZero, Op0: InH, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL));
6195
6196 ResultRegs[0] = Lo.getReg(Idx: 0);
6197 ResultRegs[1] = Hi.getReg(Idx: 0);
6198 break;
6199 }
6200 case TargetOpcode::G_LSHR:
6201 case TargetOpcode::G_ASHR: {
6202 // Short: ShAmt < NewBitSize
6203 auto HiS = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy}, SrcOps: {InH, Amt});
6204
6205 auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: Amt);
6206 auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: AmtLack);
6207 auto LoS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);
6208
6209 // Long: ShAmt >= NewBitSize
6210 MachineInstrBuilder HiL;
6211 if (MI.getOpcode() == TargetOpcode::G_LSHR) {
6212 HiL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Hi part is zero.
6213 } else {
6214 auto ShiftAmt = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize - 1);
6215 HiL = MIRBuilder.buildAShr(Dst: HalfTy, Src0: InH, Src1: ShiftAmt); // Sign of Hi part.
6216 }
6217 auto LoL = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy},
6218 SrcOps: {InH, AmtExcess}); // Lo from Hi part.
6219
6220 auto Lo = MIRBuilder.buildSelect(
6221 Res: HalfTy, Tst: IsZero, Op0: InL, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL));
6222
6223 auto Hi = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL);
6224
6225 ResultRegs[0] = Lo.getReg(Idx: 0);
6226 ResultRegs[1] = Hi.getReg(Idx: 0);
6227 break;
6228 }
6229 default:
6230 llvm_unreachable("not a shift");
6231 }
6232
6233 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: ResultRegs);
6234 MI.eraseFromParent();
6235 return Legalized;
6236}
6237
6238Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode,
6239 unsigned PartIdx,
6240 unsigned NumParts,
6241 ArrayRef<Register> SrcParts,
6242 const ShiftParams &Params,
6243 LLT TargetTy, LLT ShiftAmtTy) {
6244 auto WordShiftConst = getIConstantVRegVal(VReg: Params.WordShift, MRI);
6245 auto BitShiftConst = getIConstantVRegVal(VReg: Params.BitShift, MRI);
6246 assert(WordShiftConst && BitShiftConst && "Expected constants");
6247
6248 const unsigned ShiftWords = WordShiftConst->getZExtValue();
6249 const unsigned ShiftBits = BitShiftConst->getZExtValue();
6250 const bool NeedsInterWordShift = ShiftBits != 0;
6251
6252 switch (Opcode) {
6253 case TargetOpcode::G_SHL: {
6254 // Data moves from lower indices to higher indices
6255 // If this part would come from a source beyond our range, it's zero
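// e.g. a 4-part value shifted left by one whole word (ShiftWords = 1)
// yields Dst = [Src2, Src1, Src0, 0] (high to low): part 0 is the zero
// fill and part N takes SrcParts[N - 1].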
6256 if (PartIdx < ShiftWords)
6257 return Params.Zero;
6258
6259 unsigned SrcIdx = PartIdx - ShiftWords;
6260 if (!NeedsInterWordShift)
6261 return SrcParts[SrcIdx];
6262
6263 // Combine shifted main part with carry from previous part
6264 auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
6265 if (SrcIdx > 0) {
6266 auto Lo = MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx - 1],
6267 Src1: Params.InvBitShift);
6268 return MIRBuilder.buildOr(Dst: TargetTy, Src0: Hi, Src1: Lo).getReg(Idx: 0);
6269 }
6270 return Hi.getReg(Idx: 0);
6271 }
6272
6273 case TargetOpcode::G_LSHR: {
6274 unsigned SrcIdx = PartIdx + ShiftWords;
6275 if (SrcIdx >= NumParts)
6276 return Params.Zero;
6277 if (!NeedsInterWordShift)
6278 return SrcParts[SrcIdx];
6279
6280 // Combine shifted main part with carry from next part
6281 auto Lo = MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
6282 if (SrcIdx + 1 < NumParts) {
6283 auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: SrcParts[SrcIdx + 1],
6284 Src1: Params.InvBitShift);
6285 return MIRBuilder.buildOr(Dst: TargetTy, Src0: Lo, Src1: Hi).getReg(Idx: 0);
6286 }
6287 return Lo.getReg(Idx: 0);
6288 }
6289
6290 case TargetOpcode::G_ASHR: {
6291 // Like LSHR but preserves sign bit
6292 unsigned SrcIdx = PartIdx + ShiftWords;
6293 if (SrcIdx >= NumParts)
6294 return Params.SignBit;
6295 if (!NeedsInterWordShift)
6296 return SrcParts[SrcIdx];
6297
6298 // Only the original MSB part uses arithmetic shift to preserve sign. All
6299 // other parts use logical shift since they're just moving data bits.
6300 auto Lo =
6301 (SrcIdx == NumParts - 1)
6302 ? MIRBuilder.buildAShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift)
6303 : MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
6304 Register HiSrc =
6305 (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit;
6306 auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: HiSrc, Src1: Params.InvBitShift);
6307 return MIRBuilder.buildOr(Dst: TargetTy, Src0: Lo, Src1: Hi).getReg(Idx: 0);
6308 }
6309
6310 default:
6311 llvm_unreachable("not a shift");
6312 }
6313}
6314
6315Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode,
6316 Register MainOperand,
6317 Register ShiftAmt,
6318 LLT TargetTy,
6319 Register CarryOperand) {
6320 // This helper generates a single output part for variable shifts by combining
6321 // the main operand (shifted by BitShift) with carry bits from an adjacent
6322 // part.
6323
6324 // For G_ASHR, individual parts don't have their own sign bit, only the
6325 // complete value does. So we use LSHR for the main operand shift in ASHR
6326 // context.
6327 unsigned MainOpcode =
6328 (Opcode == TargetOpcode::G_ASHR) ? TargetOpcode::G_LSHR : Opcode;
6329
6330 // Perform the primary shift on the main operand
6331 Register MainShifted =
6332 MIRBuilder.buildInstr(Opc: MainOpcode, DstOps: {TargetTy}, SrcOps: {MainOperand, ShiftAmt})
6333 .getReg(Idx: 0);
6334
6335 // No carry operand available
6336 if (!CarryOperand.isValid())
6337 return MainShifted;
6338
6339 // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs,
6340 // so carry bits aren't needed.
6341 LLT ShiftAmtTy = MRI.getType(Reg: ShiftAmt);
6342 auto ZeroConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6343 LLT BoolTy = LLT::scalar(SizeInBits: 1);
6344 auto IsZeroBitShift =
6345 MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy, Op0: ShiftAmt, Op1: ZeroConst);
6346
6347 // Extract bits from the adjacent part that will "carry over" into this part.
6348 // The carry direction is opposite to the main shift direction, so we can
6349 // align the two shifted values before combining them with OR.
6350
6351 // Determine the carry shift opcode (opposite direction)
6352 unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR
6353 : TargetOpcode::G_SHL;
6354
6355 // Calculate inverse shift amount: BitWidth - ShiftAmt
6356 auto TargetBitsConst =
6357 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetTy.getScalarSizeInBits());
6358 auto InvShiftAmt = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: TargetBitsConst, Src1: ShiftAmt);
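// e.g. a left shift by 18 with 32-bit parts combines
// (Main << 18) | (Carry >> 14), moving the carry word's top 18 bits into
// the low bits of this part.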
6359
6360 // Shift the carry operand
6361 Register CarryBits =
6362 MIRBuilder
6363 .buildInstr(Opc: CarryOpcode, DstOps: {TargetTy}, SrcOps: {CarryOperand, InvShiftAmt})
6364 .getReg(Idx: 0);
6365
6366 // If BitShift is 0, don't include carry bits (InvShiftAmt would equal
6367 // TargetBits, which would be poison for the individual carry shift operation).
6368 auto ZeroReg = MIRBuilder.buildConstant(Res: TargetTy, Val: 0);
6369 Register SafeCarryBits =
6370 MIRBuilder.buildSelect(Res: TargetTy, Tst: IsZeroBitShift, Op0: ZeroReg, Op1: CarryBits)
6371 .getReg(Idx: 0);
6372
6373 // Combine the main shifted part with the carry bits
6374 return MIRBuilder.buildOr(Dst: TargetTy, Src0: MainShifted, Src1: SafeCarryBits).getReg(Idx: 0);
6375}
6376
6377LegalizerHelper::LegalizeResult
6378LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI,
6379 const APInt &Amt,
6380 LLT TargetTy,
6381 LLT ShiftAmtTy) {
6382 // Any wide shift can be decomposed into WordShift + BitShift components.
6383 // When the shift amount is a known constant, directly compute the
6384 // decomposition values and generate constant registers.
6385 Register DstReg = MI.getOperand(i: 0).getReg();
6386 Register SrcReg = MI.getOperand(i: 1).getReg();
6387 LLT DstTy = MRI.getType(Reg: DstReg);
6388
6389 const unsigned DstBits = DstTy.getScalarSizeInBits();
6390 const unsigned TargetBits = TargetTy.getScalarSizeInBits();
6391 const unsigned NumParts = DstBits / TargetBits;
6392
6393 assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
6394
6395 // When the shift amount is known at compile time, we just calculate which
6396 // source parts contribute to each output part.
6397
6398 SmallVector<Register, 8> SrcParts;
6399 extractParts(Reg: SrcReg, Ty: TargetTy, NumParts, VRegs&: SrcParts, MIRBuilder, MRI);
6400
6401 if (Amt.isZero()) {
6402 // No shift needed, just copy
6403 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcParts);
6404 MI.eraseFromParent();
6405 return Legalized;
6406 }
6407
6408 ShiftParams Params;
6409 const unsigned ShiftWords = Amt.getZExtValue() / TargetBits;
6410 const unsigned ShiftBits = Amt.getZExtValue() % TargetBits;
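// e.g. a 128-bit shift by 50 with 32-bit parts gives
// ShiftWords = 50 / 32 = 1 and ShiftBits = 50 % 32 = 18.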
6411
6412 // Generate constants and values needed by all shift types
6413 Params.WordShift = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: ShiftWords).getReg(Idx: 0);
6414 Params.BitShift = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: ShiftBits).getReg(Idx: 0);
6415 Params.InvBitShift =
6416 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - ShiftBits).getReg(Idx: 0);
6417 Params.Zero = MIRBuilder.buildConstant(Res: TargetTy, Val: 0).getReg(Idx: 0);
6418
6419 // For ASHR, we need the sign-extended value to fill shifted-out positions
6420 if (MI.getOpcode() == TargetOpcode::G_ASHR)
6421 Params.SignBit =
6422 MIRBuilder
6423 .buildAShr(Dst: TargetTy, Src0: SrcParts[SrcParts.size() - 1],
6424 Src1: MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1))
6425 .getReg(Idx: 0);
6426
6427 SmallVector<Register, 8> DstParts(NumParts);
6428 for (unsigned I = 0; I < NumParts; ++I)
6429 DstParts[I] = buildConstantShiftPart(Opcode: MI.getOpcode(), PartIdx: I, NumParts, SrcParts,
6430 Params, TargetTy, ShiftAmtTy);
6431
6432 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstParts);
6433 MI.eraseFromParent();
6434 return Legalized;
6435}
6436
6437LegalizerHelper::LegalizeResult
6438LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) {
6439 Register DstReg = MI.getOperand(i: 0).getReg();
6440 Register SrcReg = MI.getOperand(i: 1).getReg();
6441 Register AmtReg = MI.getOperand(i: 2).getReg();
6442 LLT DstTy = MRI.getType(Reg: DstReg);
6443 LLT ShiftAmtTy = MRI.getType(Reg: AmtReg);
6444
6445 const unsigned DstBits = DstTy.getScalarSizeInBits();
6446 const unsigned TargetBits = TargetTy.getScalarSizeInBits();
6447 const unsigned NumParts = DstBits / TargetBits;
6448
6449 assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
6450 assert(isPowerOf2_32(TargetBits) && "Target bit width must be power of 2");
6451
6452 // If the shift amount is known at compile time, we can use direct indexing
6453 // instead of generating select chains in the general case.
6454 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: AmtReg, MRI))
6455 return narrowScalarShiftByConstantMultiway(MI, Amt: VRegAndVal->Value, TargetTy,
6456 ShiftAmtTy);
6457
6458 // For runtime-variable shift amounts, we must generate a more complex
6459 // sequence that handles all possible shift values using select chains.
6460
6461 // Split the input into target-sized pieces
6462 SmallVector<Register, 8> SrcParts;
6463 extractParts(Reg: SrcReg, Ty: TargetTy, NumParts, VRegs&: SrcParts, MIRBuilder, MRI);
6464
6465 // Shifting by zero should be a no-op.
6466 auto ZeroAmtConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6467 LLT BoolTy = LLT::scalar(SizeInBits: 1);
6468 auto IsZeroShift =
6469 MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy, Op0: AmtReg, Op1: ZeroAmtConst);
6470
6471 // Any wide shift can be decomposed into two components:
6472 // 1. WordShift: number of complete target-sized words to shift
6473 // 2. BitShift: number of bits to shift within each word
6474 //
6475 // Example: 128-bit >> 50 with 32-bit target:
6476 // WordShift = 50 / 32 = 1 (shift right by 1 complete word)
6477 // BitShift = 50 % 32 = 18 (shift each word right by 18 bits)
6478 unsigned TargetBitsLog2 = Log2_32(Value: TargetBits);
6479 auto TargetBitsLog2Const =
6480 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBitsLog2);
6481 auto TargetBitsMask = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1);
6482
6483 Register WordShift =
6484 MIRBuilder.buildLShr(Dst: ShiftAmtTy, Src0: AmtReg, Src1: TargetBitsLog2Const).getReg(Idx: 0);
6485 Register BitShift =
6486 MIRBuilder.buildAnd(Dst: ShiftAmtTy, Src0: AmtReg, Src1: TargetBitsMask).getReg(Idx: 0);
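// Since TargetBits is a power of two (asserted above), the div/mod
// decomposition reduces to a shift and a mask, e.g. for 32-bit parts:
// WordShift = Amt >> 5, BitShift = Amt & 31.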
6487
6488 // Fill values:
6489 // - SHL/LSHR: fill with zeros
6490 // - ASHR: fill with sign-extended MSB
6491 Register ZeroReg = MIRBuilder.buildConstant(Res: TargetTy, Val: 0).getReg(Idx: 0);
6492
6493 Register FillValue;
6494 if (MI.getOpcode() == TargetOpcode::G_ASHR) {
6495 auto TargetBitsMinusOneConst =
6496 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1);
6497 FillValue = MIRBuilder
6498 .buildAShr(Dst: TargetTy, Src0: SrcParts[NumParts - 1],
6499 Src1: TargetBitsMinusOneConst)
6500 .getReg(Idx: 0);
6501 } else {
6502 FillValue = ZeroReg;
6503 }
6504
6505 SmallVector<Register, 8> DstParts(NumParts);
6506
6507 // For each output part, generate a select chain that chooses the correct
6508 // result based on the runtime WordShift value. This handles all possible
6509 // word shift amounts by pre-calculating what each would produce.
6510 for (unsigned I = 0; I < NumParts; ++I) {
6511 // Initialize with appropriate default value for this shift type
6512 Register InBoundsResult = FillValue;
6513
6514 // clang-format off
6515 // Build a branchless select chain by pre-computing results for all possible
6516 // WordShift values (0 to NumParts-1). Each iteration nests a new select:
6517 //
6518 // K=0: select(WordShift==0, result0, FillValue)
6519 // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue))
6520 // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...)))
6521 // clang-format on
6522 for (unsigned K = 0; K < NumParts; ++K) {
6523 auto WordShiftKConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: K);
6524 auto IsWordShiftK = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy,
6525 Op0: WordShift, Op1: WordShiftKConst);
6526
6527 // Calculate source indices for this word shift
6528 //
6529 // For 4-part 128-bit value with K=1 word shift:
6530 // SHL: [3][2][1][0] << K => [2][1][0][Z]
6531 // -> (MainIdx = I-K, CarryIdx = I-K-1)
6532 // LSHR: [3][2][1][0] >> K => [Z][3][2][1]
6533 // -> (MainIdx = I+K, CarryIdx = I+K+1)
6534 int MainSrcIdx;
6535 int CarrySrcIdx; // Index for the word that provides the carried-in bits.
6536
6537 switch (MI.getOpcode()) {
6538 case TargetOpcode::G_SHL:
6539 MainSrcIdx = (int)I - (int)K;
6540 CarrySrcIdx = MainSrcIdx - 1;
6541 break;
6542 case TargetOpcode::G_LSHR:
6543 case TargetOpcode::G_ASHR:
6544 MainSrcIdx = (int)I + (int)K;
6545 CarrySrcIdx = MainSrcIdx + 1;
6546 break;
6547 default:
6548 llvm_unreachable("Not a shift");
6549 }
6550
6551 // Check bounds and build the result for this word shift
6552 Register ResultForK;
6553 if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) {
6554 Register MainOp = SrcParts[MainSrcIdx];
6555 Register CarryOp;
6556
6557 // Determine carry operand with bounds checking
6558 if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts)
6559 CarryOp = SrcParts[CarrySrcIdx];
6560 else if (MI.getOpcode() == TargetOpcode::G_ASHR &&
6561 CarrySrcIdx >= (int)NumParts)
6562 CarryOp = FillValue; // Use sign extension
6563
6564 ResultForK = buildVariableShiftPart(Opcode: MI.getOpcode(), MainOperand: MainOp, ShiftAmt: BitShift,
6565 TargetTy, CarryOperand: CarryOp);
6566 } else {
6567 // Out of bounds - use fill value for this K
6568 ResultForK = FillValue;
6569 }
6570
6571 // Select this result if WordShift equals K
6572 InBoundsResult =
6573 MIRBuilder
6574 .buildSelect(Res: TargetTy, Tst: IsWordShiftK, Op0: ResultForK, Op1: InBoundsResult)
6575 .getReg(Idx: 0);
6576 }
6577
6578 // Handle zero-shift special case: if shift is 0, use original input
6579 DstParts[I] =
6580 MIRBuilder
6581 .buildSelect(Res: TargetTy, Tst: IsZeroShift, Op0: SrcParts[I], Op1: InBoundsResult)
6582 .getReg(Idx: 0);
6583 }
6584
6585 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstParts);
6586 MI.eraseFromParent();
6587 return Legalized;
6588}
6589
6590LegalizerHelper::LegalizeResult
6591LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
6592 LLT MoreTy) {
6593 assert(TypeIdx == 0 && "Expecting only Idx 0");
6594
6595 Observer.changingInstr(MI);
6596 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6597 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
6598 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminator());
6599 moreElementsVectorSrc(MI, MoreTy, OpIdx: I);
6600 }
6601
6602 MachineBasicBlock &MBB = *MI.getParent();
6603 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
6604 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6605 Observer.changedInstr(MI);
6606 return Legalized;
6607}
6608
6609MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6610 unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6611 assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6612
6613 switch (Opcode) {
6614 default:
6615 llvm_unreachable(
6616 "getNeutralElementForVecReduce called with invalid opcode!");
6617 case TargetOpcode::G_VECREDUCE_ADD:
6618 case TargetOpcode::G_VECREDUCE_OR:
6619 case TargetOpcode::G_VECREDUCE_XOR:
6620 case TargetOpcode::G_VECREDUCE_UMAX:
6621 return MIRBuilder.buildConstant(Res: Ty, Val: 0);
6622 case TargetOpcode::G_VECREDUCE_MUL:
6623 return MIRBuilder.buildConstant(Res: Ty, Val: 1);
6624 case TargetOpcode::G_VECREDUCE_AND:
6625 case TargetOpcode::G_VECREDUCE_UMIN:
6626 return MIRBuilder.buildConstant(
6627 Res: Ty, Val: APInt::getAllOnes(numBits: Ty.getScalarSizeInBits()));
6628 case TargetOpcode::G_VECREDUCE_SMAX:
6629 return MIRBuilder.buildConstant(
6630 Res: Ty, Val: APInt::getSignedMinValue(numBits: Ty.getSizeInBits()));
6631 case TargetOpcode::G_VECREDUCE_SMIN:
6632 return MIRBuilder.buildConstant(
6633 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getSizeInBits()));
6634 case TargetOpcode::G_VECREDUCE_FADD:
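// -0.0 (not +0.0) is the FADD identity under default IEEE rounding:
// -0.0 + X == X for every X including +0.0, whereas a +0.0 identity
// would turn a -0.0 input into +0.0.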
6635 return MIRBuilder.buildFConstant(Res: Ty, Val: -0.0);
6636 case TargetOpcode::G_VECREDUCE_FMUL:
6637 return MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
6638 case TargetOpcode::G_VECREDUCE_FMINIMUM:
6639 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6640 assert(false && "getNeutralElementForVecReduce unimplemented for "
6641 "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6642 }
6643 llvm_unreachable("switch expected to return!");
6644}
6645
6646LegalizerHelper::LegalizeResult
6647LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
6648 LLT MoreTy) {
6649 unsigned Opc = MI.getOpcode();
6650 switch (Opc) {
6651 case TargetOpcode::G_IMPLICIT_DEF:
6652 case TargetOpcode::G_LOAD: {
6653 if (TypeIdx != 0)
6654 return UnableToLegalize;
6655 Observer.changingInstr(MI);
6656 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6657 Observer.changedInstr(MI);
6658 return Legalized;
6659 }
6660 case TargetOpcode::G_STORE:
6661 if (TypeIdx != 0)
6662 return UnableToLegalize;
6663 Observer.changingInstr(MI);
6664 moreElementsVectorSrc(MI, MoreTy, OpIdx: 0);
6665 Observer.changedInstr(MI);
6666 return Legalized;
6667 case TargetOpcode::G_AND:
6668 case TargetOpcode::G_OR:
6669 case TargetOpcode::G_XOR:
6670 case TargetOpcode::G_ADD:
6671 case TargetOpcode::G_SUB:
6672 case TargetOpcode::G_MUL:
6673 case TargetOpcode::G_FADD:
6674 case TargetOpcode::G_FSUB:
6675 case TargetOpcode::G_FMUL:
6676 case TargetOpcode::G_FDIV:
6677 case TargetOpcode::G_FCOPYSIGN:
6678 case TargetOpcode::G_UADDSAT:
6679 case TargetOpcode::G_USUBSAT:
6680 case TargetOpcode::G_SADDSAT:
6681 case TargetOpcode::G_SSUBSAT:
6682 case TargetOpcode::G_SMIN:
6683 case TargetOpcode::G_SMAX:
6684 case TargetOpcode::G_UMIN:
6685 case TargetOpcode::G_UMAX:
6686 case TargetOpcode::G_FMINNUM:
6687 case TargetOpcode::G_FMAXNUM:
6688 case TargetOpcode::G_FMINNUM_IEEE:
6689 case TargetOpcode::G_FMAXNUM_IEEE:
6690 case TargetOpcode::G_FMINIMUM:
6691 case TargetOpcode::G_FMAXIMUM:
6692 case TargetOpcode::G_FMINIMUMNUM:
6693 case TargetOpcode::G_FMAXIMUMNUM:
6694 case TargetOpcode::G_STRICT_FADD:
6695 case TargetOpcode::G_STRICT_FSUB:
6696 case TargetOpcode::G_STRICT_FMUL: {
6697 Observer.changingInstr(MI);
6698 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6699 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6700 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6701 Observer.changedInstr(MI);
6702 return Legalized;
6703 }
6704 case TargetOpcode::G_SHL:
6705 case TargetOpcode::G_ASHR:
6706 case TargetOpcode::G_LSHR: {
6707 Observer.changingInstr(MI);
6708 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6709 // The shift operand may have a different scalar type from the source and
6710 // destination operands.
6711 LLT ShiftMoreTy = MoreTy.changeElementType(
6712 NewEltTy: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getElementType());
6713 moreElementsVectorSrc(MI, MoreTy: ShiftMoreTy, OpIdx: 2);
6714 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6715 Observer.changedInstr(MI);
6716 return Legalized;
6717 }
6718 case TargetOpcode::G_FMA:
6719 case TargetOpcode::G_STRICT_FMA:
6720 case TargetOpcode::G_FSHR:
6721 case TargetOpcode::G_FSHL: {
6722 Observer.changingInstr(MI);
6723 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6724 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6725 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
6726 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6727 Observer.changedInstr(MI);
6728 return Legalized;
6729 }
6730 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
6731 case TargetOpcode::G_EXTRACT:
6732 if (TypeIdx != 1)
6733 return UnableToLegalize;
6734 Observer.changingInstr(MI);
6735 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6736 Observer.changedInstr(MI);
6737 return Legalized;
6738 case TargetOpcode::G_INSERT:
6739 case TargetOpcode::G_INSERT_VECTOR_ELT:
6740 case TargetOpcode::G_FREEZE:
6741 case TargetOpcode::G_FNEG:
6742 case TargetOpcode::G_FABS:
6743 case TargetOpcode::G_FSQRT:
6744 case TargetOpcode::G_FCEIL:
6745 case TargetOpcode::G_FFLOOR:
6746 case TargetOpcode::G_FNEARBYINT:
6747 case TargetOpcode::G_FRINT:
6748 case TargetOpcode::G_INTRINSIC_ROUND:
6749 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
6750 case TargetOpcode::G_INTRINSIC_TRUNC:
6751 case TargetOpcode::G_BITREVERSE:
6752 case TargetOpcode::G_BSWAP:
6753 case TargetOpcode::G_FCANONICALIZE:
6754 case TargetOpcode::G_SEXT_INREG:
6755 case TargetOpcode::G_ABS:
6756 case TargetOpcode::G_CTLZ:
6757 case TargetOpcode::G_CTPOP:
6758 if (TypeIdx != 0)
6759 return UnableToLegalize;
6760 Observer.changingInstr(MI);
6761 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
6762 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6763 Observer.changedInstr(MI);
6764 return Legalized;
6765 case TargetOpcode::G_SELECT: {
6766 auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
6767 if (TypeIdx == 1) {
6768 if (!CondTy.isScalar() ||
6769 DstTy.getElementCount() != MoreTy.getElementCount())
6770 return UnableToLegalize;
6771
6772 // This is turning a scalar select of vectors into a vector
6773 // select. Broadcast the select condition.
6774 auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: MoreTy, Src: CondReg);
6775 Observer.changingInstr(MI);
6776 MI.getOperand(i: 1).setReg(ShufSplat.getReg(Idx: 0));
6777 Observer.changedInstr(MI);
6778 return Legalized;
6779 }
6780
6781 if (CondTy.isVector())
6782 return UnableToLegalize;
6783
6784 Observer.changingInstr(MI);
6785 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6786 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
6787 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6788 Observer.changedInstr(MI);
6789 return Legalized;
6790 }
6791 case TargetOpcode::G_UNMERGE_VALUES:
6792 return UnableToLegalize;
6793 case TargetOpcode::G_PHI:
6794 return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
6795 case TargetOpcode::G_SHUFFLE_VECTOR:
6796 return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
6797 case TargetOpcode::G_BUILD_VECTOR: {
6798 SmallVector<SrcOp, 8> Elts;
6799 for (auto Op : MI.uses()) {
6800 Elts.push_back(Elt: Op.getReg());
6801 }
6802
6803 for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
6804 Elts.push_back(Elt: MIRBuilder.buildUndef(Res: MoreTy.getScalarType()));
6805 }
6806
6807 MIRBuilder.buildDeleteTrailingVectorElements(
6808 Res: MI.getOperand(i: 0).getReg(), Op0: MIRBuilder.buildInstr(Opc, DstOps: {MoreTy}, SrcOps: Elts));
6809 MI.eraseFromParent();
6810 return Legalized;
6811 }
6812 case TargetOpcode::G_SEXT:
6813 case TargetOpcode::G_ZEXT:
6814 case TargetOpcode::G_ANYEXT:
6815 case TargetOpcode::G_TRUNC:
6816 case TargetOpcode::G_FPTRUNC:
6817 case TargetOpcode::G_FPEXT:
6818 case TargetOpcode::G_FPTOSI:
6819 case TargetOpcode::G_FPTOUI:
6820 case TargetOpcode::G_FPTOSI_SAT:
6821 case TargetOpcode::G_FPTOUI_SAT:
6822 case TargetOpcode::G_SITOFP:
6823 case TargetOpcode::G_UITOFP: {
6824 Observer.changingInstr(MI);
6825 LLT SrcExtTy;
6826 LLT DstExtTy;
6827 if (TypeIdx == 0) {
6828 DstExtTy = MoreTy;
6829 SrcExtTy = MoreTy.changeElementType(
6830 NewEltTy: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getElementType());
6831 } else {
6832 DstExtTy = MoreTy.changeElementType(
6833 NewEltTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
6834 SrcExtTy = MoreTy;
6835 }
6836 moreElementsVectorSrc(MI, MoreTy: SrcExtTy, OpIdx: 1);
6837 moreElementsVectorDst(MI, WideTy: DstExtTy, OpIdx: 0);
6838 Observer.changedInstr(MI);
6839 return Legalized;
6840 }
6841 case TargetOpcode::G_ICMP:
6842 case TargetOpcode::G_FCMP: {
6843 if (TypeIdx != 1)
6844 return UnableToLegalize;
6845
6846 Observer.changingInstr(MI);
6847 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
6848 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
6849 LLT CondTy = MoreTy.changeVectorElementType(
6850 NewEltTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
6851 moreElementsVectorDst(MI, WideTy: CondTy, OpIdx: 0);
6852 Observer.changedInstr(MI);
6853 return Legalized;
6854 }
6855 case TargetOpcode::G_BITCAST: {
6856 if (TypeIdx != 0)
6857 return UnableToLegalize;
6858
6859 LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
6860 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
6861
6862 unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
6863 if (coefficient % DstTy.getNumElements() != 0)
6864 return UnableToLegalize;
6865
6866 coefficient = coefficient / DstTy.getNumElements();
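// e.g. for a <4 x s8> -> <2 x s16> bitcast widened to <8 x s16>:
// coefficient = 4 * 8 / 2 = 16, so the source is widened to <16 x s8>
// and both sides stay 128 bits wide.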
6867
6868 LLT NewTy = SrcTy.changeElementCount(
6869 EC: ElementCount::get(MinVal: coefficient, Scalable: MoreTy.isScalable()));
6870 Observer.changingInstr(MI);
6871 moreElementsVectorSrc(MI, MoreTy: NewTy, OpIdx: 1);
6872 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
6873 Observer.changedInstr(MI);
6874 return Legalized;
6875 }
6876 case TargetOpcode::G_VECREDUCE_FADD:
6877 case TargetOpcode::G_VECREDUCE_FMUL:
6878 case TargetOpcode::G_VECREDUCE_ADD:
6879 case TargetOpcode::G_VECREDUCE_MUL:
6880 case TargetOpcode::G_VECREDUCE_AND:
6881 case TargetOpcode::G_VECREDUCE_OR:
6882 case TargetOpcode::G_VECREDUCE_XOR:
6883 case TargetOpcode::G_VECREDUCE_SMAX:
6884 case TargetOpcode::G_VECREDUCE_SMIN:
6885 case TargetOpcode::G_VECREDUCE_UMAX:
6886 case TargetOpcode::G_VECREDUCE_UMIN: {
6887 LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
6888 MachineOperand &MO = MI.getOperand(i: 1);
6889 auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO);
6890 auto NeutralElement = getNeutralElementForVecReduce(
6891 Opcode: MI.getOpcode(), MIRBuilder, Ty: MoreTy.getElementType());
6892
6893 LLT IdxTy(TLI.getVectorIdxLLT(DL: MIRBuilder.getDataLayout()));
6894 for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
6895 i != e; i++) {
6896 auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: i);
6897 NewVec = MIRBuilder.buildInsertVectorElement(Res: MoreTy, Val: NewVec,
6898 Elt: NeutralElement, Idx);
6899 }
6900
6901 Observer.changingInstr(MI);
6902 MO.setReg(NewVec.getReg(Idx: 0));
6903 Observer.changedInstr(MI);
6904 return Legalized;
6905 }
6906
6907 default:
6908 return UnableToLegalize;
6909 }
6910}
6911
6912LegalizerHelper::LegalizeResult
6913LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
6914 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6915 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
6916 unsigned MaskNumElts = Mask.size();
6917 unsigned SrcNumElts = SrcTy.getNumElements();
6918 LLT DestEltTy = DstTy.getElementType();
6919
6920 if (MaskNumElts == SrcNumElts)
6921 return Legalized;
6922
6923 if (MaskNumElts < SrcNumElts) {
6924 // Extend mask to match new destination vector size with
6925 // undef values.
6926 SmallVector<int, 16> NewMask(SrcNumElts, -1);
6927 llvm::copy(Range&: Mask, Out: NewMask.begin());
6928
6929 moreElementsVectorDst(MI, WideTy: SrcTy, OpIdx: 0);
6930 MIRBuilder.setInstrAndDebugLoc(MI);
6931 MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
6932 Src1: MI.getOperand(i: 1).getReg(),
6933 Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
6934 MI.eraseFromParent();
6935
6936 return Legalized;
6937 }
6938
6939 unsigned PaddedMaskNumElts = alignTo(Value: MaskNumElts, Align: SrcNumElts);
6940 unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
6941 LLT PaddedTy =
6942 DstTy.changeVectorElementCount(EC: ElementCount::getFixed(MinVal: PaddedMaskNumElts));
6943
6944 // Create new source vectors by concatenating the initial
6945 // source vectors with undefined vectors of the same size.
6946 auto Undef = MIRBuilder.buildUndef(Res: SrcTy);
6947 SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(Idx: 0));
6948 SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(Idx: 0));
6949 MOps1[0] = MI.getOperand(i: 1).getReg();
6950 MOps2[0] = MI.getOperand(i: 2).getReg();
6951
6952 auto Src1 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps1);
6953 auto Src2 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps2);
6954
6955 // Readjust mask for new input vector length.
6956 SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
6957 for (unsigned I = 0; I != MaskNumElts; ++I) {
6958 int Idx = Mask[I];
6959 if (Idx >= static_cast<int>(SrcNumElts))
6960 Idx += PaddedMaskNumElts - SrcNumElts;
6961 MappedOps[I] = Idx;
6962 }
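// e.g. with SrcNumElts = 4 and MaskNumElts = 6, the inputs are padded to
// 8 lanes each, and indices that referred to the second input are moved
// up by PaddedMaskNumElts - SrcNumElts = 4 to skip the new undef lanes.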
6963
6964 // If we got more elements than required, extract subvector.
6965 if (MaskNumElts != PaddedMaskNumElts) {
6966 auto Shuffle =
6967 MIRBuilder.buildShuffleVector(Res: PaddedTy, Src1, Src2, Mask: MappedOps);
6968
6969 SmallVector<Register, 16> Elts(MaskNumElts);
6970 for (unsigned I = 0; I < MaskNumElts; ++I) {
6971 Elts[I] =
6972 MIRBuilder.buildExtractVectorElementConstant(Res: DestEltTy, Val: Shuffle, Idx: I)
6973 .getReg(Idx: 0);
6974 }
6975 MIRBuilder.buildBuildVector(Res: DstReg, Ops: Elts);
6976 } else {
6977 MIRBuilder.buildShuffleVector(Res: DstReg, Src1, Src2, Mask: MappedOps);
6978 }
6979
6980 MI.eraseFromParent();
6981 return LegalizerHelper::LegalizeResult::Legalized;
6982}
6983
6984LegalizerHelper::LegalizeResult
6985LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
6986 unsigned int TypeIdx, LLT MoreTy) {
6987 auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
6988 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
6989 unsigned NumElts = DstTy.getNumElements();
6990 unsigned WidenNumElts = MoreTy.getNumElements();
6991
6992 if (DstTy.isVector() && Src1Ty.isVector() &&
6993 DstTy.getNumElements() != Src1Ty.getNumElements()) {
6994 return equalizeVectorShuffleLengths(MI);
6995 }
6996
6997 if (TypeIdx != 0)
6998 return UnableToLegalize;
6999
7000 // Expect a canonicalized shuffle.
7001 if (DstTy != Src1Ty || DstTy != Src2Ty)
7002 return UnableToLegalize;
7003
7004 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
7005 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
7006
7007 // Adjust mask based on new input vector length.
7008 SmallVector<int, 16> NewMask(WidenNumElts, -1);
7009 for (unsigned I = 0; I != NumElts; ++I) {
7010 int Idx = Mask[I];
7011 if (Idx < static_cast<int>(NumElts))
7012 NewMask[I] = Idx;
7013 else
7014 NewMask[I] = Idx - NumElts + WidenNumElts;
7015 }
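// e.g. widening <2 x s32> to <4 x s32>: mask index 2 (lane 0 of the
// second input) becomes 2 - 2 + 4 = 4, the first lane of the widened
// second operand.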
7016 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
7017 MIRBuilder.setInstrAndDebugLoc(MI);
7018 MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
7019 Src1: MI.getOperand(i: 1).getReg(),
7020 Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
7021 MI.eraseFromParent();
7022 return Legalized;
7023}
7024
7025void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
7026 ArrayRef<Register> Src1Regs,
7027 ArrayRef<Register> Src2Regs,
7028 LLT NarrowTy) {
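// Schoolbook long multiplication: column K of the result sums the low
// halves of Src1[I] * Src2[J] with I + J == K, the high halves of the
// products with I + J == K - 1, and the carries from column K - 1.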
7029 MachineIRBuilder &B = MIRBuilder;
7030 unsigned SrcParts = Src1Regs.size();
7031 unsigned DstParts = DstRegs.size();
7032
7033 unsigned DstIdx = 0; // Low bits of the result.
7034 Register FactorSum =
7035 B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx], Src1: Src2Regs[DstIdx]).getReg(Idx: 0);
7036 DstRegs[DstIdx] = FactorSum;
7037
7038 Register CarrySumPrevDstIdx;
7039 SmallVector<Register, 4> Factors;
7040
7041 for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
7042 // Collect low parts of muls for DstIdx.
7043 for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
7044 i <= std::min(a: DstIdx, b: SrcParts - 1); ++i) {
7045 MachineInstrBuilder Mul =
7046 B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx - i], Src1: Src2Regs[i]);
7047 Factors.push_back(Elt: Mul.getReg(Idx: 0));
7048 }
7049 // Collect high parts of muls from previous DstIdx.
7050 for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
7051 i <= std::min(a: DstIdx - 1, b: SrcParts - 1); ++i) {
7052 MachineInstrBuilder Umulh =
7053 B.buildUMulH(Dst: NarrowTy, Src0: Src1Regs[DstIdx - 1 - i], Src1: Src2Regs[i]);
7054 Factors.push_back(Elt: Umulh.getReg(Idx: 0));
7055 }
7056 // Add CarrySum from additions calculated for previous DstIdx.
7057 if (DstIdx != 1) {
7058 Factors.push_back(Elt: CarrySumPrevDstIdx);
7059 }
7060
7061 Register CarrySum;
7062 // Add all factors and accumulate all carries into CarrySum.
7063 if (DstIdx != DstParts - 1) {
7064 MachineInstrBuilder Uaddo =
7065 B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: Factors[0], Op1: Factors[1]);
7066 FactorSum = Uaddo.getReg(Idx: 0);
7067 CarrySum = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1)).getReg(Idx: 0);
7068 for (unsigned i = 2; i < Factors.size(); ++i) {
7069 MachineInstrBuilder Uaddo =
7070 B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: FactorSum, Op1: Factors[i]);
7071 FactorSum = Uaddo.getReg(Idx: 0);
7072 MachineInstrBuilder Carry = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1));
7073 CarrySum = B.buildAdd(Dst: NarrowTy, Src0: CarrySum, Src1: Carry).getReg(Idx: 0);
7074 }
7075 } else {
7076 // Since the value for the next index is not calculated, neither is CarrySum.
7077 FactorSum = B.buildAdd(Dst: NarrowTy, Src0: Factors[0], Src1: Factors[1]).getReg(Idx: 0);
7078 for (unsigned i = 2; i < Factors.size(); ++i)
7079 FactorSum = B.buildAdd(Dst: NarrowTy, Src0: FactorSum, Src1: Factors[i]).getReg(Idx: 0);
7080 }
7081
7082 CarrySumPrevDstIdx = CarrySum;
7083 DstRegs[DstIdx] = FactorSum;
7084 Factors.clear();
7085 }
7086}
7087
7088LegalizerHelper::LegalizeResult
7089LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
7090 LLT NarrowTy) {
7091 if (TypeIdx != 0)
7092 return UnableToLegalize;
7093
7094 Register DstReg = MI.getOperand(i: 0).getReg();
7095 LLT DstType = MRI.getType(Reg: DstReg);
7096 // FIXME: add support for vector types
7097 if (DstType.isVector())
7098 return UnableToLegalize;
7099
7100 unsigned Opcode = MI.getOpcode();
7101 unsigned OpO, OpE, OpF;
7102 switch (Opcode) {
7103 case TargetOpcode::G_SADDO:
7104 case TargetOpcode::G_SADDE:
7105 case TargetOpcode::G_UADDO:
7106 case TargetOpcode::G_UADDE:
7107 case TargetOpcode::G_ADD:
7108 OpO = TargetOpcode::G_UADDO;
7109 OpE = TargetOpcode::G_UADDE;
7110 OpF = TargetOpcode::G_UADDE;
7111 if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
7112 OpF = TargetOpcode::G_SADDE;
7113 break;
7114 case TargetOpcode::G_SSUBO:
7115 case TargetOpcode::G_SSUBE:
7116 case TargetOpcode::G_USUBO:
7117 case TargetOpcode::G_USUBE:
7118 case TargetOpcode::G_SUB:
7119 OpO = TargetOpcode::G_USUBO;
7120 OpE = TargetOpcode::G_USUBE;
7121 OpF = TargetOpcode::G_USUBE;
7122 if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
7123 OpF = TargetOpcode::G_SSUBE;
7124 break;
7125 default:
7126 llvm_unreachable("Unexpected add/sub opcode!");
7127 }
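// A part with no incoming carry uses OpO; intermediate parts chain the
// carry through OpE; the top part uses OpF, which is the signed flavor
// only for signed overflow ops, since only the most significant part
// determines signed overflow.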
7128
7129 // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
7130 unsigned NumDefs = MI.getNumExplicitDefs();
7131 Register Src1 = MI.getOperand(i: NumDefs).getReg();
7132 Register Src2 = MI.getOperand(i: NumDefs + 1).getReg();
7133 Register CarryDst, CarryIn;
7134 if (NumDefs == 2)
7135 CarryDst = MI.getOperand(i: 1).getReg();
7136 if (MI.getNumOperands() == NumDefs + 3)
7137 CarryIn = MI.getOperand(i: NumDefs + 2).getReg();
7138
7139 LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
7140 LLT LeftoverTy, DummyTy;
7141 SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
7142 extractParts(Reg: Src1, RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: Src1Regs, LeftoverVRegs&: Src1Left,
7143 MIRBuilder, MRI);
7144 extractParts(Reg: Src2, RegTy, MainTy: NarrowTy, LeftoverTy&: DummyTy, VRegs&: Src2Regs, LeftoverVRegs&: Src2Left, MIRBuilder,
7145 MRI);
7146
7147 int NarrowParts = Src1Regs.size();
7148 Src1Regs.append(RHS: Src1Left);
7149 Src2Regs.append(RHS: Src2Left);
7150 DstRegs.reserve(N: Src1Regs.size());
7151
7152 for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
7153 Register DstReg =
7154 MRI.createGenericVirtualRegister(Ty: MRI.getType(Reg: Src1Regs[i]));
7155 Register CarryOut;
7156 // Forward the final carry-out to the destination register
7157 if (i == e - 1 && CarryDst)
7158 CarryOut = CarryDst;
7159 else
7160 CarryOut = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 1));
7161
7162 if (!CarryIn) {
7163 MIRBuilder.buildInstr(Opc: OpO, DstOps: {DstReg, CarryOut},
7164 SrcOps: {Src1Regs[i], Src2Regs[i]});
7165 } else if (i == e - 1) {
7166 MIRBuilder.buildInstr(Opc: OpF, DstOps: {DstReg, CarryOut},
7167 SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
7168 } else {
7169 MIRBuilder.buildInstr(Opc: OpE, DstOps: {DstReg, CarryOut},
7170 SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
7171 }
7172
7173 DstRegs.push_back(Elt: DstReg);
7174 CarryIn = CarryOut;
7175 }
7176 insertParts(DstReg: MI.getOperand(i: 0).getReg(), ResultTy: RegTy, PartTy: NarrowTy,
7177 PartRegs: ArrayRef(DstRegs).take_front(N: NarrowParts), LeftoverTy,
7178 LeftoverRegs: ArrayRef(DstRegs).drop_front(N: NarrowParts));
7179
7180 MI.eraseFromParent();
7181 return Legalized;
7182}
7183
7184LegalizerHelper::LegalizeResult
7185LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
7186 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
7187
7188 LLT Ty = MRI.getType(Reg: DstReg);
7189 if (Ty.isVector())
7190 return UnableToLegalize;
7191
7192 unsigned Size = Ty.getSizeInBits();
7193 unsigned NarrowSize = NarrowTy.getSizeInBits();
7194 if (Size % NarrowSize != 0)
7195 return UnableToLegalize;
7196
7197 unsigned NumParts = Size / NarrowSize;
7198 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
7199 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
7200
7201 SmallVector<Register, 2> Src1Parts, Src2Parts;
7202 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
7203 extractParts(Reg: Src1, Ty: NarrowTy, NumParts, VRegs&: Src1Parts, MIRBuilder, MRI);
7204 extractParts(Reg: Src2, Ty: NarrowTy, NumParts, VRegs&: Src2Parts, MIRBuilder, MRI);
7205 multiplyRegisters(DstRegs&: DstTmpRegs, Src1Regs: Src1Parts, Src2Regs: Src2Parts, NarrowTy);
7206
7207 // Take only the high half of the partial products for a high multiply.
7208 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
7209 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
7210 MI.eraseFromParent();
7211 return Legalized;
7212}
7213
7214LegalizerHelper::LegalizeResult
7215LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
7216 LLT NarrowTy) {
7217 if (TypeIdx != 0)
7218 return UnableToLegalize;
7219
7220 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
7221
7222 Register Src = MI.getOperand(i: 1).getReg();
7223 LLT SrcTy = MRI.getType(Reg: Src);
7224
7225 // If all finite floats fit into the narrowed integer type, we can just swap
7226 // out the result type. This is practically only useful for conversions from
7227 // half to at least 16 bits, so just handle that one case.
7228 if (SrcTy.getScalarType() != LLT::scalar(SizeInBits: 16) ||
7229 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
7230 return UnableToLegalize;
7231
7232 Observer.changingInstr(MI);
7233 narrowScalarDst(MI, NarrowTy, OpIdx: 0,
7234 ExtOpcode: IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
7235 Observer.changedInstr(MI);
7236 return Legalized;
7237}
7238
7239LegalizerHelper::LegalizeResult
7240LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
7241 LLT NarrowTy) {
7242 if (TypeIdx != 1)
7243 return UnableToLegalize;
7244
7245 uint64_t NarrowSize = NarrowTy.getSizeInBits();
7246
7247 int64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7248 // FIXME: add support for when SizeOp1 isn't an exact multiple of
7249 // NarrowSize.
7250 if (SizeOp1 % NarrowSize != 0)
7251 return UnableToLegalize;
7252 int NumParts = SizeOp1 / NarrowSize;
7253
7254 SmallVector<Register, 2> SrcRegs, DstRegs;
7255 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
7256 MIRBuilder, MRI);
7257
7258 Register OpReg = MI.getOperand(i: 0).getReg();
7259 uint64_t OpStart = MI.getOperand(i: 2).getImm();
7260 uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
7261 for (int i = 0; i < NumParts; ++i) {
7262 unsigned SrcStart = i * NarrowSize;
7263
7264 if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
7265 // No part of the extract uses this subregister, ignore it.
7266 continue;
7267 } else if (SrcStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
7268 // The entire subregister is extracted, forward the value.
7269 DstRegs.push_back(Elt: SrcRegs[i]);
7270 continue;
7271 }
7272
7273 // Compute the overlap between the extracted range and this narrow part:
7274 // the offset within the part at which the segment starts, and its size.
7275 int64_t ExtractOffset;
7276 uint64_t SegSize;
7277 if (OpStart < SrcStart) {
7278 ExtractOffset = 0;
7279 SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - SrcStart);
7280 } else {
7281 ExtractOffset = OpStart - SrcStart;
7282 SegSize = std::min(a: SrcStart + NarrowSize - OpStart, b: OpSize);
7283 }
7284
7285 Register SegReg = SrcRegs[i];
7286 if (ExtractOffset != 0 || SegSize != NarrowSize) {
7287 // A genuine extract is needed.
7288 SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
7289 MIRBuilder.buildExtract(Res: SegReg, Src: SrcRegs[i], Index: ExtractOffset);
7290 }
7291
7292 DstRegs.push_back(Elt: SegReg);
7293 }
7294
7295 Register DstReg = MI.getOperand(i: 0).getReg();
7296 if (MRI.getType(Reg: DstReg).isVector())
7297 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
7298 else if (DstRegs.size() > 1)
7299 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
7300 else
7301 MIRBuilder.buildCopy(Res: DstReg, Op: DstRegs[0]);
7302 MI.eraseFromParent();
7303 return Legalized;
7304}
7305
7306LegalizerHelper::LegalizeResult
7307LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
7308 LLT NarrowTy) {
7309 // FIXME: Don't know how to handle secondary types yet.
7310 if (TypeIdx != 0)
7311 return UnableToLegalize;
7312
7313 SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
7314 LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
7315 LLT LeftoverTy;
7316 extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: SrcRegs,
7317 LeftoverVRegs&: LeftoverRegs, MIRBuilder, MRI);
7318
7319 SrcRegs.append(RHS: LeftoverRegs);
7320
7321 uint64_t NarrowSize = NarrowTy.getSizeInBits();
7322 Register OpReg = MI.getOperand(i: 2).getReg();
7323 uint64_t OpStart = MI.getOperand(i: 3).getImm();
7324 uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
7325 for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
7326 unsigned DstStart = I * NarrowSize;
7327
7328 if (DstStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
7329 // The entire subregister is defined by this insert, forward the new
7330 // value.
7331 DstRegs.push_back(Elt: OpReg);
7332 continue;
7333 }
7334
7335 Register SrcReg = SrcRegs[I];
7336 if (MRI.getType(Reg: SrcRegs[I]) == LeftoverTy) {
7337 // The leftover reg is smaller than NarrowTy, so we need to extend it.
7338 SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
7339 MIRBuilder.buildAnyExt(Res: SrcReg, Op: SrcRegs[I]);
7340 }
7341
7342 if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
7343 // No part of the insert affects this subregister, forward the original.
7344 DstRegs.push_back(Elt: SrcReg);
7345 continue;
7346 }
7347
7348 // Compute which bits of OpReg go into this destination part (ExtractOffset),
7349 // where they land within the part (InsertOffset), and their count (SegSize).
7350 int64_t ExtractOffset, InsertOffset;
7351 uint64_t SegSize;
7352 if (OpStart < DstStart) {
7353 InsertOffset = 0;
7354 ExtractOffset = DstStart - OpStart;
7355 SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - DstStart);
7356 } else {
7357 InsertOffset = OpStart - DstStart;
7358 ExtractOffset = 0;
7359 SegSize =
7360 std::min(a: NarrowSize - InsertOffset, b: OpStart + OpSize - DstStart);
7361 }
7362
7363 Register SegReg = OpReg;
7364 if (ExtractOffset != 0 || SegSize != OpSize) {
7365 // A genuine extract is needed.
7366 SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
7367 MIRBuilder.buildExtract(Res: SegReg, Src: OpReg, Index: ExtractOffset);
7368 }
7369
7370 Register DstReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
7371 MIRBuilder.buildInsert(Res: DstReg, Src: SrcReg, Op: SegReg, Index: InsertOffset);
7372 DstRegs.push_back(Elt: DstReg);
7373 }
7374
7375 uint64_t WideSize = DstRegs.size() * NarrowSize;
7376 Register DstReg = MI.getOperand(i: 0).getReg();
7377 if (WideSize > RegTy.getSizeInBits()) {
7378 Register MergeReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: WideSize));
7379 MIRBuilder.buildMergeLikeInstr(Res: MergeReg, Ops: DstRegs);
7380 MIRBuilder.buildTrunc(Res: DstReg, Op: MergeReg);
7381 } else
7382 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
7383
7384 MI.eraseFromParent();
7385 return Legalized;
7386}
7387
7388LegalizerHelper::LegalizeResult
7389LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
7390 LLT NarrowTy) {
7391 Register DstReg = MI.getOperand(i: 0).getReg();
7392 LLT DstTy = MRI.getType(Reg: DstReg);
7393
7394 assert(MI.getNumOperands() == 3 && TypeIdx == 0);
7395
7396 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
7397 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
7398 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
7399 LLT LeftoverTy;
7400 if (!extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
7401 VRegs&: Src0Regs, LeftoverVRegs&: Src0LeftoverRegs, MIRBuilder, MRI))
7402 return UnableToLegalize;
7403
7404 LLT Unused;
7405 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
7406 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
7407 llvm_unreachable("inconsistent extractParts result");
7408
7409 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
7410 auto Inst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
7411 SrcOps: {Src0Regs[I], Src1Regs[I]});
7412 DstRegs.push_back(Elt: Inst.getReg(Idx: 0));
7413 }
7414
7415 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
7416 auto Inst = MIRBuilder.buildInstr(
7417 Opc: MI.getOpcode(),
7418 DstOps: {LeftoverTy}, SrcOps: {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
7419 DstLeftoverRegs.push_back(Elt: Inst.getReg(Idx: 0));
7420 }
7421
7422 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
7423 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
7424
7425 MI.eraseFromParent();
7426 return Legalized;
7427}
7428
7429LegalizerHelper::LegalizeResult
7430LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
7431 LLT NarrowTy) {
7432 if (TypeIdx != 0)
7433 return UnableToLegalize;
7434
7435 auto [DstReg, SrcReg] = MI.getFirst2Regs();
7436
7437 LLT DstTy = MRI.getType(Reg: DstReg);
7438 if (DstTy.isVector())
7439 return UnableToLegalize;
7440
7441 SmallVector<Register, 8> Parts;
7442 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
7443 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, VRegs&: Parts, PadStrategy: MI.getOpcode());
7444 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: Parts);
7445
7446 MI.eraseFromParent();
7447 return Legalized;
7448}
7449
7450LegalizerHelper::LegalizeResult
7451LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
7452 LLT NarrowTy) {
7453 if (TypeIdx != 0)
7454 return UnableToLegalize;
7455
7456 Register CondReg = MI.getOperand(i: 1).getReg();
7457 LLT CondTy = MRI.getType(Reg: CondReg);
7458 if (CondTy.isVector()) // TODO: Handle vselect
7459 return UnableToLegalize;
7460
7461 Register DstReg = MI.getOperand(i: 0).getReg();
7462 LLT DstTy = MRI.getType(Reg: DstReg);
7463
7464 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
7465 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
7466 SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
7467 LLT LeftoverTy;
7468 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
7469 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
7470 return UnableToLegalize;
7471
7472 LLT Unused;
7473 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
7474 VRegs&: Src2Regs, LeftoverVRegs&: Src2LeftoverRegs, MIRBuilder, MRI))
7475 llvm_unreachable("inconsistent extractParts result");
7476
7477 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
7478 auto Select = MIRBuilder.buildSelect(Res: NarrowTy,
7479 Tst: CondReg, Op0: Src1Regs[I], Op1: Src2Regs[I]);
7480 DstRegs.push_back(Elt: Select.getReg(Idx: 0));
7481 }
7482
7483 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
7484 auto Select = MIRBuilder.buildSelect(
7485 Res: LeftoverTy, Tst: CondReg, Op0: Src1LeftoverRegs[I], Op1: Src2LeftoverRegs[I]);
7486 DstLeftoverRegs.push_back(Elt: Select.getReg(Idx: 0));
7487 }
7488
7489 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
7490 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
7491
7492 MI.eraseFromParent();
7493 return Legalized;
7494}
7495
7496LegalizerHelper::LegalizeResult
7497LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
7498 LLT NarrowTy) {
7499 if (TypeIdx != 1)
7500 return UnableToLegalize;
7501
7502 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7503 unsigned NarrowSize = NarrowTy.getSizeInBits();
7504
7505 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7506 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
7507
7508 MachineIRBuilder &B = MIRBuilder;
7509 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7510 // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
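// e.g. (illustrative) for s64 split into s32 halves, 0xFFFF has Hi == 0,
// so the result is 32 + ctlz32(0xFFFF) = 32 + 16 = 48.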
7511 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
7512 auto HiIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
7513 Op0: UnmergeSrc.getReg(Idx: 1), Op1: C_0);
7514 auto LoCTLZ = IsUndef ?
7515 B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0)) :
7516 B.buildCTLZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7517 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
7518 auto HiIsZeroCTLZ = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSize);
7519 auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7520 B.buildSelect(Res: DstReg, Tst: HiIsZero, Op0: HiIsZeroCTLZ, Op1: HiCTLZ);
7521
7522 MI.eraseFromParent();
7523 return Legalized;
7524 }
7525
7526 return UnableToLegalize;
7527}
7528
7529LegalizerHelper::LegalizeResult
7530LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
7531 LLT NarrowTy) {
7532 if (TypeIdx != 1)
7533 return UnableToLegalize;
7534
7535 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7536 unsigned NarrowSize = NarrowTy.getSizeInBits();
7537
7538 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7539 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
7540
7541 MachineIRBuilder &B = MIRBuilder;
7542 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7543 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
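// e.g. (illustrative) for s64 split into s32 halves, 0x100000000 has
// Lo == 0, so the result is cttz32(1) + 32 = 0 + 32 = 32.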
7544 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
7545 auto LoIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
7546 Op0: UnmergeSrc.getReg(Idx: 0), Op1: C_0);
7547 auto HiCTTZ = IsUndef ?
7548 B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1)) :
7549 B.buildCTTZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7550 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
7551 auto LoIsZeroCTTZ = B.buildAdd(Dst: DstTy, Src0: HiCTTZ, Src1: C_NarrowSize);
7552 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7553 B.buildSelect(Res: DstReg, Tst: LoIsZero, Op0: LoIsZeroCTTZ, Op1: LoCTTZ);
7554
7555 MI.eraseFromParent();
7556 return Legalized;
7557 }
7558
7559 return UnableToLegalize;
7560}
7561
7562LegalizerHelper::LegalizeResult
7563LegalizerHelper::narrowScalarCTLS(MachineInstr &MI, unsigned TypeIdx,
7564 LLT NarrowTy) {
7565 if (TypeIdx != 1)
7566 return UnableToLegalize;
7567
7568 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7569 unsigned NarrowSize = NarrowTy.getSizeInBits();
7570
7571 if (!SrcTy.isScalar() || SrcTy.getSizeInBits() != 2 * NarrowSize)
7572 return UnableToLegalize;
7573
7574 MachineIRBuilder &B = MIRBuilder;
7575
7576 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7577 Register Lo = UnmergeSrc.getReg(Idx: 0);
7578 Register Hi = UnmergeSrc.getReg(Idx: 1);
7579
7580 auto ShAmt = B.buildConstant(Res: NarrowTy, Val: NarrowSize - 1);
7581 auto Sign = B.buildAShr(Dst: NarrowTy, Src0: Hi, Src1: ShAmt);
7582
7583 auto LoSign = B.buildAShr(Dst: NarrowTy, Src0: Lo, Src1: ShAmt);
7584 auto LoSameSign = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
7585 Op0: LoSign.getReg(Idx: 0), Op1: Sign.getReg(Idx: 0));
7586
7587 auto HiIsSign =
7588 B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: Hi, Op1: Sign.getReg(Idx: 0));
7589
7590 auto LoCTLS = B.buildCTLS(Dst: DstTy, Src0: Lo);
7591 auto GNarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
7592 auto HiIsSignCTLS = B.buildAdd(Dst: DstTy, Src0: LoCTLS, Src1: GNarrowSize);
7593
7594 // If the low half flips sign, the run of redundant bits stops at the
7595 // boundary, so use (NarrowSize - 1) instead of extending into Lo.
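// Worked illustration for NarrowSize = 32: when Hi is pure sign extension
// (Hi == Sign) and Lo's top bit matches that sign, the count continues into
// the low half, giving 32 + ctls(Lo); if Lo's top bit differs, the run ends
// at the half boundary and the result is exactly 31.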
7596 auto GNarrowSizeMinus1 = B.buildConstant(Res: DstTy, Val: NarrowSize - 1);
7597 auto HiSignResult =
7598 B.buildSelect(Res: DstTy, Tst: LoSameSign, Op0: HiIsSignCTLS, Op1: GNarrowSizeMinus1);
7599
7600 auto HiCTLS = B.buildCTLS(Dst: DstTy, Src0: Hi);
7601
7602 B.buildSelect(Res: DstReg, Tst: HiIsSign, Op0: HiSignResult, Op1: HiCTLS);
7603
7604 MI.eraseFromParent();
7605 return Legalized;
7606}
7607
7608LegalizerHelper::LegalizeResult
7609LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
7610 LLT NarrowTy) {
7611 if (TypeIdx != 1)
7612 return UnableToLegalize;
7613
7614 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7615 unsigned NarrowSize = NarrowTy.getSizeInBits();
7616
7617 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7618 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
7619
7620 auto LoCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7621 auto HiCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7622 MIRBuilder.buildAdd(Dst: DstReg, Src0: HiCTPOP, Src1: LoCTPOP);
7623
7624 MI.eraseFromParent();
7625 return Legalized;
7626 }
7627
7628 return UnableToLegalize;
7629}
7630
7631LegalizerHelper::LegalizeResult
7632LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
7633 LLT NarrowTy) {
7634 if (TypeIdx != 1)
7635 return UnableToLegalize;
7636
7637 MachineIRBuilder &B = MIRBuilder;
7638 Register ExpReg = MI.getOperand(i: 2).getReg();
7639 LLT ExpTy = MRI.getType(Reg: ExpReg);
7640
7641 unsigned ClampSize = NarrowTy.getScalarSizeInBits();
7642
7643 // Clamp the exponent to the range of the target type.
7644 auto MinExp = B.buildConstant(Res: ExpTy, Val: minIntN(N: ClampSize));
7645 auto ClampMin = B.buildSMax(Dst: ExpTy, Src0: ExpReg, Src1: MinExp);
7646 auto MaxExp = B.buildConstant(Res: ExpTy, Val: maxIntN(N: ClampSize));
7647 auto Clamp = B.buildSMin(Dst: ExpTy, Src0: ClampMin, Src1: MaxExp);
7648
7649 auto Trunc = B.buildTrunc(Res: NarrowTy, Op: Clamp);
7650 Observer.changingInstr(MI);
7651 MI.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
7652 Observer.changedInstr(MI);
7653 return Legalized;
7654}
7655
7656LegalizerHelper::LegalizeResult
7657LegalizerHelper::lowerBitCount(MachineInstr &MI) {
7658 unsigned Opc = MI.getOpcode();
7659 const auto &TII = MIRBuilder.getTII();
7660 auto isSupported = [this](const LegalityQuery &Q) {
7661 auto QAction = LI.getAction(Query: Q).Action;
7662 return QAction == Legal || QAction == Libcall || QAction == Custom;
7663 };
7664 switch (Opc) {
7665 default:
7666 return UnableToLegalize;
7667 case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
7668 // This trivially expands to CTLZ.
7669 Observer.changingInstr(MI);
7670 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTLZ));
7671 Observer.changedInstr(MI);
7672 return Legalized;
7673 }
7674 case TargetOpcode::G_CTLZ: {
7675 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7676 unsigned Len = SrcTy.getScalarSizeInBits();
7677
7678 if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
7679 // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
7680 auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
7681 auto ZeroSrc = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
7682 auto ICmp = MIRBuilder.buildICmp(
7683 Pred: CmpInst::ICMP_EQ, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: ZeroSrc);
7684 auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
7685 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CtlzZU);
7686 MI.eraseFromParent();
7687 return Legalized;
7688 }
7689 // For now, we do this:
7690 // NewLen = NextPowerOf2(Len);
7691 // x = x | (x >> 1);
7692 // x = x | (x >> 2);
7693 // ...
7694 // x = x | (x >> 16);
7695 // x = x | (x >> 32); // for 64-bit input
7696 // with shift amounts up to NewLen/2, then
7697 // return Len - popcount(x);
7698 //
7699 // Ref: "Hacker's Delight" by Henry Warren
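// Worked example (illustrative), Len = 16, x = 0x00F0: the or-cascade smears
// the leading one downward, giving x = 0x00FF, so popcount(x) = 8 and the
// result is 16 - 8 = 8, matching ctlz(0x00F0).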
7700 Register Op = SrcReg;
7701 unsigned NewLen = PowerOf2Ceil(A: Len);
7702 for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
7703 auto MIBShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << i);
7704 auto MIBOp = MIRBuilder.buildOr(
7705 Dst: SrcTy, Src0: Op, Src1: MIRBuilder.buildLShr(Dst: SrcTy, Src0: Op, Src1: MIBShiftAmt));
7706 Op = MIBOp.getReg(Idx: 0);
7707 }
7708 auto MIBPop = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: Op);
7709 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIRBuilder.buildConstant(Res: DstTy, Val: Len),
7710 Src1: MIBPop);
7711 MI.eraseFromParent();
7712 return Legalized;
7713 }
7714 case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
7715 // This trivially expands to CTTZ.
7716 Observer.changingInstr(MI);
7717 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTTZ));
7718 Observer.changedInstr(MI);
7719 return Legalized;
7720 }
7721 case TargetOpcode::G_CTTZ: {
7722 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7723
7724 unsigned Len = SrcTy.getScalarSizeInBits();
7725 if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
7726 // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
7727 // zero.
7728 auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
7729 auto Zero = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
7730 auto ICmp = MIRBuilder.buildICmp(
7731 Pred: CmpInst::ICMP_EQ, Res: DstTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: Zero);
7732 auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
7733 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CttzZU);
7734 MI.eraseFromParent();
7735 return Legalized;
7736 }
7737 // For now, we use: { return popcount(~x & (x - 1)); }
7738 // unless the target has ctlz but not ctpop, in which case we use:
7739 // { return Len - ctlz(~x & (x - 1)); }
7740 // Ref: "Hacker's Delight" by Henry Warren
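// Worked example (illustrative), x = 0b101000: x - 1 = 0b100111 and
// ~x = 0b010111, so ~x & (x - 1) = 0b000111, whose popcount is 3,
// exactly the number of trailing zeros in x.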
7741 auto MIBCstNeg1 = MIRBuilder.buildConstant(Res: SrcTy, Val: -1);
7742 auto MIBNot = MIRBuilder.buildXor(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1);
7743 auto MIBTmp = MIRBuilder.buildAnd(
7744 Dst: SrcTy, Src0: MIBNot, Src1: MIRBuilder.buildAdd(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1));
7745 if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
7746 isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
7747 auto MIBCstLen = MIRBuilder.buildConstant(Res: SrcTy, Val: Len);
7748 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIBCstLen,
7749 Src1: MIRBuilder.buildCTLZ(Dst: SrcTy, Src0: MIBTmp));
7750 MI.eraseFromParent();
7751 return Legalized;
7752 }
7753 Observer.changingInstr(MI);
7754 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTPOP));
7755 MI.getOperand(i: 1).setReg(MIBTmp.getReg(Idx: 0));
7756 Observer.changedInstr(MI);
7757 return Legalized;
7758 }
7759 case TargetOpcode::G_CTPOP: {
7760 Register SrcReg = MI.getOperand(i: 1).getReg();
7761 LLT Ty = MRI.getType(Reg: SrcReg);
7762 unsigned Size = Ty.getScalarSizeInBits();
7763 MachineIRBuilder &B = MIRBuilder;
7764
7765 // Bail out on irregular type lengths.
7766 if (Size > 128 || Size % 8 != 0)
7767 return UnableToLegalize;
7768
7769 // Count set bits in blocks of 2 bits. The default approach would be
7770 // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
7771 // We use the following formula instead:
7772 // B2Count = val - { (val >> 1) & 0x55555555 }
7773 // since it gives the same result per 2-bit block with one instruction fewer.
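// Worked example (illustrative), 8-bit val = 0b11011010 (five bits set):
// B2Count = 0b11011010 - 0b01000101 = 0b10010101, i.e. per-pair counts
// 2,1,1,1; B4Count = 0b00010001 + 0b00100001 = 0b00110010 (nibble counts
// 3 and 2); B8Count = (0b00110010 + 0b00000011) & 0x0F = 5.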
7774 auto C_1 = B.buildConstant(Res: Ty, Val: 1);
7775 auto B2Set1LoTo1Hi = B.buildLShr(Dst: Ty, Src0: SrcReg, Src1: C_1);
7776 APInt B2Mask1HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x55));
7777 auto C_B2Mask1HiTo0 = B.buildConstant(Res: Ty, Val: B2Mask1HiTo0);
7778 auto B2Count1Hi = B.buildAnd(Dst: Ty, Src0: B2Set1LoTo1Hi, Src1: C_B2Mask1HiTo0);
7779 auto B2Count = B.buildSub(Dst: Ty, Src0: SrcReg, Src1: B2Count1Hi);
7780
7781 // To get the count in blocks of 4, add the values from adjacent 2-bit blocks.
7782 // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
7783 auto C_2 = B.buildConstant(Res: Ty, Val: 2);
7784 auto B4Set2LoTo2Hi = B.buildLShr(Dst: Ty, Src0: B2Count, Src1: C_2);
7785 APInt B4Mask2HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x33));
7786 auto C_B4Mask2HiTo0 = B.buildConstant(Res: Ty, Val: B4Mask2HiTo0);
7787 auto B4HiB2Count = B.buildAnd(Dst: Ty, Src0: B4Set2LoTo2Hi, Src1: C_B4Mask2HiTo0);
7788 auto B4LoB2Count = B.buildAnd(Dst: Ty, Src0: B2Count, Src1: C_B4Mask2HiTo0);
7789 auto B4Count = B.buildAdd(Dst: Ty, Src0: B4HiB2Count, Src1: B4LoB2Count);
7790
7791 // For the count in blocks of 8 bits we don't have to mask the high 4 bits
7792 // before the addition, since each count sits in the range {0,...,8} and 4
7793 // bits are enough to hold it. After the addition the high 4 bits still hold
7794 // the count of the high 4-bit block; clear them to get the 8-bit result.
7795 // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
7796 auto C_4 = B.buildConstant(Res: Ty, Val: 4);
7797 auto B8HiB4Count = B.buildLShr(Dst: Ty, Src0: B4Count, Src1: C_4);
7798 auto B8CountDirty4Hi = B.buildAdd(Dst: Ty, Src0: B8HiB4Count, Src1: B4Count);
7799 APInt B8Mask4HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x0F));
7800 auto C_B8Mask4HiTo0 = B.buildConstant(Res: Ty, Val: B8Mask4HiTo0);
7801 auto B8Count = B.buildAnd(Dst: Ty, Src0: B8CountDirty4Hi, Src1: C_B8Mask4HiTo0);
7802
7803 assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
7804 // 8 bits can hold the CTPOP result of a 128-bit integer or smaller. A mul
7805 // by this bitmask sums all the 8-bit block counts into the 8 MSBs of ResTmp.
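// Worked example (illustrative), Size = 32: B8Count * 0x01010101 places
// b0 + b1 + b2 + b3 in the top byte, and the shift by Size - 8 = 24 below
// moves that sum down to the low bits.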
7806 auto MulMask = B.buildConstant(Res: Ty, Val: APInt::getSplat(NewLen: Size, V: APInt(8, 0x01)));
7807
7808 // Shift the count result from the high 8 bits down to the low bits.
7809 auto C_SizeM8 = B.buildConstant(Res: Ty, Val: Size - 8);
7810
7811 auto IsMulSupported = [this](const LLT Ty) {
7812 auto Action = LI.getAction(Query: {TargetOpcode::G_MUL, {Ty}}).Action;
7813 return Action == Legal || Action == WidenScalar || Action == Custom;
7814 };
7815 if (IsMulSupported(Ty)) {
7816 auto ResTmp = B.buildMul(Dst: Ty, Src0: B8Count, Src1: MulMask);
7817 B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
7818 } else {
7819 auto ResTmp = B8Count;
7820 for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
7821 auto ShiftC = B.buildConstant(Res: Ty, Val: Shift);
7822 auto Shl = B.buildShl(Dst: Ty, Src0: ResTmp, Src1: ShiftC);
7823 ResTmp = B.buildAdd(Dst: Ty, Src0: ResTmp, Src1: Shl);
7824 }
7825 B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
7826 }
7827 MI.eraseFromParent();
7828 return Legalized;
7829 }
7830 case TargetOpcode::G_CTLS: {
7831 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7832
7833 // ctls(x) -> ctlz(x ^ (x >> (N - 1))) - 1
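// e.g. (illustrative) for s8 x = 0b11110101: x >> 7 (arithmetic) is all
// ones, x ^ that = 0b00001010, ctlz = 4, so ctls(x) = 3 redundant sign bits.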
7834 auto SignIdxC =
7835 MIRBuilder.buildConstant(Res: SrcTy, Val: SrcTy.getScalarSizeInBits() - 1);
7836 auto OneC = MIRBuilder.buildConstant(Res: DstTy, Val: 1);
7837
7838 auto Shr = MIRBuilder.buildAShr(Dst: SrcTy, Src0: SrcReg, Src1: SignIdxC);
7839
7840 auto Xor = MIRBuilder.buildXor(Dst: SrcTy, Src0: SrcReg, Src1: Shr);
7841 auto Ctlz = MIRBuilder.buildCTLZ(Dst: DstTy, Src0: Xor);
7842
7843 MIRBuilder.buildSub(Dst: DstReg, Src0: Ctlz, Src1: OneC);
7844 MI.eraseFromParent();
7845 return Legalized;
7846 }
7847 }
7848}
7849
7850// Check that (every element of) Reg is undef or not an exact multiple of BW.
7851static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7852 Register Reg, unsigned BW) {
7853 return matchUnaryPredicate(
7854 MRI, Reg,
7855 Match: [=](const Constant *C) {
7856 // Null constant here means an undef.
7857 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Val: C);
7858 return !CI || CI->getValue().urem(RHS: BW) != 0;
7859 },
7860 /*AllowUndefs*/ true);
7861}
7862
7863LegalizerHelper::LegalizeResult
7864LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7865 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7866 LLT Ty = MRI.getType(Reg: Dst);
7867 LLT ShTy = MRI.getType(Reg: Z);
7868
7869 unsigned BW = Ty.getScalarSizeInBits();
7870
7871 if (!isPowerOf2_32(Value: BW))
7872 return UnableToLegalize;
7873
7874 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7875 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7876
7877 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7878 // fshl X, Y, Z -> fshr X, Y, -Z
7879 // fshr X, Y, Z -> fshl X, Y, -Z
7880 auto Zero = MIRBuilder.buildConstant(Res: ShTy, Val: 0);
7881 Z = MIRBuilder.buildSub(Dst: ShTy, Src0: Zero, Src1: Z).getReg(Idx: 0);
7882 } else {
7883 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7884 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
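// Sanity check (illustrative), BW = 8, Z = 0: ~Z & 7 = 7, and applying fshr
// by 7 to the pre-shifted pair, i.e. ((X:Y) >> 1) >> 7 = (X:Y) >> 8, gives
// back X, which matches fshl(X, Y, 0) = X.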
7885 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7886 if (IsFSHL) {
7887 Y = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7888 X = MIRBuilder.buildLShr(Dst: Ty, Src0: X, Src1: One).getReg(Idx: 0);
7889 } else {
7890 X = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7891 Y = MIRBuilder.buildShl(Dst: Ty, Src0: Y, Src1: One).getReg(Idx: 0);
7892 }
7893
7894 Z = MIRBuilder.buildNot(Dst: ShTy, Src0: Z).getReg(Idx: 0);
7895 }
7896
7897 MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Dst}, SrcOps: {X, Y, Z});
7898 MI.eraseFromParent();
7899 return Legalized;
7900}
7901
7902LegalizerHelper::LegalizeResult
7903LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
7904 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7905 LLT Ty = MRI.getType(Reg: Dst);
7906 LLT ShTy = MRI.getType(Reg: Z);
7907
7908 const unsigned BW = Ty.getScalarSizeInBits();
7909 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7910
7911 Register ShX, ShY;
7912 Register ShAmt, InvShAmt;
7913
7914 // FIXME: Emit optimized urem by constant instead of letting it expand later.
7915 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7916 // fshl: X << C | Y >> (BW - C)
7917 // fshr: X << (BW - C) | Y >> C
7918 // where C = Z % BW is not zero
7919 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
7920 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
7921 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: BitWidthC, Src1: ShAmt).getReg(Idx: 0);
7922 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: IsFSHL ? ShAmt : InvShAmt).getReg(Idx: 0);
7923 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: IsFSHL ? InvShAmt : ShAmt).getReg(Idx: 0);
7924 } else {
7925 // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
7926 // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
7927 auto Mask = MIRBuilder.buildConstant(Res: ShTy, Val: BW - 1);
7928 if (isPowerOf2_32(Value: BW)) {
7929 // Z % BW -> Z & (BW - 1)
7930 ShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: Z, Src1: Mask).getReg(Idx: 0);
7931 // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
7932 auto NotZ = MIRBuilder.buildNot(Dst: ShTy, Src0: Z);
7933 InvShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: NotZ, Src1: Mask).getReg(Idx: 0);
7934 } else {
7935 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
7936 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
7937 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: Mask, Src1: ShAmt).getReg(Idx: 0);
7938 }
7939
7940 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7941 if (IsFSHL) {
7942 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: ShAmt).getReg(Idx: 0);
7943 auto ShY1 = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: One);
7944 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: ShY1, Src1: InvShAmt).getReg(Idx: 0);
7945 } else {
7946 auto ShX1 = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: One);
7947 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: ShX1, Src1: InvShAmt).getReg(Idx: 0);
7948 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: ShAmt).getReg(Idx: 0);
7949 }
7950 }
7951
7952 MIRBuilder.buildOr(Dst, Src0: ShX, Src1: ShY, Flags: MachineInstr::Disjoint);
7953 MI.eraseFromParent();
7954 return Legalized;
7955}
7956
7957LegalizerHelper::LegalizeResult
7958LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
7959 // These operations approximately do the following (while avoiding undefined
7960 // shifts by BW):
7961 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
7962 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
7963 Register Dst = MI.getOperand(i: 0).getReg();
7964 LLT Ty = MRI.getType(Reg: Dst);
7965 LLT ShTy = MRI.getType(Reg: MI.getOperand(i: 3).getReg());
7966
7967 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7968 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7969
7970 // TODO: Use smarter heuristic that accounts for vector legalization.
7971 if (LI.getAction(Query: {RevOpcode, {Ty, ShTy}}).Action == Lower)
7972 return lowerFunnelShiftAsShifts(MI);
7973
7974 // This only works for powers of 2; fall back to shifts if it fails.
7975 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
7976 if (Result == UnableToLegalize)
7977 return lowerFunnelShiftAsShifts(MI);
7978 return Result;
7979}
7980
7981LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
7982 auto [Dst, Src] = MI.getFirst2Regs();
7983 LLT DstTy = MRI.getType(Reg: Dst);
7984 LLT SrcTy = MRI.getType(Reg: Src);
7985
7986 uint32_t DstTySize = DstTy.getSizeInBits();
7987 uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
7988 uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
7989
7990 if (!isPowerOf2_32(Value: DstTySize) || !isPowerOf2_32(Value: DstTyScalarSize) ||
7991 !isPowerOf2_32(Value: SrcTyScalarSize))
7992 return UnableToLegalize;
7993
7994 // The step between the extends is too large; split it by creating an
7995 // intermediate extend instruction.
7996 if (SrcTyScalarSize * 2 < DstTyScalarSize) {
7997 LLT MidTy = SrcTy.changeElementSize(NewEltSize: SrcTyScalarSize * 2);
7998 // If the destination type is illegal, split it into multiple statements:
7999 // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
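// For example (illustrative): G_ZEXT %src(<8 x s8>) to <8 x s32> first
// extends to the intermediate <8 x s16>, unmerges that into two <4 x s16>
// halves, extends each to <4 x s32>, and merges the results.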
8000 auto NewExt = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {MidTy}, SrcOps: {Src});
8001 // Unmerge the vector
8002 LLT EltTy = MidTy.changeElementCount(
8003 EC: MidTy.getElementCount().divideCoefficientBy(RHS: 2));
8004 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: NewExt);
8005
8006 // Extend each half with the original opcode.
8007 LLT ZExtResTy = DstTy.changeElementCount(
8008 EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
8009 auto ZExtRes1 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
8010 SrcOps: {UnmergeSrc.getReg(Idx: 0)});
8011 auto ZExtRes2 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
8012 SrcOps: {UnmergeSrc.getReg(Idx: 1)});
8013
8014 // Merge the resulting vectors.
8015 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: {ZExtRes1, ZExtRes2});
8016
8017 MI.eraseFromParent();
8018 return Legalized;
8019 }
8020 return UnableToLegalize;
8021}
8022
8023LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
8025 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
8026 // Similar to how operand splitting is done in SelectionDAG, we can handle
8027 // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
8028 // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
8029 // %lo16(<4 x s16>) = G_TRUNC %inlo
8030 // %hi16(<4 x s16>) = G_TRUNC %inhi
8031 // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
8032 // %res(<8 x s8>) = G_TRUNC %in16
8033
8034 assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
8035
8036 Register DstReg = MI.getOperand(i: 0).getReg();
8037 Register SrcReg = MI.getOperand(i: 1).getReg();
8038 LLT DstTy = MRI.getType(Reg: DstReg);
8039 LLT SrcTy = MRI.getType(Reg: SrcReg);
8040
8041 if (DstTy.isVector() && isPowerOf2_32(Value: DstTy.getNumElements()) &&
8042 isPowerOf2_32(Value: DstTy.getScalarSizeInBits()) &&
8043 isPowerOf2_32(Value: SrcTy.getNumElements()) &&
8044 isPowerOf2_32(Value: SrcTy.getScalarSizeInBits())) {
8045 // Split input type.
8046 LLT SplitSrcTy = SrcTy.changeElementCount(
8047 EC: SrcTy.getElementCount().divideCoefficientBy(RHS: 2));
8048
8049 // First, split the source into two smaller vectors.
8050 SmallVector<Register, 2> SplitSrcs;
8051 extractParts(Reg: SrcReg, Ty: SplitSrcTy, NumParts: 2, VRegs&: SplitSrcs, MIRBuilder, MRI);
8052
8053 // Truncate the splits into intermediate narrower elements.
8054 LLT InterTy;
8055 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
8056 InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits() * 2);
8057 else
8058 InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits());
8059 for (Register &Src : SplitSrcs)
8060 Src = MIRBuilder.buildTrunc(Res: InterTy, Op: Src).getReg(Idx: 0);
8061
8062 // Combine the new truncates into one vector
8063 auto Merge = MIRBuilder.buildMergeLikeInstr(
8064 Res: DstTy.changeElementSize(NewEltSize: InterTy.getScalarSizeInBits()), Ops: SplitSrcs);
8065
8066 // Truncate the new vector to the final result type
8067 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
8068 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
8069 else
8070 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
8071
8072 MI.eraseFromParent();
8073
8074 return Legalized;
8075 }
8076 return UnableToLegalize;
8077}
8078
8079LegalizerHelper::LegalizeResult
8080LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
8081 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
8082 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
8083 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
8084 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
8085 auto Neg = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
8086 MIRBuilder.buildInstr(Opc: RevRot, DstOps: {Dst}, SrcOps: {Src, Neg});
8087 MI.eraseFromParent();
8088 return Legalized;
8089}
8090
8091LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
8092 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
8093
8094 unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
8095 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
8096
8097 MIRBuilder.setInstrAndDebugLoc(MI);
8098
8099 // If a rotate in the other direction is supported, use it.
8100 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
8101 if (LI.isLegalOrCustom(Query: {RevRot, {DstTy, SrcTy}}) &&
8102 isPowerOf2_32(Value: EltSizeInBits))
8103 return lowerRotateWithReverseRotate(MI);
8104
8105 // If a funnel shift is supported, use it.
8106 unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
8107 unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
8108 bool IsFShLegal = false;
8109 if ((IsFShLegal = LI.isLegalOrCustom(Query: {FShOpc, {DstTy, AmtTy}})) ||
8110 LI.isLegalOrCustom(Query: {RevFsh, {DstTy, AmtTy}})) {
8111 auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
8112 Register R3) {
8113 MIRBuilder.buildInstr(Opc, DstOps: {R1}, SrcOps: {R2, R2, R3});
8114 MI.eraseFromParent();
8115 return Legalized;
8116 };
8117 // Prefer the same-direction funnel shift; otherwise negate the amount
8118 if (IsFShLegal) {
8119 return buildFunnelShift(FShOpc, Dst, Src, Amt);
8120 } else if (isPowerOf2_32(Value: EltSizeInBits)) {
8121 Amt = MIRBuilder.buildNeg(Dst: AmtTy, Src0: Amt).getReg(Idx: 0);
8122 return buildFunnelShift(RevFsh, Dst, Src, Amt);
8123 }
8124 }
8125
8126 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
8127 unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
8128 unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
8129 auto BitWidthMinusOneC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits - 1);
8130 Register ShVal;
8131 Register RevShiftVal;
8132 if (isPowerOf2_32(Value: EltSizeInBits)) {
8133 // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
8134 // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
8135 auto NegAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
8136 auto ShAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: Amt, Src1: BitWidthMinusOneC);
8137 ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
8138 auto RevAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: NegAmt, Src1: BitWidthMinusOneC);
8139 RevShiftVal =
8140 MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, RevAmt}).getReg(Idx: 0);
8141 } else {
8142 // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
8143 // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
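// e.g. (illustrative) for w = 24 and c % w == 0: rotl yields
// x << 0 | ((x >> 1) >> 23) = x, and no single shift amount ever reaches
// the (possibly undefined) full width w.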
8144 auto BitWidthC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits);
8145 auto ShAmt = MIRBuilder.buildURem(Dst: AmtTy, Src0: Amt, Src1: BitWidthC);
8146 ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
8147 auto RevAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: BitWidthMinusOneC, Src1: ShAmt);
8148 auto One = MIRBuilder.buildConstant(Res: AmtTy, Val: 1);
8149 auto Inner = MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, One});
8150 RevShiftVal =
8151 MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Inner, RevAmt}).getReg(Idx: 0);
8152 }
8153 MIRBuilder.buildOr(Dst, Src0: ShVal, Src1: RevShiftVal, Flags: MachineInstr::Disjoint);
8154 MI.eraseFromParent();
8155 return Legalized;
8156}
8157
8158// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
8159// representation.
8160LegalizerHelper::LegalizeResult
8161LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
8162 auto [Dst, Src] = MI.getFirst2Regs();
8163 const LLT S64 = LLT::scalar(SizeInBits: 64);
8164 const LLT S32 = LLT::scalar(SizeInBits: 32);
8165 const LLT S1 = LLT::scalar(SizeInBits: 1);
8166
8167 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
8168
8169 // unsigned cul2f(ulong u) {
8170 // uint lz = clz(u);
8171 // uint e = (u != 0) ? 127U + 63U - lz : 0;
8172 // u = (u << lz) & 0x7fffffffffffffffUL;
8173 // ulong t = u & 0xffffffffffUL;
8174 // uint v = (e << 23) | (uint)(u >> 40);
8175 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
8176 // return as_float(v + r);
8177 // }
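// Sanity check (illustrative), u = 1: lz = 63 and e = 127 + 63 - 63 = 127;
// the shifted-and-masked u is 0, so t = 0 and r = 0, leaving
// v = 127 << 23 = 0x3f800000, which is exactly 1.0f.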
8178
8179 auto Zero32 = MIRBuilder.buildConstant(Res: S32, Val: 0);
8180 auto Zero64 = MIRBuilder.buildConstant(Res: S64, Val: 0);
8181
8182 auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: S32, Src0: Src);
8183
8184 auto K = MIRBuilder.buildConstant(Res: S32, Val: 127U + 63U);
8185 auto Sub = MIRBuilder.buildSub(Dst: S32, Src0: K, Src1: LZ);
8186
8187 auto NotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: Src, Op1: Zero64);
8188 auto E = MIRBuilder.buildSelect(Res: S32, Tst: NotZero, Op0: Sub, Op1: Zero32);
8189
8190 auto Mask0 = MIRBuilder.buildConstant(Res: S64, Val: (-1ULL) >> 1);
8191 auto ShlLZ = MIRBuilder.buildShl(Dst: S64, Src0: Src, Src1: LZ);
8192
8193 auto U = MIRBuilder.buildAnd(Dst: S64, Src0: ShlLZ, Src1: Mask0);
8194
8195 auto Mask1 = MIRBuilder.buildConstant(Res: S64, Val: 0xffffffffffULL);
8196 auto T = MIRBuilder.buildAnd(Dst: S64, Src0: U, Src1: Mask1);
8197
8198 auto UShl = MIRBuilder.buildLShr(Dst: S64, Src0: U, Src1: MIRBuilder.buildConstant(Res: S64, Val: 40));
8199 auto ShlE = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 23));
8200 auto V = MIRBuilder.buildOr(Dst: S32, Src0: ShlE, Src1: MIRBuilder.buildTrunc(Res: S32, Op: UShl));
8201
8202 auto C = MIRBuilder.buildConstant(Res: S64, Val: 0x8000000000ULL);
8203 auto RCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: S1, Op0: T, Op1: C);
8204 auto TCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: T, Op1: C);
8205 auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
8206
8207 auto VTrunc1 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: One);
8208 auto Select0 = MIRBuilder.buildSelect(Res: S32, Tst: TCmp, Op0: VTrunc1, Op1: Zero32);
8209 auto R = MIRBuilder.buildSelect(Res: S32, Tst: RCmp, Op0: One, Op1: Select0);
8210 MIRBuilder.buildAdd(Dst, Src0: V, Src1: R);
8211
8212 MI.eraseFromParent();
8213 return Legalized;
8214}
8215
8216// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
8217// operations and G_SITOFP
8218LegalizerHelper::LegalizeResult
8219LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
8220 auto [Dst, Src] = MI.getFirst2Regs();
8221 const LLT S64 = LLT::scalar(SizeInBits: 64);
8222 const LLT S32 = LLT::scalar(SizeInBits: 32);
8223 const LLT S1 = LLT::scalar(SizeInBits: 1);
8224
8225 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
8226
8227 // For i64 values up to INT64_MAX we simply reuse SITOFP.
8228 // Otherwise, divide the i64 by 2, round the result by ORing in the lowest
8229 // bit saved before the division, convert to float with SITOFP, and
8230 // multiply the result by 2.
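// The ORed-in low bit acts as a sticky bit: it keeps the halved value odd
// whenever a one was shifted out, so the single rounding done by G_SITOFP
// matches what a correctly rounded direct u64-to-f32 conversion would give.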
8231 auto One = MIRBuilder.buildConstant(Res: S64, Val: 1);
8232 auto Zero = MIRBuilder.buildConstant(Res: S64, Val: 0);
8233 // Result if Src <= INT64_MAX
8234 auto SmallResult = MIRBuilder.buildSITOFP(Dst: S32, Src0: Src);
8235 // Result if Src > INT64_MAX
8236 auto Halved = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: One);
8237 auto LowerBit = MIRBuilder.buildAnd(Dst: S64, Src0: Src, Src1: One);
8238 auto RoundedHalved = MIRBuilder.buildOr(Dst: S64, Src0: Halved, Src1: LowerBit);
8239 auto HalvedFP = MIRBuilder.buildSITOFP(Dst: S32, Src0: RoundedHalved);
8240 auto LargeResult = MIRBuilder.buildFAdd(Dst: S32, Src0: HalvedFP, Src1: HalvedFP);
8241 // Check whether the original value exceeds INT64_MAX (i.e. is negative
8242 // when viewed as signed) by comparing with zero, and pick a conversion.
8243 auto IsLarge =
8244 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_SLT, Res: S1, Op0: Src, Op1: Zero);
8245 MIRBuilder.buildSelect(Res: Dst, Tst: IsLarge, Op0: LargeResult, Op1: SmallResult);
8246
8247 MI.eraseFromParent();
8248 return Legalized;
8249}
8250
8251// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
8252// IEEE double representation.
8253LegalizerHelper::LegalizeResult
8254LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
8255 auto [Dst, Src] = MI.getFirst2Regs();
8256 const LLT S64 = LLT::scalar(SizeInBits: 64);
8257 const LLT S32 = LLT::scalar(SizeInBits: 32);
8258
8259 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
8260
8261 // We create the double value from two 32-bit parts whose exponents differ
8262 // by 32. Note that + and - are float operations that adjust the implicit
8263 // leading one; the bases 2^52 and 2^84 are for illustrative purposes.
8264 //
8265 // X = 2^52 * 1.0...LowBits
8266 // Y = 2^84 * 1.0...HighBits
8267 // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
8268 // = - 2^52 * 1.0...HighBits
8269 // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
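// Spelled out (illustrative): LowBitsFP is exactly 2^52 + Lo, HighBitsFP is
// exactly 2^84 + Hi * 2^32, and TwoP52P84FP is 2^84 + 2^52. The FSub is then
// exact (Scratch = Hi * 2^32 - 2^52 needs at most 32 significant bits), so
// only the final FAdd producing Hi * 2^32 + Lo rounds, and it rounds once.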
8270 auto TwoP52 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4330000000000000));
8271 auto TwoP84 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4530000000000000));
8272 auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
8273 auto TwoP52P84FP = MIRBuilder.buildFConstant(Res: S64, Val: TwoP52P84);
8274 auto HalfWidth = MIRBuilder.buildConstant(Res: S64, Val: 32);
8275
8276 auto LowBits = MIRBuilder.buildTrunc(Res: S32, Op: Src);
8277 LowBits = MIRBuilder.buildZExt(Res: S64, Op: LowBits);
8278 auto LowBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP52, Src1: LowBits);
8279 auto HighBits = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: HalfWidth);
8280 auto HighBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP84, Src1: HighBits);
8281 auto Scratch = MIRBuilder.buildFSub(Dst: S64, Src0: HighBitsFP, Src1: TwoP52P84FP);
8282 MIRBuilder.buildFAdd(Dst, Src0: Scratch, Src1: LowBitsFP);
8283
8284 MI.eraseFromParent();
8285 return Legalized;
8286}
8287
8288 /// i64->fp16 itofp can be lowered to i64->f64, f64->f32, f32->f16. We cannot
8289 /// in general fpround f64->f16 without double rounding, so we manually
8290 /// perform the lowering here, where we know it is valid.
8291static LegalizerHelper::LegalizeResult
8292loweri64tof16ITOFP(MachineInstr &MI, Register Dst, LLT DstTy, Register Src,
8293 LLT SrcTy, MachineIRBuilder &MIRBuilder) {
8294 auto M1 = MI.getOpcode() == TargetOpcode::G_UITOFP
8295 ? MIRBuilder.buildUITOFP(Dst: SrcTy, Src0: Src)
8296 : MIRBuilder.buildSITOFP(Dst: SrcTy, Src0: Src);
8297 LLT S32Ty = SrcTy.changeElementSize(NewEltSize: 32);
8298 auto M2 = MIRBuilder.buildFPTrunc(Res: S32Ty, Op: M1);
8299 MIRBuilder.buildFPTrunc(Res: Dst, Op: M2);
8300 MI.eraseFromParent();
8301 return LegalizerHelper::Legalized;
8302}
8303
8304LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
8305 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8306
8307 if (SrcTy == LLT::scalar(SizeInBits: 1)) {
8308 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: 1.0);
8309 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
8310 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
8311 MI.eraseFromParent();
8312 return Legalized;
8313 }
8314
8315 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
8316 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
8317
8318 if (SrcTy != LLT::scalar(SizeInBits: 64))
8319 return UnableToLegalize;
8320
8321 if (DstTy == LLT::scalar(SizeInBits: 32))
8322 // TODO: SelectionDAG has several alternative expansions to port which may
8323 // be more reasonable depending on the available instructions. We also need
8324 // a more advanced mechanism to choose an optimal version depending on
8325 // target features such as sitofp or CTLZ availability.
8326 return lowerU64ToF32WithSITOFP(MI);
8327
8328 if (DstTy == LLT::scalar(SizeInBits: 64))
8329 return lowerU64ToF64BitFloatOps(MI);
8330
8331 return UnableToLegalize;
8332}
8333
8334LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
8335 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8336
8337 const LLT S64 = LLT::scalar(SizeInBits: 64);
8338 const LLT S32 = LLT::scalar(SizeInBits: 32);
8339 const LLT S1 = LLT::scalar(SizeInBits: 1);
8340
8341 if (SrcTy == S1) {
8342 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: -1.0);
8343 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
8344 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
8345 MI.eraseFromParent();
8346 return Legalized;
8347 }
8348
8349 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
8350 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
8351
8352 if (SrcTy != S64)
8353 return UnableToLegalize;
8354
8355 if (DstTy == S32) {
8356 // signed cl2f(long l) {
8357 // long s = l >> 63;
8358 // float r = cul2f((l + s) ^ s);
8359 // return s ? -r : r;
8360 // }
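// Sanity check (illustrative), l = -5: s = -1, (l + s) ^ s = ~(-6) = 5, and
// since s != 0 the result is -cul2f(5) = -5.0f.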
8361 Register L = Src;
8362 auto SignBit = MIRBuilder.buildConstant(Res: S64, Val: 63);
8363 auto S = MIRBuilder.buildAShr(Dst: S64, Src0: L, Src1: SignBit);
8364
8365 auto LPlusS = MIRBuilder.buildAdd(Dst: S64, Src0: L, Src1: S);
8366 auto Xor = MIRBuilder.buildXor(Dst: S64, Src0: LPlusS, Src1: S);
8367 auto R = MIRBuilder.buildUITOFP(Dst: S32, Src0: Xor);
8368
8369 auto RNeg = MIRBuilder.buildFNeg(Dst: S32, Src0: R);
8370 auto SignNotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: S,
8371 Op1: MIRBuilder.buildConstant(Res: S64, Val: 0));
8372 MIRBuilder.buildSelect(Res: Dst, Tst: SignNotZero, Op0: RNeg, Op1: R);
8373 MI.eraseFromParent();
8374 return Legalized;
8375 }
8376
8377 return UnableToLegalize;
8378}
8379
8380LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
8381 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8382 const LLT S64 = LLT::scalar(SizeInBits: 64);
8383 const LLT S32 = LLT::scalar(SizeInBits: 32);
8384
8385 if (SrcTy != S64 && SrcTy != S32)
8386 return UnableToLegalize;
8387 if (DstTy != S32 && DstTy != S64)
8388 return UnableToLegalize;
8389
8390 // FPTOSI gives the same result as FPTOUI for positive signed integers.
8391 // FPTOUI additionally needs to handle fp values that convert to unsigned
8392 // integers >= 2^31 for an i32 result or 2^63 for i64. Call this bound 2^Exp.
8393
8394 APInt TwoPExpInt = APInt::getSignMask(BitWidth: DstTy.getSizeInBits());
8395 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
8396 : APFloat::IEEEdouble(),
8397 APInt::getZero(numBits: SrcTy.getSizeInBits()));
8398 TwoPExpFP.convertFromAPInt(Input: TwoPExpInt, IsSigned: false, RM: APFloat::rmNearestTiesToEven);
8399
8400 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src);
8401
8402 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(Res: SrcTy, Val: TwoPExpFP);
8403 // For fp values greater than or equal to the threshold (2^Exp), we use
8404 // FPTOSI on (Value - 2^Exp) and add 2^Exp back by setting the high bit.
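// Worked example (illustrative), f32 -> u32 with Threshold = 2^31 and
// Src = 3.0e9f: FPTOSI(3.0e9f - 2^31) = 852516352, and XORing with
// 0x80000000 sets the top bit, i.e. adds 2^31, yielding 3000000000.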
8405 MachineInstrBuilder FSub = MIRBuilder.buildFSub(Dst: SrcTy, Src0: Src, Src1: Threshold);
8406 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: FSub);
8407 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(Res: DstTy, Val: TwoPExpInt);
8408 MachineInstrBuilder Res = MIRBuilder.buildXor(Dst: DstTy, Src0: ResLowBits, Src1: ResHighBit);
8409
8410 const LLT S1 = LLT::scalar(SizeInBits: 1);
8411
8412 MachineInstrBuilder FCMP =
8413 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: S1, Op0: Src, Op1: Threshold);
8414 MIRBuilder.buildSelect(Res: Dst, Tst: FCMP, Op0: FPTOSI, Op1: Res);
8415
8416 MI.eraseFromParent();
8417 return Legalized;
8418}
8419
8420LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
8421 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8422 const LLT S64 = LLT::scalar(SizeInBits: 64);
8423 const LLT S32 = LLT::scalar(SizeInBits: 32);
8424
8425 // FIXME: Only f32 to i64 conversions are supported.
8426 if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
8427 return UnableToLegalize;
8428
8429 // Expand f32 -> i64 conversion
8430 // This algorithm comes from compiler-rt's implementation of fixsfdi:
8431 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
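// Sanity check (illustrative) with Src = 1.0f (0x3f800000): ExponentBits is
// 127, so Exponent = 0 and R = 0x00800000 zero-extended; Exponent <= 23
// selects Srl = R >> (23 - 0) = 1, the sign is 0, and the result is 1.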
8432
8433 unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
8434
8435 auto ExponentMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x7F800000);
8436 auto ExponentLoBit = MIRBuilder.buildConstant(Res: SrcTy, Val: 23);
8437
8438 auto AndExpMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: ExponentMask);
8439 auto ExponentBits = MIRBuilder.buildLShr(Dst: SrcTy, Src0: AndExpMask, Src1: ExponentLoBit);
8440
8441 auto SignMask = MIRBuilder.buildConstant(Res: SrcTy,
8442 Val: APInt::getSignMask(BitWidth: SrcEltBits));
8443 auto AndSignMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: SignMask);
8444 auto SignLowBit = MIRBuilder.buildConstant(Res: SrcTy, Val: SrcEltBits - 1);
8445 auto Sign = MIRBuilder.buildAShr(Dst: SrcTy, Src0: AndSignMask, Src1: SignLowBit);
8446 Sign = MIRBuilder.buildSExt(Res: DstTy, Op: Sign);
8447
8448 auto MantissaMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x007FFFFF);
8449 auto AndMantissaMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: MantissaMask);
8450 auto K = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x00800000);
8451
8452 auto R = MIRBuilder.buildOr(Dst: SrcTy, Src0: AndMantissaMask, Src1: K);
8453 R = MIRBuilder.buildZExt(Res: DstTy, Op: R);
8454
8455 auto Bias = MIRBuilder.buildConstant(Res: SrcTy, Val: 127);
8456 auto Exponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentBits, Src1: Bias);
8457 auto SubExponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: Exponent, Src1: ExponentLoBit);
8458 auto ExponentSub = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentLoBit, Src1: Exponent);
8459
8460 auto Shl = MIRBuilder.buildShl(Dst: DstTy, Src0: R, Src1: SubExponent);
8461 auto Srl = MIRBuilder.buildLShr(Dst: DstTy, Src0: R, Src1: ExponentSub);
8462
8463 const LLT S1 = LLT::scalar(SizeInBits: 1);
8464 auto CmpGt = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT,
8465 Res: S1, Op0: Exponent, Op1: ExponentLoBit);
8466
8467 R = MIRBuilder.buildSelect(Res: DstTy, Tst: CmpGt, Op0: Shl, Op1: Srl);
8468
8469 auto XorSign = MIRBuilder.buildXor(Dst: DstTy, Src0: R, Src1: Sign);
8470 auto Ret = MIRBuilder.buildSub(Dst: DstTy, Src0: XorSign, Src1: Sign);
8471
8472 auto ZeroSrcTy = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
8473
8474 auto ExponentLt0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT,
8475 Res: S1, Op0: Exponent, Op1: ZeroSrcTy);
8476
8477 auto ZeroDstTy = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
8478 MIRBuilder.buildSelect(Res: Dst, Tst: ExponentLt0, Op0: ZeroDstTy, Op1: Ret);
8479
8480 MI.eraseFromParent();
8481 return Legalized;
8482}
8483
8484LegalizerHelper::LegalizeResult
8485LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
8486 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8487
8488 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
8489 unsigned SatWidth = DstTy.getScalarSizeInBits();
8490
8491 // Determine minimum and maximum integer values and their corresponding
8492 // floating-point values.
8493 APInt MinInt, MaxInt;
8494 if (IsSigned) {
8495 MinInt = APInt::getSignedMinValue(numBits: SatWidth);
8496 MaxInt = APInt::getSignedMaxValue(numBits: SatWidth);
8497 } else {
8498 MinInt = APInt::getMinValue(numBits: SatWidth);
8499 MaxInt = APInt::getMaxValue(numBits: SatWidth);
8500 }
8501
8502 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
8503 APFloat MinFloat(Semantics);
8504 APFloat MaxFloat(Semantics);
8505
8506 APFloat::opStatus MinStatus =
8507 MinFloat.convertFromAPInt(Input: MinInt, IsSigned, RM: APFloat::rmTowardZero);
8508 APFloat::opStatus MaxStatus =
8509 MaxFloat.convertFromAPInt(Input: MaxInt, IsSigned, RM: APFloat::rmTowardZero);
8510 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
8511 !(MaxStatus & APFloat::opStatus::opInexact);
8512
8513 // If the integer bounds are exactly representable as floats, emit a
8514 // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
8515 // and selects.
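// For instance (illustrative): for f64 -> i32, both -2^31 and 2^31 - 1 are
// exact in f64, so the clamp path below is taken; for f32 -> i32 signed,
// 2^31 - 1 is not representable in f32, so the compare/select path is used.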
8516 if (AreExactFloatBounds) {
8517 // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
8518 auto MaxC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat);
8519 auto MaxP = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT,
8520 Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: MaxC);
8521 auto Max = MIRBuilder.buildSelect(Res: SrcTy, Tst: MaxP, Op0: Src, Op1: MaxC);
8522 // Clamp by MaxFloat from above. NaN cannot occur.
8523 auto MinC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat);
8524 auto MinP =
8525 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Max,
8526 Op1: MinC, Flags: MachineInstr::FmNoNans);
8527 auto Min =
8528 MIRBuilder.buildSelect(Res: SrcTy, Tst: MinP, Op0: Max, Op1: MinC, Flags: MachineInstr::FmNoNans);
8529 // Convert clamped value to integer. In the unsigned case we're done,
8530 // because we mapped NaN to MinFloat, which will cast to zero.
8531 if (!IsSigned) {
8532 MIRBuilder.buildFPTOUI(Dst, Src0: Min);
8533 MI.eraseFromParent();
8534 return Legalized;
8535 }
8536
8537 // Otherwise, select 0 if Src is NaN.
8538 auto FpToInt = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Min);
8539 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO,
8540 Res: DstTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: Src);
8541 MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0),
8542 Op1: FpToInt);
8543 MI.eraseFromParent();
8544 return Legalized;
8545 }
8546
8547 // Result of direct conversion. The assumption here is that the operation is
8548 // non-trapping and it's fine to apply it to an out-of-range value if we
8549 // select it away later.
8550 auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src)
8551 : MIRBuilder.buildFPTOUI(Dst: DstTy, Src0: Src);
8552
8553 // If Src ULT MinFloat, select MinInt. In particular, this also selects
8554 // MinInt if Src is NaN.
8555 auto ULT =
8556 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src,
8557 Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat));
8558 auto Max = MIRBuilder.buildSelect(
8559 Res: DstTy, Tst: ULT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MinInt), Op1: FpToInt);
8560 // If Src OGT MaxFloat, select MaxInt.
8561 auto OGT =
8562 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src,
8563 Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat));
8564
8565 // In the unsigned case we are done, because we mapped NaN to MinInt, which
8566 // is already zero.
8567 if (!IsSigned) {
8568 MIRBuilder.buildSelect(Res: Dst, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt),
8569 Op1: Max);
8570 MI.eraseFromParent();
8571 return Legalized;
8572 }
8573
8574 // Otherwise, select 0 if Src is NaN.
8575 auto Min = MIRBuilder.buildSelect(
8576 Res: DstTy, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt), Op1: Max);
8577 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO,
8578 Res: DstTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: Src);
8579 MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0), Op1: Min);
8580 MI.eraseFromParent();
8581 return Legalized;
8582}
8583
8584// f64 -> f16 conversion using round-to-nearest-even rounding mode.
8585LegalizerHelper::LegalizeResult
8586LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
8587 const LLT S1 = LLT::scalar(SizeInBits: 1);
8588 const LLT S32 = LLT::scalar(SizeInBits: 32);
8589
8590 auto [Dst, Src] = MI.getFirst2Regs();
8591 assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
8592 MRI.getType(Src).getScalarType() == LLT::scalar(64));
8593
8594 if (MRI.getType(Reg: Src).isVector()) // TODO: Handle vectors directly.
8595 return UnableToLegalize;
8596
8597 if (MI.getFlag(Flag: MachineInstr::FmAfn)) {
8598 unsigned Flags = MI.getFlags();
8599 auto Src32 = MIRBuilder.buildFPTrunc(Res: S32, Op: Src, Flags);
8600 MIRBuilder.buildFPTrunc(Res: Dst, Op: Src32, Flags);
8601 MI.eraseFromParent();
8602 return Legalized;
8603 }
8604
8605 const unsigned ExpMask = 0x7ff;
8606 const unsigned ExpBiasf64 = 1023;
8607 const unsigned ExpBiasf16 = 15;
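  // Bit layout reminder: f64 is 1 sign | 11 exponent (bias 1023) | 52
  // mantissa bits; f16 is 1 sign | 5 exponent (bias 15) | 10 mantissa bits.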
8608
8609 auto Unmerge = MIRBuilder.buildUnmerge(Res: S32, Op: Src);
8610 Register U = Unmerge.getReg(Idx: 0);
8611 Register UH = Unmerge.getReg(Idx: 1);
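  // G_UNMERGE_VALUES yields the low half first: U is the low 32 bits of the
  // f64 and UH the high 32 bits (sign, exponent, top 20 mantissa bits).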
8612
8613 auto E = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 20));
8614 E = MIRBuilder.buildAnd(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: ExpMask));
8615
8616 // Subtract the fp64 exponent bias (1023) to get the real exponent and
8617 // add the f16 bias (15) to get the biased exponent for the f16 format.
8618 E = MIRBuilder.buildAdd(
8619 Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: -ExpBiasf64 + ExpBiasf16));
8620
8621 auto M = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 8));
8622 M = MIRBuilder.buildAnd(Dst: S32, Src0: M, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0xffe));
8623
8624 auto MaskedSig = MIRBuilder.buildAnd(Dst: S32, Src0: UH,
8625 Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1ff));
8626 MaskedSig = MIRBuilder.buildOr(Dst: S32, Src0: MaskedSig, Src1: U);
8627
8628 auto Zero = MIRBuilder.buildConstant(Res: S32, Val: 0);
8629 auto SigCmpNE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: MaskedSig, Op1: Zero);
8630 auto Lo40Set = MIRBuilder.buildZExt(Res: S32, Op: SigCmpNE0);
8631 M = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: Lo40Set);
8632
8633 // (M != 0 ? 0x0200 : 0) | 0x7c00;
8634 auto Bits0x200 = MIRBuilder.buildConstant(Res: S32, Val: 0x0200);
8635 auto CmpM_NE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: M, Op1: Zero);
8636 auto SelectCC = MIRBuilder.buildSelect(Res: S32, Tst: CmpM_NE0, Op0: Bits0x200, Op1: Zero);
8637
8638 auto Bits0x7c00 = MIRBuilder.buildConstant(Res: S32, Val: 0x7c00);
8639 auto I = MIRBuilder.buildOr(Dst: S32, Src0: SelectCC, Src1: Bits0x7c00);
8640
8641 // N = M | (E << 12);
8642 auto EShl12 = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 12));
8643 auto N = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: EShl12);
8644
8645 // B = clamp(1-E, 0, 13);
8646 auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
8647 auto OneSubExp = MIRBuilder.buildSub(Dst: S32, Src0: One, Src1: E);
8648 auto B = MIRBuilder.buildSMax(Dst: S32, Src0: OneSubExp, Src1: Zero);
8649 B = MIRBuilder.buildSMin(Dst: S32, Src0: B, Src1: MIRBuilder.buildConstant(Res: S32, Val: 13));
8650
8651 auto SigSetHigh = MIRBuilder.buildOr(Dst: S32, Src0: M,
8652 Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1000));
8653
8654 auto D = MIRBuilder.buildLShr(Dst: S32, Src0: SigSetHigh, Src1: B);
8655 auto D0 = MIRBuilder.buildShl(Dst: S32, Src0: D, Src1: B);
8656
8657 auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1,
8658 Op0: D0, Op1: SigSetHigh);
8659 auto D1 = MIRBuilder.buildZExt(Res: S32, Op: D0_NE_SigSetHigh);
8660 D = MIRBuilder.buildOr(Dst: S32, Src0: D, Src1: D1);
8661
8662 auto CmpELtOne = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: E, Op1: One);
8663 auto V = MIRBuilder.buildSelect(Res: S32, Tst: CmpELtOne, Op0: D, Op1: N);
8664
8665 auto VLow3 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 7));
8666 V = MIRBuilder.buildLShr(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 2));
8667
8668 auto VLow3Eq3 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: VLow3,
8669 Op1: MIRBuilder.buildConstant(Res: S32, Val: 3));
8670 auto V0 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Eq3);
8671
8672 auto VLow3Gt5 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: VLow3,
8673 Op1: MIRBuilder.buildConstant(Res: S32, Val: 5));
8674 auto V1 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Gt5);
8675
8676 V1 = MIRBuilder.buildOr(Dst: S32, Src0: V0, Src1: V1);
8677 V = MIRBuilder.buildAdd(Dst: S32, Src0: V, Src1: V1);
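  // The two low bits of the pre-shift V are the discarded fraction and bit 2
  // is the new LSB, so this increment fires when the fraction is above one
  // half (low three bits 3 or 7) or exactly one half with an odd LSB (low
  // three bits 6), i.e. round-to-nearest-even.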
8678
8679 auto CmpEGt30 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1,
8680 Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 30));
8681 V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt30,
8682 Op0: MIRBuilder.buildConstant(Res: S32, Val: 0x7c00), Op1: V);
8683
8684 auto CmpEGt1039 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1,
8685 Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 1039));
8686 V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt1039, Op0: I, Op1: V);
8687
8688 // Extract the sign bit.
8689 auto Sign = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 16));
8690 Sign = MIRBuilder.buildAnd(Dst: S32, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x8000));
8691
8692 // Insert the sign bit
8693 V = MIRBuilder.buildOr(Dst: S32, Src0: Sign, Src1: V);
8694
8695 MIRBuilder.buildTrunc(Res: Dst, Op: V);
8696 MI.eraseFromParent();
8697 return Legalized;
8698}
8699
8700LegalizerHelper::LegalizeResult
8701LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
8702 auto [DstTy, SrcTy] = MI.getFirst2LLTs();
8703 const LLT S64 = LLT::scalar(SizeInBits: 64);
8704 const LLT S16 = LLT::scalar(SizeInBits: 16);
8705
8706 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
8707 return lowerFPTRUNC_F64_TO_F16(MI);
8708
8709 return UnableToLegalize;
8710}
8711
8712LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
8713 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8714 LLT Ty = MRI.getType(Reg: Dst);
8715
8716 auto CvtSrc1 = MIRBuilder.buildSITOFP(Dst: Ty, Src0: Src1);
8717 MIRBuilder.buildFPow(Dst, Src0, Src1: CvtSrc1, Flags: MI.getFlags());
8718 MI.eraseFromParent();
8719 return Legalized;
8720}
8721
8722static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
8723 switch (Opc) {
8724 case TargetOpcode::G_SMIN:
8725 return CmpInst::ICMP_SLT;
8726 case TargetOpcode::G_SMAX:
8727 return CmpInst::ICMP_SGT;
8728 case TargetOpcode::G_UMIN:
8729 return CmpInst::ICMP_ULT;
8730 case TargetOpcode::G_UMAX:
8731 return CmpInst::ICMP_UGT;
8732 default:
8733 llvm_unreachable("not in integer min/max");
8734 }
8735}
8736
8737LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
8738 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8739
8740 const CmpInst::Predicate Pred = minMaxToCompare(Opc: MI.getOpcode());
8741 LLT CmpType = MRI.getType(Reg: Dst).changeElementType(NewEltTy: LLT::scalar(SizeInBits: 1));
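  // For example, G_SMAX %a, %b lowers to (roughly):
  //   %cmp:_(s1) = G_ICMP intpred(sgt), %a, %b
  //   %dst = G_SELECT %cmp, %a, %b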
8742
8743 auto Cmp = MIRBuilder.buildICmp(Pred, Res: CmpType, Op0: Src0, Op1: Src1);
8744 MIRBuilder.buildSelect(Res: Dst, Tst: Cmp, Op0: Src0, Op1: Src1);
8745
8746 MI.eraseFromParent();
8747 return Legalized;
8748}
8749
8750LegalizerHelper::LegalizeResult
8751LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
8752 GSUCmp *Cmp = cast<GSUCmp>(Val: &MI);
8753
8754 Register Dst = Cmp->getReg(Idx: 0);
8755 LLT DstTy = MRI.getType(Reg: Dst);
8756 LLT SrcTy = MRI.getType(Reg: Cmp->getReg(Idx: 1));
8757 LLT CmpTy = DstTy.changeElementSize(NewEltSize: 1);
8758
8759 CmpInst::Predicate LTPredicate = Cmp->isSigned()
8760 ? CmpInst::Predicate::ICMP_SLT
8761 : CmpInst::Predicate::ICMP_ULT;
8762 CmpInst::Predicate GTPredicate = Cmp->isSigned()
8763 ? CmpInst::Predicate::ICMP_SGT
8764 : CmpInst::Predicate::ICMP_UGT;
8765
8766 auto Zero = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
8767 auto IsGT = MIRBuilder.buildICmp(Pred: GTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
8768 Op1: Cmp->getRHSReg());
8769 auto IsLT = MIRBuilder.buildICmp(Pred: LTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
8770 Op1: Cmp->getRHSReg());
8771
8772 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
8773 auto BC = TLI.getBooleanContents(isVec: DstTy.isVector(), /*isFP=*/isFloat: false);
8774 if (TLI.preferSelectsOverBooleanArithmetic(
8775 VT: getApproximateEVTForLLT(Ty: SrcTy, Ctx)) ||
8776 BC == TargetLowering::UndefinedBooleanContent) {
8777 auto One = MIRBuilder.buildConstant(Res: DstTy, Val: 1);
8778 auto SelectZeroOrOne = MIRBuilder.buildSelect(Res: DstTy, Tst: IsGT, Op0: One, Op1: Zero);
8779
8780 auto MinusOne = MIRBuilder.buildConstant(Res: DstTy, Val: -1);
8781 MIRBuilder.buildSelect(Res: Dst, Tst: IsLT, Op0: MinusOne, Op1: SelectZeroOrOne);
8782 } else {
8783 if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
8784 std::swap(a&: IsGT, b&: IsLT);
8785 // Extend boolean results to DstTy, which is at least i2, before subtracting
8786 // them.
8787 unsigned BoolExtOp =
8788 MIRBuilder.getBoolExtOp(IsVec: DstTy.isVector(), /*isFP=*/IsFP: false);
8789 IsGT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsGT});
8790 IsLT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsLT});
8791 MIRBuilder.buildSub(Dst, Src0: IsGT, Src1: IsLT);
8792 }
8793
8794 MI.eraseFromParent();
8795 return Legalized;
8796}
8797
8798LegalizerHelper::LegalizeResult
8799LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
8800 auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
8801 const int Src0Size = Src0Ty.getScalarSizeInBits();
8802 const int Src1Size = Src1Ty.getScalarSizeInBits();
8803
8804 auto SignBitMask = MIRBuilder.buildConstant(
8805 Res: Src0Ty, Val: APInt::getSignMask(BitWidth: Src0Size));
8806
8807 auto NotSignBitMask = MIRBuilder.buildConstant(
8808 Res: Src0Ty, Val: APInt::getLowBitsSet(numBits: Src0Size, loBitsSet: Src0Size - 1));
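  // For f32 operands these masks are 0x80000000 (sign bit) and 0x7fffffff
  // (all value bits), so the OR below splices Src1's sign bit onto Src0's
  // magnitude.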
8809
8810 Register And0 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0, Src1: NotSignBitMask).getReg(Idx: 0);
8811 Register And1;
8812 if (Src0Ty == Src1Ty) {
8813 And1 = MIRBuilder.buildAnd(Dst: Src1Ty, Src0: Src1, Src1: SignBitMask).getReg(Idx: 0);
8814 } else if (Src0Size > Src1Size) {
8815 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src0Ty, Val: Src0Size - Src1Size);
8816 auto Zext = MIRBuilder.buildZExt(Res: Src0Ty, Op: Src1);
8817 auto Shift = MIRBuilder.buildShl(Dst: Src0Ty, Src0: Zext, Src1: ShiftAmt);
8818 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Shift, Src1: SignBitMask).getReg(Idx: 0);
8819 } else {
8820 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src1Ty, Val: Src1Size - Src0Size);
8821 auto Shift = MIRBuilder.buildLShr(Dst: Src1Ty, Src0: Src1, Src1: ShiftAmt);
8822 auto Trunc = MIRBuilder.buildTrunc(Res: Src0Ty, Op: Shift);
8823 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Trunc, Src1: SignBitMask).getReg(Idx: 0);
8824 }
8825
8826 // Be careful about setting nsz/nnan/ninf on every instruction, since the
8827 // constants are a nan and -0.0, but the final result should preserve
8828 // everything.
8829 unsigned Flags = MI.getFlags();
8830
8831 // We masked the sign bit and the not-sign bit, so these are disjoint.
8832 Flags |= MachineInstr::Disjoint;
8833
8834 MIRBuilder.buildOr(Dst, Src0: And0, Src1: And1, Flags);
8835
8836 MI.eraseFromParent();
8837 return Legalized;
8838}
8839
8840LegalizerHelper::LegalizeResult
8841LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
8842 // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
8843  // identical handling. fminimumnum/fmaximumnum also need a path that does
8844  // not depend on fminnum/fmaxnum.
8845
8846 unsigned NewOp;
8847 switch (MI.getOpcode()) {
8848 case TargetOpcode::G_FMINNUM:
8849 NewOp = TargetOpcode::G_FMINNUM_IEEE;
8850 break;
8851 case TargetOpcode::G_FMINIMUMNUM:
8852 NewOp = TargetOpcode::G_FMINNUM;
8853 break;
8854 case TargetOpcode::G_FMAXNUM:
8855 NewOp = TargetOpcode::G_FMAXNUM_IEEE;
8856 break;
8857 case TargetOpcode::G_FMAXIMUMNUM:
8858 NewOp = TargetOpcode::G_FMAXNUM;
8859 break;
8860 default:
8861 llvm_unreachable("unexpected min/max opcode");
8862 }
8863
8864 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8865 LLT Ty = MRI.getType(Reg: Dst);
8866
8867 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
8868 // Insert canonicalizes if it's possible we need to quiet to get correct
8869 // sNaN behavior.
8870
8871    // Note this must be done here, and not as an optimization combine: in
8872    // the absence of a dedicated quiet-snan instruction we're using an
8873    // omni-purpose G_FCANONICALIZE.
8874 if (!isKnownNeverSNaN(Val: Src0, MRI))
8875 Src0 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0, Flags: MI.getFlags()).getReg(Idx: 0);
8876
8877 if (!isKnownNeverSNaN(Val: Src1, MRI))
8878 Src1 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0: Src1, Flags: MI.getFlags()).getReg(Idx: 0);
8879 }
8880
8881 // If there are no nans, it's safe to simply replace this with the non-IEEE
8882 // version.
8883 MIRBuilder.buildInstr(Opc: NewOp, DstOps: {Dst}, SrcOps: {Src0, Src1}, Flags: MI.getFlags());
8884 MI.eraseFromParent();
8885 return Legalized;
8886}
8887
8888LegalizerHelper::LegalizeResult
8889LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
8890 unsigned Opc = MI.getOpcode();
8891 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8892 LLT Ty = MRI.getType(Reg: Dst);
8893 LLT CmpTy = Ty.changeElementSize(NewEltSize: 1);
8894
8895 bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
8896 unsigned OpcIeee =
8897 IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
8898 unsigned OpcNonIeee =
8899 IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
8900 bool MinMaxMustRespectOrderedZero = false;
8901 Register Res;
8902
8903 // IEEE variants don't need canonicalization
8904 if (LI.isLegalOrCustom(Query: {OpcIeee, Ty})) {
8905 Res = MIRBuilder.buildInstr(Opc: OpcIeee, DstOps: {Ty}, SrcOps: {Src0, Src1}).getReg(Idx: 0);
8906 MinMaxMustRespectOrderedZero = true;
8907 } else if (LI.isLegalOrCustom(Query: {OpcNonIeee, Ty})) {
8908 Res = MIRBuilder.buildInstr(Opc: OpcNonIeee, DstOps: {Ty}, SrcOps: {Src0, Src1}).getReg(Idx: 0);
8909 } else {
8910 auto Compare = MIRBuilder.buildFCmp(
8911 Pred: IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, Res: CmpTy, Op0: Src0, Op1: Src1);
8912 Res = MIRBuilder.buildSelect(Res: Ty, Tst: Compare, Op0: Src0, Op1: Src1).getReg(Idx: 0);
8913 }
8914
8915 // Propagate any NaN of both operands
8916 if (!MI.getFlag(Flag: MachineInstr::FmNoNans) &&
8917       (!isKnownNeverNaN(Val: Src0, MRI) || !isKnownNeverNaN(Val: Src1, MRI))) {
8918 auto IsOrdered = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: CmpTy, Op0: Src0, Op1: Src1);
8919
8920 LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
8921 APFloat NaNValue = APFloat::getNaN(Sem: getFltSemanticForLLT(Ty: ElementTy));
8922 Register NaN = MIRBuilder.buildFConstant(Res: ElementTy, Val: NaNValue).getReg(Idx: 0);
8923 if (Ty.isVector())
8924 NaN = MIRBuilder.buildSplatBuildVector(Res: Ty, Src: NaN).getReg(Idx: 0);
8925
8926 Res = MIRBuilder.buildSelect(Res: Ty, Tst: IsOrdered, Op0: Res, Op1: NaN).getReg(Idx: 0);
8927 }
8928
8929 // fminimum/fmaximum requires -0.0 less than +0.0
8930 if (!MinMaxMustRespectOrderedZero && !MI.getFlag(Flag: MachineInstr::FmNsz)) {
8931 GISelValueTracking VT(MIRBuilder.getMF());
8932 KnownFPClass Src0Info = VT.computeKnownFPClass(R: Src0, InterestedClasses: fcZero);
8933 KnownFPClass Src1Info = VT.computeKnownFPClass(R: Src1, InterestedClasses: fcZero);
8934
8935 if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
8936 const unsigned Flags = MI.getFlags();
8937 Register Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0).getReg(Idx: 0);
8938 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OEQ, Res: CmpTy, Op0: Res, Op1: Zero);
8939
8940 unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
8941
8942 auto LHSTestZero = MIRBuilder.buildIsFPClass(Res: CmpTy, Src: Src0, Mask: TestClass);
8943 auto LHSSelect =
8944 MIRBuilder.buildSelect(Res: Ty, Tst: LHSTestZero, Op0: Src0, Op1: Res, Flags);
8945
8946 auto RHSTestZero = MIRBuilder.buildIsFPClass(Res: CmpTy, Src: Src1, Mask: TestClass);
8947 auto RHSSelect =
8948 MIRBuilder.buildSelect(Res: Ty, Tst: RHSTestZero, Op0: Src1, Op1: LHSSelect, Flags);
8949
8950 Res = MIRBuilder.buildSelect(Res: Ty, Tst: IsZero, Op0: RHSSelect, Op1: Res, Flags).getReg(Idx: 0);
8951 }
8952 }
8953
8954 MIRBuilder.buildCopy(Res: Dst, Op: Res);
8955 MI.eraseFromParent();
8956 return Legalized;
8957}
8958
8959LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
8960 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
8961 Register DstReg = MI.getOperand(i: 0).getReg();
8962 LLT Ty = MRI.getType(Reg: DstReg);
8963 unsigned Flags = MI.getFlags();
8964
8965 auto Mul = MIRBuilder.buildFMul(Dst: Ty, Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2),
8966 Flags);
8967 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Mul, Src1: MI.getOperand(i: 3), Flags);
8968 MI.eraseFromParent();
8969 return Legalized;
8970}
8971
8972LegalizerHelper::LegalizeResult
8973LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
8974 auto [DstReg, X] = MI.getFirst2Regs();
8975 const unsigned Flags = MI.getFlags();
8976 const LLT Ty = MRI.getType(Reg: DstReg);
8977 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
8978
8979 // round(x) =>
8980 // t = trunc(x);
8981 // d = fabs(x - t);
8982 // o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
8983 // return t + o;
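  // e.g. round(2.5): t = 2.0, d = 0.5, o = 1.0 => 3.0, and
  //      round(-2.5): t = -2.0, d = 0.5, o = -1.0 => -3.0
  // (ties round away from zero, as G_INTRINSIC_ROUND requires).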
8984
8985 auto T = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: X, Flags);
8986
8987 auto Diff = MIRBuilder.buildFSub(Dst: Ty, Src0: X, Src1: T, Flags);
8988 auto AbsDiff = MIRBuilder.buildFAbs(Dst: Ty, Src0: Diff, Flags);
8989
8990 auto Half = MIRBuilder.buildFConstant(Res: Ty, Val: 0.5);
8991 auto Cmp =
8992 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGE, Res: CondTy, Op0: AbsDiff, Op1: Half, Flags);
8993
8994 // Could emit G_UITOFP instead
8995 auto One = MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
8996 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
8997 auto BoolFP = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: One, Op1: Zero);
8998 auto SignedOffset = MIRBuilder.buildFCopysign(Dst: Ty, Src0: BoolFP, Src1: X);
8999
9000 MIRBuilder.buildFAdd(Dst: DstReg, Src0: T, Src1: SignedOffset, Flags);
9001
9002 MI.eraseFromParent();
9003 return Legalized;
9004}
9005
9006LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
9007 auto [DstReg, SrcReg] = MI.getFirst2Regs();
9008 unsigned Flags = MI.getFlags();
9009 LLT Ty = MRI.getType(Reg: DstReg);
9010 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
9011
9012 // result = trunc(src);
9013 // if (src < 0.0 && src != result)
9014 // result += -1.0.
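  // e.g. floor(-1.5): trunc gives -1.0, both conditions hold, and the s1
  // true value sign-extends through G_SITOFP to -1.0, producing -2.0.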
9015
9016 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: SrcReg, Flags);
9017 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
9018
9019 auto Lt0 = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: CondTy,
9020 Op0: SrcReg, Op1: Zero, Flags);
9021 auto NeTrunc = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: CondTy,
9022 Op0: SrcReg, Op1: Trunc, Flags);
9023 auto And = MIRBuilder.buildAnd(Dst: CondTy, Src0: Lt0, Src1: NeTrunc);
9024 auto AddVal = MIRBuilder.buildSITOFP(Dst: Ty, Src0: And);
9025
9026 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Trunc, Src1: AddVal, Flags);
9027 MI.eraseFromParent();
9028 return Legalized;
9029}
9030
9031LegalizerHelper::LegalizeResult
9032LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
9033 const unsigned NumOps = MI.getNumOperands();
9034 auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
9035 unsigned PartSize = Src0Ty.getSizeInBits();
9036
9037 LLT WideTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
9038 Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src0Reg).getReg(Idx: 0);
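  // e.g. merging four s8 parts into an s32 builds
  // zext(p0) | (zext(p1) << 8) | (zext(p2) << 16) | (zext(p3) << 24),
  // with the first source operand landing in the least significant bits.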
9039
9040 for (unsigned I = 2; I != NumOps; ++I) {
9041 const unsigned Offset = (I - 1) * PartSize;
9042
9043 Register SrcReg = MI.getOperand(i: I).getReg();
9044 auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);
9045
9046 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
9047 MRI.createGenericVirtualRegister(Ty: WideTy);
9048
9049 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
9050 auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
9051 MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
9052 ResultReg = NextResult;
9053 }
9054
9055 if (DstTy.isPointer()) {
9056 if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
9057 AddrSpace: DstTy.getAddressSpace())) {
9058 LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
9059 return UnableToLegalize;
9060 }
9061
9062 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
9063 }
9064
9065 MI.eraseFromParent();
9066 return Legalized;
9067}
9068
9069LegalizerHelper::LegalizeResult
9070LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
9071 const unsigned NumDst = MI.getNumOperands() - 1;
9072 Register SrcReg = MI.getOperand(i: NumDst).getReg();
9073 Register Dst0Reg = MI.getOperand(i: 0).getReg();
9074 LLT DstTy = MRI.getType(Reg: Dst0Reg);
9075 if (DstTy.isPointer())
9076 return UnableToLegalize; // TODO
9077
9078 SrcReg = coerceToScalar(Val: SrcReg);
9079 if (!SrcReg)
9080 return UnableToLegalize;
9081
9082 // Expand scalarizing unmerge as bitcast to integer and shift.
9083 LLT IntTy = MRI.getType(Reg: SrcReg);
9084
9085 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
9086
9087 const unsigned DstSize = DstTy.getSizeInBits();
9088 unsigned Offset = DstSize;
9089 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
9090 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntTy, Val: Offset);
9091 auto Shift = MIRBuilder.buildLShr(Dst: IntTy, Src0: SrcReg, Src1: ShiftAmt);
9092 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shift);
9093 }
9094
9095 MI.eraseFromParent();
9096 return Legalized;
9097}
9098
9099/// Lower a vector extract or insert by writing the vector to a stack temporary
9100/// and reloading the element or vector.
9101///
9102/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
9103/// =>
9104/// %stack_temp = G_FRAME_INDEX
9105/// G_STORE %vec, %stack_temp
9106/// %idx = clamp(%idx, %vec.getNumElements())
9107/// %element_ptr = G_PTR_ADD %stack_temp, %idx
9108/// %dst = G_LOAD %element_ptr
9109LegalizerHelper::LegalizeResult
9110LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
9111 Register DstReg = MI.getOperand(i: 0).getReg();
9112 Register SrcVec = MI.getOperand(i: 1).getReg();
9113 Register InsertVal;
9114 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
9115 InsertVal = MI.getOperand(i: 2).getReg();
9116
9117 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
9118
9119 LLT VecTy = MRI.getType(Reg: SrcVec);
9120 LLT EltTy = VecTy.getElementType();
9121 unsigned NumElts = VecTy.getNumElements();
9122
9123 int64_t IdxVal;
9124   if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal)) && IdxVal >= 0 && IdxVal < NumElts) {
9125 SmallVector<Register, 8> SrcRegs;
9126 extractParts(Reg: SrcVec, Ty: EltTy, NumParts: NumElts, VRegs&: SrcRegs, MIRBuilder, MRI);
9127
9128 if (InsertVal) {
9129 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
9130 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcRegs);
9131 } else {
9132 MIRBuilder.buildCopy(Res: DstReg, Op: SrcRegs[IdxVal]);
9133 }
9134
9135 MI.eraseFromParent();
9136 return Legalized;
9137 }
9138
9139 if (!EltTy.isByteSized()) { // Not implemented.
9140 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
9141 return UnableToLegalize;
9142 }
9143
9144 unsigned EltBytes = EltTy.getSizeInBytes();
9145 Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
9146 Align EltAlign;
9147
9148 MachinePointerInfo PtrInfo;
9149 auto StackTemp = createStackTemporary(
9150 Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign, PtrInfo);
9151 MIRBuilder.buildStore(Val: SrcVec, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
9152
9153 // Get the pointer to the element, and be sure not to hit undefined behavior
9154 // if the index is out of bounds.
9155 Register EltPtr = getVectorElementPointer(VecPtr: StackTemp.getReg(Idx: 0), VecTy, Index: Idx);
9156
9157 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal))) {
9158 int64_t Offset = IdxVal * EltBytes;
9159 PtrInfo = PtrInfo.getWithOffset(O: Offset);
9160 EltAlign = commonAlignment(A: VecAlign, Offset);
9161 } else {
9162 // We lose information with a variable offset.
9163 EltAlign = getStackTemporaryAlignment(Ty: EltTy);
9164 PtrInfo = MachinePointerInfo(MRI.getType(Reg: EltPtr).getAddressSpace());
9165 }
9166
9167 if (InsertVal) {
9168 // Write the inserted element
9169 MIRBuilder.buildStore(Val: InsertVal, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
9170
9171 // Reload the whole vector.
9172 MIRBuilder.buildLoad(Res: DstReg, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
9173 } else {
9174 MIRBuilder.buildLoad(Res: DstReg, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
9175 }
9176
9177 MI.eraseFromParent();
9178 return Legalized;
9179}
9180
9181LegalizerHelper::LegalizeResult
9182LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
9183 auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
9184 MI.getFirst3RegLLTs();
9185 LLT IdxTy = LLT::scalar(SizeInBits: 32);
9186
9187 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
9188 Register Undef;
9189 SmallVector<Register, 32> BuildVec;
9190 LLT EltTy = DstTy.getScalarType();
9191
9192 DenseMap<unsigned, Register> CachedExtract;
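  // Lower to one G_EXTRACT_VECTOR_ELT per distinct mask index (cached in
  // CachedExtract) followed by a single G_BUILD_VECTOR; mask indices at or
  // beyond the first source's element count select from the second source.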
9193
9194 for (int Idx : Mask) {
9195 if (Idx < 0) {
9196 if (!Undef.isValid())
9197 Undef = MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0);
9198 BuildVec.push_back(Elt: Undef);
9199 continue;
9200 }
9201
9202 assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR");
9203
9204 int NumElts = Src0Ty.getNumElements();
9205 Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
9206 int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
9207 auto [It, Inserted] = CachedExtract.try_emplace(Key: Idx);
9208 if (Inserted) {
9209 auto IdxK = MIRBuilder.buildConstant(Res: IdxTy, Val: ExtractIdx);
9210 It->second =
9211 MIRBuilder.buildExtractVectorElement(Res: EltTy, Val: SrcVec, Idx: IdxK).getReg(Idx: 0);
9212 }
9213 BuildVec.push_back(Elt: It->second);
9214 }
9215
9216 assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR");
9217 MIRBuilder.buildBuildVector(Res: DstReg, Ops: BuildVec);
9218 MI.eraseFromParent();
9219 return Legalized;
9220}
9221
9222LegalizerHelper::LegalizeResult
9223LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
9224 auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
9225 MI.getFirst4RegLLTs();
9226
9227 if (VecTy.isScalableVector())
9228 report_fatal_error(reason: "Cannot expand masked_compress for scalable vectors.");
9229
9230 Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
9231 MachinePointerInfo PtrInfo;
9232 Register StackPtr =
9233 createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign,
9234 PtrInfo)
9235 .getReg(Idx: 0);
9236 MachinePointerInfo ValPtrInfo =
9237 MachinePointerInfo::getUnknownStack(MF&: *MI.getMF());
9238
9239 LLT IdxTy = LLT::scalar(SizeInBits: 32);
9240 LLT ValTy = VecTy.getElementType();
9241 Align ValAlign = getStackTemporaryAlignment(Ty: ValTy);
9242
9243 auto OutPos = MIRBuilder.buildConstant(Res: IdxTy, Val: 0);
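  // Expansion sketch: selected elements are stored to the stack slot in
  // order. OutPos only advances past a lane whose mask bit is set, so the
  // next selected element overwrites whatever an unselected lane stored.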
9244
9245 bool HasPassthru =
9246 MRI.getVRegDef(Reg: Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;
9247
9248 if (HasPassthru)
9249 MIRBuilder.buildStore(Val: Passthru, Addr: StackPtr, PtrInfo, Alignment: VecAlign);
9250
9251 Register LastWriteVal;
9252 std::optional<APInt> PassthruSplatVal =
9253 isConstantOrConstantSplatVector(MI&: *MRI.getVRegDef(Reg: Passthru), MRI);
9254
9255 if (PassthruSplatVal.has_value()) {
9256 LastWriteVal =
9257 MIRBuilder.buildConstant(Res: ValTy, Val: PassthruSplatVal.value()).getReg(Idx: 0);
9258 } else if (HasPassthru) {
9259 auto Popcount = MIRBuilder.buildZExt(Res: MaskTy.changeElementSize(NewEltSize: 32), Op: Mask);
9260 Popcount = MIRBuilder.buildInstr(Opc: TargetOpcode::G_VECREDUCE_ADD,
9261 DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {Popcount});
9262
9263 Register LastElmtPtr =
9264 getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: Popcount.getReg(Idx: 0));
9265 LastWriteVal =
9266 MIRBuilder.buildLoad(Res: ValTy, Addr: LastElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign)
9267 .getReg(Idx: 0);
9268 }
9269
9270 unsigned NumElmts = VecTy.getNumElements();
9271 for (unsigned I = 0; I < NumElmts; ++I) {
9272 auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
9273 auto Val = MIRBuilder.buildExtractVectorElement(Res: ValTy, Val: Vec, Idx);
9274 Register ElmtPtr =
9275 getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));
9276 MIRBuilder.buildStore(Val, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);
9277
9278 LLT MaskITy = MaskTy.getElementType();
9279 auto MaskI = MIRBuilder.buildExtractVectorElement(Res: MaskITy, Val: Mask, Idx);
9280 if (MaskITy.getSizeInBits() > 1)
9281 MaskI = MIRBuilder.buildTrunc(Res: LLT::scalar(SizeInBits: 1), Op: MaskI);
9282
9283 MaskI = MIRBuilder.buildZExt(Res: IdxTy, Op: MaskI);
9284 OutPos = MIRBuilder.buildAdd(Dst: IdxTy, Src0: OutPos, Src1: MaskI);
9285
9286 if (HasPassthru && I == NumElmts - 1) {
9287 auto EndOfVector =
9288 MIRBuilder.buildConstant(Res: IdxTy, Val: VecTy.getNumElements() - 1);
9289 auto AllLanesSelected = MIRBuilder.buildICmp(
9290 Pred: CmpInst::ICMP_UGT, Res: LLT::scalar(SizeInBits: 1), Op0: OutPos, Op1: EndOfVector);
9291 OutPos = MIRBuilder.buildInstr(Opc: TargetOpcode::G_UMIN, DstOps: {IdxTy},
9292 SrcOps: {OutPos, EndOfVector});
9293 ElmtPtr = getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));
9294
9295 LastWriteVal =
9296 MIRBuilder.buildSelect(Res: ValTy, Tst: AllLanesSelected, Op0: Val, Op1: LastWriteVal)
9297 .getReg(Idx: 0);
9298 MIRBuilder.buildStore(Val: LastWriteVal, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);
9299 }
9300 }
9301
9302 // TODO: Use StackPtr's FrameIndex alignment.
9303 MIRBuilder.buildLoad(Res: Dst, Addr: StackPtr, PtrInfo, Alignment: VecAlign);
9304
9305 MI.eraseFromParent();
9306 return Legalized;
9307}
9308
9309Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
9310 Register AllocSize,
9311 Align Alignment,
9312 LLT PtrTy) {
9313 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
9314
9315 auto SPTmp = MIRBuilder.buildCopy(Res: PtrTy, Op: SPReg);
9316 SPTmp = MIRBuilder.buildCast(Dst: IntPtrTy, Src: SPTmp);
9317
9318 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
9319 // have to generate an extra instruction to negate the alloc and then use
9320 // G_PTR_ADD to add the negative offset.
9321 auto Alloc = MIRBuilder.buildSub(Dst: IntPtrTy, Src0: SPTmp, Src1: AllocSize);
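  // If extra alignment was requested, round the new SP down: e.g. for
  // Align(16) on a 64-bit target AlignMask is ~15 (0x...fff0), which keeps
  // the allocation aligned on a downward-growing stack.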
9322 if (Alignment > Align(1)) {
9323 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
9324 AlignMask.negate();
9325 auto AlignCst = MIRBuilder.buildConstant(Res: IntPtrTy, Val: AlignMask);
9326 Alloc = MIRBuilder.buildAnd(Dst: IntPtrTy, Src0: Alloc, Src1: AlignCst);
9327 }
9328
9329 return MIRBuilder.buildCast(Dst: PtrTy, Src: Alloc).getReg(Idx: 0);
9330}
9331
9332LegalizerHelper::LegalizeResult
9333LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
9334 const auto &MF = *MI.getMF();
9335 const auto &TFI = *MF.getSubtarget().getFrameLowering();
9336 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
9337 return UnableToLegalize;
9338
9339 Register Dst = MI.getOperand(i: 0).getReg();
9340 Register AllocSize = MI.getOperand(i: 1).getReg();
9341 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
9342
9343 LLT PtrTy = MRI.getType(Reg: Dst);
9344 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
9345 Register SPTmp =
9346 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
9347
9348 MIRBuilder.buildCopy(Res: SPReg, Op: SPTmp);
9349 MIRBuilder.buildCopy(Res: Dst, Op: SPTmp);
9350
9351 MI.eraseFromParent();
9352 return Legalized;
9353}
9354
9355LegalizerHelper::LegalizeResult
9356LegalizerHelper::lowerStackSave(MachineInstr &MI) {
9357 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
9358 if (!StackPtr)
9359 return UnableToLegalize;
9360
9361 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: StackPtr);
9362 MI.eraseFromParent();
9363 return Legalized;
9364}
9365
9366LegalizerHelper::LegalizeResult
9367LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
9368 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
9369 if (!StackPtr)
9370 return UnableToLegalize;
9371
9372 MIRBuilder.buildCopy(Res: StackPtr, Op: MI.getOperand(i: 0));
9373 MI.eraseFromParent();
9374 return Legalized;
9375}
9376
9377LegalizerHelper::LegalizeResult
9378LegalizerHelper::lowerExtract(MachineInstr &MI) {
9379 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
9380 unsigned Offset = MI.getOperand(i: 2).getImm();
9381
9382 // Extract sub-vector or one element
9383 if (SrcTy.isVector()) {
9384 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
9385 unsigned DstSize = DstTy.getSizeInBits();
9386
9387 if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
9388 (Offset + DstSize <= SrcTy.getSizeInBits())) {
9389 // Unmerge and allow access to each Src element for the artifact combiner.
9390 auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcTy.getElementType(), Op: SrcReg);
9391
9392 // Take element(s) we need to extract and copy it (merge them).
9393 SmallVector<Register, 8> SubVectorElts;
9394 for (unsigned Idx = Offset / SrcEltSize;
9395 Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
9396 SubVectorElts.push_back(Elt: Unmerge.getReg(Idx));
9397 }
9398 if (SubVectorElts.size() == 1)
9399 MIRBuilder.buildCopy(Res: DstReg, Op: SubVectorElts[0]);
9400 else
9401 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SubVectorElts);
9402
9403 MI.eraseFromParent();
9404 return Legalized;
9405 }
9406 }
9407
9408 if (DstTy.isScalar() &&
9409 (SrcTy.isScalar() ||
9410 (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
9411 LLT SrcIntTy = SrcTy;
9412 if (!SrcTy.isScalar()) {
9413 SrcIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
9414 SrcReg = MIRBuilder.buildBitcast(Dst: SrcIntTy, Src: SrcReg).getReg(Idx: 0);
9415 }
9416
9417 if (Offset == 0)
9418 MIRBuilder.buildTrunc(Res: DstReg, Op: SrcReg);
9419 else {
9420 auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcIntTy, Val: Offset);
9421 auto Shr = MIRBuilder.buildLShr(Dst: SrcIntTy, Src0: SrcReg, Src1: ShiftAmt);
9422 MIRBuilder.buildTrunc(Res: DstReg, Op: Shr);
9423 }
9424
9425 MI.eraseFromParent();
9426 return Legalized;
9427 }
9428
9429 return UnableToLegalize;
9430}
9431
9432LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
9433 auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
9434 uint64_t Offset = MI.getOperand(i: 3).getImm();
9435
9436 LLT DstTy = MRI.getType(Reg: Src);
9437 LLT InsertTy = MRI.getType(Reg: InsertSrc);
9438
9439 // Insert sub-vector or one element
9440 if (DstTy.isVector() && !InsertTy.isPointer()) {
9441 LLT EltTy = DstTy.getElementType();
9442 unsigned EltSize = EltTy.getSizeInBits();
9443 unsigned InsertSize = InsertTy.getSizeInBits();
9444
9445 if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
9446 (Offset + InsertSize <= DstTy.getSizeInBits())) {
9447 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: Src);
9448 SmallVector<Register, 8> DstElts;
9449 unsigned Idx = 0;
9450 // Elements from Src before insert start Offset
9451 for (; Idx < Offset / EltSize; ++Idx) {
9452 DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
9453 }
9454
9455 // Replace elements in Src with elements from InsertSrc
9456 if (InsertTy.getSizeInBits() > EltSize) {
9457 auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: InsertSrc);
9458 for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
9459 ++Idx, ++i) {
9460 DstElts.push_back(Elt: UnmergeInsertSrc.getReg(Idx: i));
9461 }
9462 } else {
9463 DstElts.push_back(Elt: InsertSrc);
9464 ++Idx;
9465 }
9466
9467 // Remaining elements from Src after insert
9468 for (; Idx < DstTy.getNumElements(); ++Idx) {
9469 DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
9470 }
9471
9472 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: DstElts);
9473 MI.eraseFromParent();
9474 return Legalized;
9475 }
9476 }
9477
9478 if (InsertTy.isVector() ||
9479 (DstTy.isVector() && DstTy.getElementType() != InsertTy))
9480 return UnableToLegalize;
9481
9482 const DataLayout &DL = MIRBuilder.getDataLayout();
9483 if ((DstTy.isPointer() &&
9484 DL.isNonIntegralAddressSpace(AddrSpace: DstTy.getAddressSpace())) ||
9485 (InsertTy.isPointer() &&
9486 DL.isNonIntegralAddressSpace(AddrSpace: InsertTy.getAddressSpace()))) {
9487 LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
9488 return UnableToLegalize;
9489 }
9490
9491 LLT IntDstTy = DstTy;
9492
9493 if (!DstTy.isScalar()) {
9494 IntDstTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
9495 Src = MIRBuilder.buildCast(Dst: IntDstTy, Src).getReg(Idx: 0);
9496 }
9497
9498 if (!InsertTy.isScalar()) {
9499 const LLT IntInsertTy = LLT::scalar(SizeInBits: InsertTy.getSizeInBits());
9500 InsertSrc = MIRBuilder.buildPtrToInt(Dst: IntInsertTy, Src: InsertSrc).getReg(Idx: 0);
9501 }
9502
9503 Register ExtInsSrc = MIRBuilder.buildZExt(Res: IntDstTy, Op: InsertSrc).getReg(Idx: 0);
9504 if (Offset != 0) {
9505 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntDstTy, Val: Offset);
9506 ExtInsSrc = MIRBuilder.buildShl(Dst: IntDstTy, Src0: ExtInsSrc, Src1: ShiftAmt).getReg(Idx: 0);
9507 }
9508
9509 APInt MaskVal = APInt::getBitsSetWithWrap(
9510 numBits: DstTy.getSizeInBits(), loBit: Offset + InsertTy.getSizeInBits(), hiBit: Offset);
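  // e.g. inserting an s8 at bit offset 8 into an s32 produces
  // MaskVal = 0xffff00ff, clearing the destination byte before the OR.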
9511
9512 auto Mask = MIRBuilder.buildConstant(Res: IntDstTy, Val: MaskVal);
9513 auto MaskedSrc = MIRBuilder.buildAnd(Dst: IntDstTy, Src0: Src, Src1: Mask);
9514 auto Or = MIRBuilder.buildOr(Dst: IntDstTy, Src0: MaskedSrc, Src1: ExtInsSrc);
9515
9516 MIRBuilder.buildCast(Dst, Src: Or);
9517 MI.eraseFromParent();
9518 return Legalized;
9519}
9520
9521LegalizerHelper::LegalizeResult
9522LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
9523 auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
9524 MI.getFirst4RegLLTs();
9525 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
9526
9527 LLT Ty = Dst0Ty;
9528 LLT BoolTy = Dst1Ty;
9529
9530 Register NewDst0 = MRI.cloneVirtualRegister(VReg: Dst0);
9531
9532 if (IsAdd)
9533 MIRBuilder.buildAdd(Dst: NewDst0, Src0: LHS, Src1: RHS);
9534 else
9535 MIRBuilder.buildSub(Dst: NewDst0, Src0: LHS, Src1: RHS);
9536
9537 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
9538
9539 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9540
9541 // For an addition, the result should be less than one of the operands (LHS)
9542 // if and only if the other operand (RHS) is negative, otherwise there will
9543 // be overflow.
9544 // For a subtraction, the result should be less than one of the operands
9545 // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
9546 // otherwise there will be overflow.
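  // e.g. i8 saddo 100, 50: the sum wraps to -106, so (sum <s lhs) holds
  // while (rhs <s 0) does not; the XOR of the two flags reports overflow.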
9547 auto ResultLowerThanLHS =
9548 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: NewDst0, Op1: LHS);
9549 auto ConditionRHS = MIRBuilder.buildICmp(
9550 Pred: IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, Res: BoolTy, Op0: RHS, Op1: Zero);
9551
9552 MIRBuilder.buildXor(Dst: Dst1, Src0: ConditionRHS, Src1: ResultLowerThanLHS);
9553
9554 MIRBuilder.buildCopy(Res: Dst0, Op: NewDst0);
9555 MI.eraseFromParent();
9556
9557 return Legalized;
9558}
9559
9560LegalizerHelper::LegalizeResult LegalizerHelper::lowerSADDE(MachineInstr &MI) {
9561 auto [Res, OvOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
9562 const LLT Ty = MRI.getType(Reg: Res);
9563
9564 // sum = LHS + RHS + zext(CarryIn)
9565 auto Tmp = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);
9566 auto CarryZ = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
9567 auto Sum = MIRBuilder.buildAdd(Dst: Ty, Src0: Tmp, Src1: CarryZ);
9568 MIRBuilder.buildCopy(Res, Op: Sum);
9569
9570 // OvOut = icmp slt ((sum ^ lhs) & (sum ^ rhs)), 0
9571 auto AX = MIRBuilder.buildXor(Dst: Ty, Src0: Sum, Src1: LHS);
9572 auto BX = MIRBuilder.buildXor(Dst: Ty, Src0: Sum, Src1: RHS);
9573 auto T = MIRBuilder.buildAnd(Dst: Ty, Src0: AX, Src1: BX);
9574
9575 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9576 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: OvOut, Op0: T, Op1: Zero);
9577
9578 MI.eraseFromParent();
9579 return Legalized;
9580}
9581
9582LegalizerHelper::LegalizeResult LegalizerHelper::lowerSSUBE(MachineInstr &MI) {
9583 auto [Res, OvOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
9584 const LLT Ty = MRI.getType(Reg: Res);
9585
9586 // Diff = LHS - (RHS + zext(CarryIn))
9587 auto CarryZ = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
9588 auto RHSPlusCI = MIRBuilder.buildAdd(Dst: Ty, Src0: RHS, Src1: CarryZ);
9589 auto Diff = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHSPlusCI);
9590 MIRBuilder.buildCopy(Res, Op: Diff);
9591
9592 // ov = msb((LHS ^ RHS) & (LHS ^ Diff))
9593 auto X1 = MIRBuilder.buildXor(Dst: Ty, Src0: LHS, Src1: RHS);
9594 auto X2 = MIRBuilder.buildXor(Dst: Ty, Src0: LHS, Src1: Diff);
9595 auto T = MIRBuilder.buildAnd(Dst: Ty, Src0: X1, Src1: X2);
9596 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9597 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: OvOut, Op0: T, Op1: Zero);
9598
9599 MI.eraseFromParent();
9600 return Legalized;
9601}
9602
9603LegalizerHelper::LegalizeResult
9604LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
9605 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9606 LLT Ty = MRI.getType(Reg: Res);
9607 bool IsSigned;
9608 bool IsAdd;
9609 unsigned BaseOp;
9610 switch (MI.getOpcode()) {
9611 default:
9612 llvm_unreachable("unexpected addsat/subsat opcode");
9613 case TargetOpcode::G_UADDSAT:
9614 IsSigned = false;
9615 IsAdd = true;
9616 BaseOp = TargetOpcode::G_ADD;
9617 break;
9618 case TargetOpcode::G_SADDSAT:
9619 IsSigned = true;
9620 IsAdd = true;
9621 BaseOp = TargetOpcode::G_ADD;
9622 break;
9623 case TargetOpcode::G_USUBSAT:
9624 IsSigned = false;
9625 IsAdd = false;
9626 BaseOp = TargetOpcode::G_SUB;
9627 break;
9628 case TargetOpcode::G_SSUBSAT:
9629 IsSigned = true;
9630 IsAdd = false;
9631 BaseOp = TargetOpcode::G_SUB;
9632 break;
9633 }
9634
9635 if (IsSigned) {
9636 // sadd.sat(a, b) ->
9637 // hi = 0x7fffffff - smax(a, 0)
9638 // lo = 0x80000000 - smin(a, 0)
9639 // a + smin(smax(lo, b), hi)
9640 // ssub.sat(a, b) ->
9641 // lo = smax(a, -1) - 0x7fffffff
9642 // hi = smin(a, -1) - 0x80000000
9643 // a - smin(smax(lo, b), hi)
9644 // TODO: AMDGPU can use a "median of 3" instruction here:
9645 // a +/- med3(lo, b, hi)
9646 uint64_t NumBits = Ty.getScalarSizeInBits();
9647 auto MaxVal =
9648 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: NumBits));
9649 auto MinVal =
9650 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
9651 MachineInstrBuilder Hi, Lo;
9652 if (IsAdd) {
9653 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9654 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MaxVal, Src1: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: Zero));
9655 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MinVal, Src1: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: Zero));
9656 } else {
9657 auto NegOne = MIRBuilder.buildConstant(Res: Ty, Val: -1);
9658 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: NegOne),
9659 Src1: MaxVal);
9660 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: NegOne),
9661 Src1: MinVal);
9662 }
9663 auto RHSClamped =
9664 MIRBuilder.buildSMin(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: Lo, Src1: RHS), Src1: Hi);
9665 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, RHSClamped});
9666 } else {
9667 // uadd.sat(a, b) -> a + umin(~a, b)
9668 // usub.sat(a, b) -> a - umin(a, b)
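    // e.g. i8 uadd.sat 200, 100: umin(~200, 100) = umin(55, 100) = 55 and
    // 200 + 55 = 255, the unsigned saturation limit.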
9669 Register Not = IsAdd ? MIRBuilder.buildNot(Dst: Ty, Src0: LHS).getReg(Idx: 0) : LHS;
9670 auto Min = MIRBuilder.buildUMin(Dst: Ty, Src0: Not, Src1: RHS);
9671 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, Min});
9672 }
9673
9674 MI.eraseFromParent();
9675 return Legalized;
9676}
9677
9678LegalizerHelper::LegalizeResult
9679LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
9680 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9681 LLT Ty = MRI.getType(Reg: Res);
9682 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
9683 bool IsSigned;
9684 bool IsAdd;
9685 unsigned OverflowOp;
9686 switch (MI.getOpcode()) {
9687 default:
9688 llvm_unreachable("unexpected addsat/subsat opcode");
9689 case TargetOpcode::G_UADDSAT:
9690 IsSigned = false;
9691 IsAdd = true;
9692 OverflowOp = TargetOpcode::G_UADDO;
9693 break;
9694 case TargetOpcode::G_SADDSAT:
9695 IsSigned = true;
9696 IsAdd = true;
9697 OverflowOp = TargetOpcode::G_SADDO;
9698 break;
9699 case TargetOpcode::G_USUBSAT:
9700 IsSigned = false;
9701 IsAdd = false;
9702 OverflowOp = TargetOpcode::G_USUBO;
9703 break;
9704 case TargetOpcode::G_SSUBSAT:
9705 IsSigned = true;
9706 IsAdd = false;
9707 OverflowOp = TargetOpcode::G_SSUBO;
9708 break;
9709 }
9710
9711 auto OverflowRes =
9712 MIRBuilder.buildInstr(Opc: OverflowOp, DstOps: {Ty, BoolTy}, SrcOps: {LHS, RHS});
9713 Register Tmp = OverflowRes.getReg(Idx: 0);
9714 Register Ov = OverflowRes.getReg(Idx: 1);
9715 MachineInstrBuilder Clamp;
9716 if (IsSigned) {
9717 // sadd.sat(a, b) ->
9718 // {tmp, ov} = saddo(a, b)
9719     //   ov ? (tmp >>s 31) + 0x80000000 : tmp
9720 // ssub.sat(a, b) ->
9721 // {tmp, ov} = ssubo(a, b)
9722     //   ov ? (tmp >>s 31) + 0x80000000 : tmp
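    // The wrapped result's sign picks the clamp value: positive overflow
    // wraps negative, so the arithmetic shift yields all-ones and adding
    // MinVal gives the signed max; negative overflow wraps non-negative and
    // gives the signed min.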
9723 uint64_t NumBits = Ty.getScalarSizeInBits();
9724 auto ShiftAmount = MIRBuilder.buildConstant(Res: Ty, Val: NumBits - 1);
9725 auto Sign = MIRBuilder.buildAShr(Dst: Ty, Src0: Tmp, Src1: ShiftAmount);
9726 auto MinVal =
9727 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
9728 Clamp = MIRBuilder.buildAdd(Dst: Ty, Src0: Sign, Src1: MinVal);
9729 } else {
9730 // uadd.sat(a, b) ->
9731 // {tmp, ov} = uaddo(a, b)
9732 // ov ? 0xffffffff : tmp
9733 // usub.sat(a, b) ->
9734 // {tmp, ov} = usubo(a, b)
9735 // ov ? 0 : tmp
9736 Clamp = MIRBuilder.buildConstant(Res: Ty, Val: IsAdd ? -1 : 0);
9737 }
9738 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: Clamp, Op1: Tmp);
9739
9740 MI.eraseFromParent();
9741 return Legalized;
9742}
9743
9744LegalizerHelper::LegalizeResult
9745LegalizerHelper::lowerShlSat(MachineInstr &MI) {
9746 assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
9747 MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
9748 "Expected shlsat opcode!");
9749 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
9750 auto [Res, LHS, RHS] = MI.getFirst3Regs();
9751 LLT Ty = MRI.getType(Reg: Res);
9752 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
9753
9754 unsigned BW = Ty.getScalarSizeInBits();
9755 auto Result = MIRBuilder.buildShl(Dst: Ty, Src0: LHS, Src1: RHS);
9756 auto Orig = IsSigned ? MIRBuilder.buildAShr(Dst: Ty, Src0: Result, Src1: RHS)
9757 : MIRBuilder.buildLShr(Dst: Ty, Src0: Result, Src1: RHS);
9758
9759 MachineInstrBuilder SatVal;
9760 if (IsSigned) {
9761 auto SatMin = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: BW));
9762 auto SatMax = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: BW));
9763 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: LHS,
9764 Op1: MIRBuilder.buildConstant(Res: Ty, Val: 0));
9765 SatVal = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: SatMin, Op1: SatMax);
9766 } else {
9767 SatVal = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getMaxValue(numBits: BW));
9768 }
9769 auto Ov = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: BoolTy, Op0: LHS, Op1: Orig);
9770 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: SatVal, Op1: Result);
9771
9772 MI.eraseFromParent();
9773 return Legalized;
9774}
9775
9776LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
9777 auto [Dst, Src] = MI.getFirst2Regs();
9778 const LLT Ty = MRI.getType(Reg: Src);
9779 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
9780 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
9781
9782 // Swap most and least significant byte, set remaining bytes in Res to zero.
9783 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt);
9784 auto LSByteShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9785 auto MSByteShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9786 auto Res = MIRBuilder.buildOr(Dst: Ty, Src0: MSByteShiftedRight, Src1: LSByteShiftedLeft);
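  // e.g. for an s32 value 0xAABBCCDD this first step yields 0xDD0000AA;
  // the loop below fills in the middle bytes to reach 0xDDCCBBAA.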
9787
9788 // Set i-th high/low byte in Res to i-th low/high byte from Src.
9789 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
9790 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
9791 APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
9792 auto Mask = MIRBuilder.buildConstant(Res: Ty, Val: APMask);
9793 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt - 16 * i);
9794 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
9795 auto LoByte = MIRBuilder.buildAnd(Dst: Ty, Src0: Src, Src1: Mask);
9796 auto LoShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: LoByte, Src1: ShiftAmt);
9797 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: LoShiftedLeft);
9798 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
9799 auto SrcShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9800 auto HiShiftedRight = MIRBuilder.buildAnd(Dst: Ty, Src0: SrcShiftedRight, Src1: Mask);
9801 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: HiShiftedRight);
9802 }
9803 Res.getInstr()->getOperand(i: 0).setReg(Dst);
9804
9805 MI.eraseFromParent();
9806 return Legalized;
9807}
9808
9809//{ (Src & Mask) >> N } | { (Src << N) & Mask }
9810static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
9811 MachineInstrBuilder Src, const APInt &Mask) {
9812 const LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
9813 MachineInstrBuilder C_N = B.buildConstant(Res: Ty, Val: N);
9814 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Res: Ty, Val: Mask);
9815 auto LHS = B.buildLShr(Dst: Ty, Src0: B.buildAnd(Dst: Ty, Src0: Src, Src1: MaskLoNTo0), Src1: C_N);
9816 auto RHS = B.buildAnd(Dst: Ty, Src0: B.buildShl(Dst: Ty, Src0: Src, Src1: C_N), Src1: MaskLoNTo0);
9817 return B.buildOr(Dst, Src0: LHS, Src1: RHS);
9818}
9819
9820LegalizerHelper::LegalizeResult
9821LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
9822 auto [Dst, Src] = MI.getFirst2Regs();
9823 const LLT SrcTy = MRI.getType(Reg: Src);
9824 unsigned Size = SrcTy.getScalarSizeInBits();
9825 unsigned VSize = SrcTy.getSizeInBits();
9826
9827 if (Size >= 8) {
9828 if (SrcTy.isVector() && (VSize % 8 == 0) &&
9829 (LI.isLegal(Query: {TargetOpcode::G_BITREVERSE,
9830 {LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8),
9831 LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8)}}))) {
9832 // If bitreverse is legal for i8 vector of the same size, then cast
9833 // to i8 vector type.
9834 // e.g. v4s32 -> v16s8
9835 LLT VTy = LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8);
9836 auto BSWAP = MIRBuilder.buildBSwap(Dst: SrcTy, Src0: Src);
9837 auto Cast = MIRBuilder.buildBitcast(Dst: VTy, Src: BSWAP);
9838 auto RBIT = MIRBuilder.buildBitReverse(Dst: VTy, Src: Cast);
9839 MIRBuilder.buildBitcast(Dst, Src: RBIT);
9840 } else {
9841 MachineInstrBuilder BSWAP =
9842 MIRBuilder.buildInstr(Opc: TargetOpcode::G_BSWAP, DstOps: {SrcTy}, SrcOps: {Src});
9843
9844 // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
9845 // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
9846 // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
9847 MachineInstrBuilder Swap4 = SwapN(N: 4, Dst: SrcTy, B&: MIRBuilder, Src: BSWAP,
9848 Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xF0)));
9849
9850 // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
9851       // [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
9852       // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
9853 MachineInstrBuilder Swap2 = SwapN(N: 2, Dst: SrcTy, B&: MIRBuilder, Src: Swap4,
9854 Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xCC)));
9855
9856 // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
9857 // 6|7
9858       // [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
9859       // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
9860 SwapN(N: 1, Dst, B&: MIRBuilder, Src: Swap2, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xAA)));
9861 }
9862 } else {
9863 // Expand bitreverse for types smaller than 8 bits.
9864 MachineInstrBuilder Tmp;
9865 for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
9866 MachineInstrBuilder Tmp2;
9867 if (I < J) {
9868 auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: J - I);
9869 Tmp2 = MIRBuilder.buildShl(Dst: SrcTy, Src0: Src, Src1: ShAmt);
9870 } else {
9871 auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: I - J);
9872 Tmp2 = MIRBuilder.buildLShr(Dst: SrcTy, Src0: Src, Src1: ShAmt);
9873 }
9874
9875 auto Mask = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << J);
9876 Tmp2 = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Tmp2, Src1: Mask);
9877 if (I == 0)
9878 Tmp = Tmp2;
9879 else
9880 Tmp = MIRBuilder.buildOr(Dst: SrcTy, Src0: Tmp, Src1: Tmp2);
9881 }
9882 MIRBuilder.buildCopy(Res: Dst, Op: Tmp);
9883 }
9884
9885 MI.eraseFromParent();
9886 return Legalized;
9887}
9888
9889LegalizerHelper::LegalizeResult
9890LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
9891 MachineFunction &MF = MIRBuilder.getMF();
9892
9893 bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
9894 int NameOpIdx = IsRead ? 1 : 0;
9895 int ValRegIndex = IsRead ? 0 : 1;
9896
9897 Register ValReg = MI.getOperand(i: ValRegIndex).getReg();
9898 const LLT Ty = MRI.getType(Reg: ValReg);
9899 const MDString *RegStr = cast<MDString>(
9900 Val: cast<MDNode>(Val: MI.getOperand(i: NameOpIdx).getMetadata())->getOperand(I: 0));
9901
9902 Register PhysReg = TLI.getRegisterByName(RegName: RegStr->getString().data(), Ty, MF);
9903 if (!PhysReg) {
9904 const Function &Fn = MF.getFunction();
9905 Fn.getContext().diagnose(DI: DiagnosticInfoGenericWithLoc(
9906 "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
9907 (IsRead ? "llvm.read_register" : "llvm.write_register"),
9908 Fn, MI.getDebugLoc()));
9909 if (IsRead)
9910 MIRBuilder.buildUndef(Res: ValReg);
9911
9912 MI.eraseFromParent();
9913 return Legalized;
9914 }
9915
9916 if (IsRead)
9917 MIRBuilder.buildCopy(Res: ValReg, Op: PhysReg);
9918 else
9919 MIRBuilder.buildCopy(Res: PhysReg, Op: ValReg);
9920
9921 MI.eraseFromParent();
9922 return Legalized;
9923}
9924
9925LegalizerHelper::LegalizeResult
9926LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
9927 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
9928 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
9929 Register Result = MI.getOperand(i: 0).getReg();
9930 LLT OrigTy = MRI.getType(Reg: Result);
9931 auto SizeInBits = OrigTy.getScalarSizeInBits();
9932 LLT WideTy = OrigTy.changeElementSize(NewEltSize: SizeInBits * 2);
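  // e.g. G_UMULH on s32: zero-extend both operands to s64, multiply, shift
  // the product right by 32, and truncate the high half back to s32.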
9933
9934 auto LHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 1)});
9935 auto RHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
9936 auto Mul = MIRBuilder.buildMul(Dst: WideTy, Src0: LHS, Src1: RHS);
9937 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
9938
9939 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: SizeInBits);
9940 auto Shifted = MIRBuilder.buildInstr(Opc: ShiftOp, DstOps: {WideTy}, SrcOps: {Mul, ShiftAmt});
9941 MIRBuilder.buildTrunc(Res: Result, Op: Shifted);
9942
9943 MI.eraseFromParent();
9944 return Legalized;
9945}
9946
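// Lower G_IS_FPCLASS by reinterpreting the value as an integer and testing
// the IEEE-754 encoding directly. As a sketch for f32 (bit patterns shown
// for illustration): abs = x & 0x7FFFFFFF, and then e.g.
//   isinf(x)  -> abs == 0x7F800000
//   isnan(x)  -> abs u>  0x7F800000
//   iszero(x) -> abs == 0
// Masks covering several classes, such as fcFinite, are handled first so
// they can be answered with a single compare.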
9947LegalizerHelper::LegalizeResult
9948LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
9949 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
9950 FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(i: 2).getImm());
9951
9952 if (Mask == fcNone) {
9953 MIRBuilder.buildConstant(Res: DstReg, Val: 0);
9954 MI.eraseFromParent();
9955 return Legalized;
9956 }
9957 if (Mask == fcAllFlags) {
9958 MIRBuilder.buildConstant(Res: DstReg, Val: 1);
9959 MI.eraseFromParent();
9960 return Legalized;
9961 }
9962
9963 // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
9964 // version
9965
9966 unsigned BitSize = SrcTy.getScalarSizeInBits();
9967 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
9968
9969 LLT IntTy = SrcTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: BitSize));
9970 auto AsInt = MIRBuilder.buildCopy(Res: IntTy, Op: SrcReg);
9971
9972 // Various masks.
9973 APInt SignBit = APInt::getSignMask(BitWidth: BitSize);
9974 APInt ValueMask = APInt::getSignedMaxValue(numBits: BitSize); // All bits but sign.
9975 APInt Inf = APFloat::getInf(Sem: Semantics).bitcastToAPInt(); // Exp and int bit.
9976 APInt ExpMask = Inf;
9977 APInt AllOneMantissa = APFloat::getLargest(Sem: Semantics).bitcastToAPInt() & ~Inf;
9978 APInt QNaNBitMask =
9979 APInt::getOneBitSet(numBits: BitSize, BitNo: AllOneMantissa.getActiveBits() - 1);
9980 APInt InversionMask = APInt::getAllOnes(numBits: DstTy.getScalarSizeInBits());
9981
9982 auto SignBitC = MIRBuilder.buildConstant(Res: IntTy, Val: SignBit);
9983 auto ValueMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ValueMask);
9984 auto InfC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf);
9985 auto ExpMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ExpMask);
9986 auto ZeroC = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
9987
9988 auto Abs = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ValueMaskC);
9989 auto Sign =
9990 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: DstTy, Op0: AsInt, Op1: Abs);
9991
9992 auto Res = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
9993 // Clang doesn't support capture of structured bindings:
9994 LLT DstTyCopy = DstTy;
9995 const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
9996 Res = MIRBuilder.buildOr(Dst: DstTyCopy, Src0: Res, Src1: ToAppend);
9997 };
9998
9999 // Tests that involve more than one class should be processed first.
10000 if ((Mask & fcFinite) == fcFinite) {
10001 // finite(V) ==> abs(V) u< exp_mask
10002 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
10003 Op1: ExpMaskC));
10004 Mask &= ~fcFinite;
10005 } else if ((Mask & fcFinite) == fcPosFinite) {
10006 // finite(V) && V > 0 ==> V u< exp_mask
10007 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: AsInt,
10008 Op1: ExpMaskC));
10009 Mask &= ~fcPosFinite;
10010 } else if ((Mask & fcFinite) == fcNegFinite) {
10011 // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
10012 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
10013 Op1: ExpMaskC);
10014 auto And = MIRBuilder.buildAnd(Dst: DstTy, Src0: Cmp, Src1: Sign);
10015 appendToRes(And);
10016 Mask &= ~fcNegFinite;
10017 }
10018
10019 if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
10020 // fcZero | fcSubnormal => test all exponent bits are 0
10021 // TODO: Handle sign bit specific cases
10022 // TODO: Handle inverted case
10023 if (PartialCheck == (fcZero | fcSubnormal)) {
10024 auto ExpBits = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ExpMaskC);
10025 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
10026 Op0: ExpBits, Op1: ZeroC));
10027 Mask &= ~PartialCheck;
10028 }
10029 }
10030
10031 // Check for individual classes.
10032 if (FPClassTest PartialCheck = Mask & fcZero) {
10033 if (PartialCheck == fcPosZero)
10034 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
10035 Op0: AsInt, Op1: ZeroC));
10036 else if (PartialCheck == fcZero)
10037 appendToRes(
10038 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: ZeroC));
10039 else // fcNegZero
10040 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
10041 Op0: AsInt, Op1: SignBitC));
10042 }
10043
10044 if (FPClassTest PartialCheck = Mask & fcSubnormal) {
10045 // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
10046 // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
10047 auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
10048 auto OneC = MIRBuilder.buildConstant(Res: IntTy, Val: 1);
10049 auto VMinusOne = MIRBuilder.buildSub(Dst: IntTy, Src0: V, Src1: OneC);
10050 auto SubnormalRes =
10051 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: VMinusOne,
10052 Op1: MIRBuilder.buildConstant(Res: IntTy, Val: AllOneMantissa));
10053 if (PartialCheck == fcNegSubnormal)
10054 SubnormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: SubnormalRes, Src1: Sign);
10055 appendToRes(SubnormalRes);
10056 }
10057
10058 if (FPClassTest PartialCheck = Mask & fcInf) {
10059 if (PartialCheck == fcPosInf)
10060 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
10061 Op0: AsInt, Op1: InfC));
10062 else if (PartialCheck == fcInf)
10063 appendToRes(
10064 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: InfC));
10065 else { // fcNegInf
10066 APInt NegInf = APFloat::getInf(Sem: Semantics, Negative: true).bitcastToAPInt();
10067 auto NegInfC = MIRBuilder.buildConstant(Res: IntTy, Val: NegInf);
10068 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
10069 Op0: AsInt, Op1: NegInfC));
10070 }
10071 }
10072
10073 if (FPClassTest PartialCheck = Mask & fcNan) {
10074 auto InfWithQnanBitC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf | QNaNBitMask);
10075 if (PartialCheck == fcNan) {
10076 // isnan(V) ==> abs(V) u> int(inf)
10077 appendToRes(
10078 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC));
10079 } else if (PartialCheck == fcQNan) {
10080 // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
10081 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGE, Res: DstTy, Op0: Abs,
10082 Op1: InfWithQnanBitC));
10083 } else { // fcSNan
10084 // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
10085 // abs(V) u< (unsigned(Inf) | quiet_bit)
10086 auto IsNan =
10087 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC);
10088 auto IsNotQnan = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy,
10089 Op0: Abs, Op1: InfWithQnanBitC);
10090 appendToRes(MIRBuilder.buildAnd(Dst: DstTy, Src0: IsNan, Src1: IsNotQnan));
10091 }
10092 }
10093
10094 if (FPClassTest PartialCheck = Mask & fcNormal) {
10095 // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
10096 // (max_exp-1))
10097 APInt ExpLSB = ExpMask & ~(ExpMask.shl(shiftAmt: 1));
10098 auto ExpMinusOne = MIRBuilder.buildSub(
10099 Dst: IntTy, Src0: Abs, Src1: MIRBuilder.buildConstant(Res: IntTy, Val: ExpLSB));
10100 APInt MaxExpMinusOne = ExpMask - ExpLSB;
10101 auto NormalRes =
10102 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: ExpMinusOne,
10103 Op1: MIRBuilder.buildConstant(Res: IntTy, Val: MaxExpMinusOne));
10104 if (PartialCheck == fcNegNormal)
10105 NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: Sign);
10106 else if (PartialCheck == fcPosNormal) {
10107 auto PosSign = MIRBuilder.buildXor(
10108 Dst: DstTy, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: DstTy, Val: InversionMask));
10109 NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: PosSign);
10110 }
10111 appendToRes(NormalRes);
10112 }
10113
10114 MIRBuilder.buildCopy(Res: DstReg, Op: Res);
10115 MI.eraseFromParent();
10116 return Legalized;
10117}
10118
10119LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
10120 // Implement G_SELECT in terms of XOR, AND, OR.
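 // i.e. %res = (%mask & %op1) | (~%mask & %op2). A scalar condition is first
 // sign-extended (and splatted, for vector destinations) so that the mask is
 // all ones or all zeros per element.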
10121 auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
10122 MI.getFirst4RegLLTs();
10123
10124 bool IsEltPtr = DstTy.isPointerOrPointerVector();
10125 if (IsEltPtr) {
10126 LLT ScalarPtrTy = LLT::scalar(SizeInBits: DstTy.getScalarSizeInBits());
10127 LLT NewTy = DstTy.changeElementType(NewEltTy: ScalarPtrTy);
10128 Op1Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op1Reg).getReg(Idx: 0);
10129 Op2Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op2Reg).getReg(Idx: 0);
10130 DstTy = NewTy;
10131 }
10132
10133 if (MaskTy.isScalar()) {
10134 // Turn the scalar condition into a vector condition mask if needed.
10135
10136 Register MaskElt = MaskReg;
10137
10138 // The condition was potentially zero extended before, but we want a sign
10139 // extended boolean.
10140 if (MaskTy != LLT::scalar(SizeInBits: 1))
10141 MaskElt = MIRBuilder.buildSExtInReg(Res: MaskTy, Op: MaskElt, ImmOp: 1).getReg(Idx: 0);
10142
10143 // Continue the sign extension (or truncate) to match the data type.
10144 MaskElt =
10145 MIRBuilder.buildSExtOrTrunc(Res: DstTy.getScalarType(), Op: MaskElt).getReg(Idx: 0);
10146
10147 if (DstTy.isVector()) {
10148 // Generate a vector splat idiom.
10149 auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: DstTy, Src: MaskElt);
10150 MaskReg = ShufSplat.getReg(Idx: 0);
10151 } else {
10152 MaskReg = MaskElt;
10153 }
10154 MaskTy = DstTy;
10155 } else if (!DstTy.isVector()) {
10156 // Cannot handle the case where the mask is a vector and dst is a scalar.
10157 return UnableToLegalize;
10158 }
10159
10160 if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
10161 return UnableToLegalize;
10162 }
10163
10164 auto NotMask = MIRBuilder.buildNot(Dst: MaskTy, Src0: MaskReg);
10165 auto NewOp1 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op1Reg, Src1: MaskReg);
10166 auto NewOp2 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op2Reg, Src1: NotMask);
10167 if (IsEltPtr) {
10168 auto Or = MIRBuilder.buildOr(Dst: DstTy, Src0: NewOp1, Src1: NewOp2);
10169 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
10170 } else {
10171 MIRBuilder.buildOr(Dst: DstReg, Src0: NewOp1, Src1: NewOp2);
10172 }
10173 MI.eraseFromParent();
10174 return Legalized;
10175}
10176
10177LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
10178 // Split DIVREM into individual instructions.
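 // e.g. %q:_(s32), %r:_(s32) = G_SDIVREM %a, %b becomes:
 //   %q:_(s32) = G_SDIV %a, %b
 //   %r:_(s32) = G_SREM %a, %b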
10179 unsigned Opcode = MI.getOpcode();
10180
10181 MIRBuilder.buildInstr(
10182 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
10183 : TargetOpcode::G_UDIV,
10184 DstOps: {MI.getOperand(i: 0).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
10185 MIRBuilder.buildInstr(
10186 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
10187 : TargetOpcode::G_UREM,
10188 DstOps: {MI.getOperand(i: 1).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
10189 MI.eraseFromParent();
10190 return Legalized;
10191}
10192
10193LegalizerHelper::LegalizeResult
10194LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
10195 // Expand %res = G_ABS %a into:
10196 // %v1 = G_ASHR %a, scalar_size-1
10197 // %v2 = G_ADD %a, %v1
10198 // %res = G_XOR %v2, %v1
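 // e.g. for %a = -5 (s32): %v1 = -5 >>s 31 = -1 (all ones), %v2 = -5 + -1 =
 // -6, and %res = -6 ^ -1 = 5. For non-negative %a, %v1 is 0 and %res is %a.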
10199 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
10200 Register OpReg = MI.getOperand(i: 1).getReg();
10201 auto ShiftAmt =
10202 MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - 1);
10203 auto Shift = MIRBuilder.buildAShr(Dst: DstTy, Src0: OpReg, Src1: ShiftAmt);
10204 auto Add = MIRBuilder.buildAdd(Dst: DstTy, Src0: OpReg, Src1: Shift);
10205 MIRBuilder.buildXor(Dst: MI.getOperand(i: 0).getReg(), Src0: Add, Src1: Shift);
10206 MI.eraseFromParent();
10207 return Legalized;
10208}
10209
10210LegalizerHelper::LegalizeResult
10211LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
10212 // Expand %res = G_ABS %a into:
10213 // %v1 = G_CONSTANT 0
10214 // %v2 = G_SUB %v1, %a
10215 // %res = G_SMAX %a, %v2
10216 Register SrcReg = MI.getOperand(i: 1).getReg();
10217 LLT Ty = MRI.getType(Reg: SrcReg);
10218 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
10219 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg);
10220 MIRBuilder.buildSMax(Dst: MI.getOperand(i: 0), Src0: SrcReg, Src1: Sub);
10221 MI.eraseFromParent();
10222 return Legalized;
10223}
10224
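// Expand %res = G_ABS %a into a compare and select:
//   %zero = G_CONSTANT 0
//   %sub  = G_SUB %zero, %a
//   %cmp  = G_ICMP sgt %a, %zero
//   %res  = G_SELECT %cmp, %a, %sub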
10225LegalizerHelper::LegalizeResult
10226LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
10227 Register SrcReg = MI.getOperand(i: 1).getReg();
10228 Register DestReg = MI.getOperand(i: 0).getReg();
10229 LLT Ty = MRI.getType(Reg: SrcReg), IType = LLT::scalar(SizeInBits: 1);
10230 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
10231 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg).getReg(Idx: 0);
10232 auto ICmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: IType, Op0: SrcReg, Op1: Zero);
10233 MIRBuilder.buildSelect(Res: DestReg, Tst: ICmp, Op0: SrcReg, Op1: Sub);
10234 MI.eraseFromParent();
10235 return Legalized;
10236}
10237
10238LegalizerHelper::LegalizeResult
10239LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) {
10240 assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
10241 MI.getOpcode() == TargetOpcode::G_ABDU) &&
10242 "Expected G_ABDS or G_ABDU instruction");
10243
10244 auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
10245 LLT Ty = MRI.getType(Reg: LHS);
10246
10247 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
10248 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
10249 Register LHSSub = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10250 Register RHSSub = MIRBuilder.buildSub(Dst: Ty, Src0: RHS, Src1: LHS).getReg(Idx: 0);
10251 CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS)
10252 ? CmpInst::ICMP_SGT
10253 : CmpInst::ICMP_UGT;
10254 auto ICmp = MIRBuilder.buildICmp(Pred, Res: LLT::scalar(SizeInBits: 1), Op0: LHS, Op1: RHS);
10255 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LHSSub, Op1: RHSSub);
10256
10257 MI.eraseFromParent();
10258 return Legalized;
10259}
10260
10261LegalizerHelper::LegalizeResult
10262LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) {
10263 assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
10264 MI.getOpcode() == TargetOpcode::G_ABDU) &&
10265 "Expected G_ABDS or G_ABDU instruction");
10266
10267 auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
10268 LLT Ty = MRI.getType(Reg: LHS);
10269
10270 // abds(lhs, rhs) -> sub(smax(lhs, rhs), smin(lhs, rhs))
10271 // abdu(lhs, rhs) -> sub(umax(lhs, rhs), umin(lhs, rhs))
10272 Register MaxReg, MinReg;
10273 if (MI.getOpcode() == TargetOpcode::G_ABDS) {
10274 MaxReg = MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10275 MinReg = MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10276 } else {
10277 MaxReg = MIRBuilder.buildUMax(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10278 MinReg = MIRBuilder.buildUMin(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10279 }
10280 MIRBuilder.buildSub(Dst: DstReg, Src0: MaxReg, Src1: MinReg);
10281
10282 MI.eraseFromParent();
10283 return Legalized;
10284}
10285
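// Lower G_FABS by clearing the sign bit with an integer-style AND; e.g. for
// f32 the mask is 0x7FFFFFFF (APInt::getSignedMaxValue of the scalar width).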
10286LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
10287 Register SrcReg = MI.getOperand(i: 1).getReg();
10288 Register DstReg = MI.getOperand(i: 0).getReg();
10289
10290 LLT Ty = MRI.getType(Reg: DstReg);
10291
10292 // Reset sign bit
10293 MIRBuilder.buildAnd(
10294 Dst: DstReg, Src0: SrcReg,
10295 Src1: MIRBuilder.buildConstant(
10296 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getScalarSizeInBits())));
10297
10298 MI.eraseFromParent();
10299 return Legalized;
10300}
10301
10302LegalizerHelper::LegalizeResult
10303LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
10304 Register SrcReg = MI.getOperand(i: 1).getReg();
10305 LLT SrcTy = MRI.getType(Reg: SrcReg);
10306 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
10307
10308 // The source could be a scalar if the IR type was <1 x sN>.
10309 if (SrcTy.isScalar()) {
10310 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
10311 return UnableToLegalize; // FIXME: handle extension.
10312 // This can be just a plain copy.
10313 Observer.changingInstr(MI);
10314 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::COPY));
10315 Observer.changedInstr(MI);
10316 return Legalized;
10317 }
10318 return UnableToLegalize;
10319}
10320
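// Lower G_VAARG for a simple pointer-bump va_list. A sketch of the emitted
// sequence (the alignment step only when the vaarg alignment exceeds the
// minimum stack argument alignment):
//   %cur:_(p0)  = G_LOAD %listptr        ; address of the current argument
//   %cur:_(p0)  = align %cur up to the requested alignment, if needed
//   %next:_(p0) = G_PTR_ADD %cur, <alloc size of the argument type>
//   G_STORE %next, %listptr              ; bump the list head
//   %dst        = G_LOAD %cur            ; fetch the argument itself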
10321LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
10322 MachineFunction &MF = *MI.getMF();
10323 const DataLayout &DL = MIRBuilder.getDataLayout();
10324 LLVMContext &Ctx = MF.getFunction().getContext();
10325 Register ListPtr = MI.getOperand(i: 1).getReg();
10326 LLT PtrTy = MRI.getType(Reg: ListPtr);
10327
10328 // ListPtr is a pointer to the head of the list. Load the address
10329 // of the current head of the list from it.
10330 Align PtrAlignment = DL.getABITypeAlign(Ty: getTypeForLLT(Ty: PtrTy, C&: Ctx));
10331 MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
10332 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: PtrTy, base_alignment: PtrAlignment);
10333 auto VAList = MIRBuilder.buildLoad(Res: PtrTy, Addr: ListPtr, MMO&: *PtrLoadMMO).getReg(Idx: 0);
10334
10335 const Align A(MI.getOperand(i: 2).getImm());
10336 LLT PtrTyAsScalarTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
10337 if (A > TLI.getMinStackArgumentAlignment()) {
10338 Register AlignAmt =
10339 MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: A.value() - 1).getReg(Idx: 0);
10340 auto AddDst = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: AlignAmt);
10341 auto AndDst = MIRBuilder.buildMaskLowPtrBits(Res: PtrTy, Op0: AddDst, NumBits: Log2(A));
10342 VAList = AndDst.getReg(Idx: 0);
10343 }
10344
10345 // Increment the pointer, VAList, to the next vaarg.
10346 // The list is bumped by the allocation size of the type of the element at
10347 // the current head of the list.
10348 Register Dst = MI.getOperand(i: 0).getReg();
10349 LLT LLTTy = MRI.getType(Reg: Dst);
10350 Type *Ty = getTypeForLLT(Ty: LLTTy, C&: Ctx);
10351 auto IncAmt =
10352 MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: DL.getTypeAllocSize(Ty));
10353 auto Succ = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: IncAmt);
10354
10355 // Store the incremented VAList back to the legalized pointer
10356 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10357 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOStore, MemTy: PtrTy, base_alignment: PtrAlignment);
10358 MIRBuilder.buildStore(Val: Succ, Addr: ListPtr, MMO&: *StoreMMO);
10359 // Load the actual argument out of the pointer VAList
10360 Align EltAlignment = DL.getABITypeAlign(Ty);
10361 MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
10362 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: LLTTy, base_alignment: EltAlignment);
10363 MIRBuilder.buildLoad(Res: Dst, Addr: VAList, MMO&: *EltLoadMMO);
10364
10365 MI.eraseFromParent();
10366 return Legalized;
10367}
10368
10369static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
10370 // On Darwin, -Os means optimize for size without hurting performance, so
10371 // only really optimize for size when -Oz (MinSize) is used.
10372 if (MF.getTarget().getTargetTriple().isOSDarwin())
10373 return MF.getFunction().hasMinSize();
10374 return MF.getFunction().hasOptSize();
10375}
10376
10377// Returns a list of types to use for memory op lowering in MemOps. A partial
10378// port of findOptimalMemOpLowering in TargetLowering.
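// As an illustrative sketch (assuming the target settles on s32): a 7-byte
// operation yields MemOps = {s32, s16, s8}, or {s32, s32} (the second access
// overlapping the first by one byte) when fast unaligned, overlapping
// accesses are permitted.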
10379static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
10380 unsigned Limit, const MemOp &Op,
10381 unsigned DstAS, unsigned SrcAS,
10382 const AttributeList &FuncAttributes,
10383 const TargetLowering &TLI) {
10384 if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
10385 return false;
10386
10387 LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
10388
10389 if (Ty == LLT()) {
10390 // Use the largest scalar type whose alignment constraints are satisfied.
10391 // We only need to check DstAlign here as SrcAlign is always greater or
10392 // equal to DstAlign (or zero).
10393 Ty = LLT::scalar(SizeInBits: 64);
10394 if (Op.isFixedDstAlign())
10395 while (Op.getDstAlign() < Ty.getSizeInBytes() &&
10396 !TLI.allowsMisalignedMemoryAccesses(Ty, AddrSpace: DstAS, Alignment: Op.getDstAlign()))
10397 Ty = LLT::scalar(SizeInBits: Ty.getSizeInBytes());
10398 assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
10399 // FIXME: check for the largest legal type we can load/store to.
10400 }
10401
10402 unsigned NumMemOps = 0;
10403 uint64_t Size = Op.size();
10404 while (Size) {
10405 unsigned TySize = Ty.getSizeInBytes();
10406 while (TySize > Size) {
10407 // For now, only use non-vector loads / stores for the left-over pieces.
10408 LLT NewTy = Ty;
10409 // FIXME: check for mem op safety and legality of the types. Not all of
10410 // SDAGisms map cleanly to GISel concepts.
10411 if (NewTy.isVector())
10412 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
10413 NewTy = LLT::scalar(SizeInBits: llvm::bit_floor(Value: NewTy.getSizeInBits() - 1));
10414 unsigned NewTySize = NewTy.getSizeInBytes();
10415 assert(NewTySize > 0 && "Could not find appropriate type");
10416
10417 // If the new LLT cannot cover all of the remaining bits, then consider
10418 // issuing a (or a pair of) unaligned and overlapping load / store.
10419 unsigned Fast;
10420 // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
10421 MVT VT = getMVTForLLT(Ty);
10422 if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
10423 TLI.allowsMisalignedMemoryAccesses(
10424 VT, AddrSpace: DstAS, Alignment: Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
10425 Flags: MachineMemOperand::MONone, &Fast) &&
10426 Fast)
10427 TySize = Size;
10428 else {
10429 Ty = NewTy;
10430 TySize = NewTySize;
10431 }
10432 }
10433
10434 if (++NumMemOps > Limit)
10435 return false;
10436
10437 MemOps.push_back(x: Ty);
10438 Size -= TySize;
10439 }
10440
10441 return true;
10442}
10443
10444// Get a vectorized representation of the memset value operand, GISel edition.
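// e.g. a known byte 0xAB widened to s32 becomes the immediate 0xABABABAB; an
// unknown byte value is zero-extended and multiplied by the splat constant
// 0x01010101 to replicate it across every byte.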
10445static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
10446 MachineRegisterInfo &MRI = *MIB.getMRI();
10447 unsigned NumBits = Ty.getScalarSizeInBits();
10448 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
10449 if (!Ty.isVector() && ValVRegAndVal) {
10450 APInt Scalar = ValVRegAndVal->Value.trunc(width: 8);
10451 APInt SplatVal = APInt::getSplat(NewLen: NumBits, V: Scalar);
10452 return MIB.buildConstant(Res: Ty, Val: SplatVal).getReg(Idx: 0);
10453 }
10454
10455 // Extend the byte value to the larger type, and then multiply by a magic
10456 // value 0x010101... in order to replicate it across every byte.
10457 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
10458 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
10459 return MIB.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
10460 }
10461
10462 LLT ExtType = Ty.getScalarType();
10463 auto ZExt = MIB.buildZExtOrTrunc(Res: ExtType, Op: Val);
10464 if (NumBits > 8) {
10465 APInt Magic = APInt::getSplat(NewLen: NumBits, V: APInt(8, 0x01));
10466 auto MagicMI = MIB.buildConstant(Res: ExtType, Val: Magic);
10467 Val = MIB.buildMul(Dst: ExtType, Src0: ZExt, Src1: MagicMI).getReg(Idx: 0);
10468 }
10469
10470 // For vector types create a G_BUILD_VECTOR.
10471 if (Ty.isVector())
10472 Val = MIB.buildSplatBuildVector(Res: Ty, Src: Val).getReg(Idx: 0);
10473
10474 return Val;
10475}
10476
10477LegalizerHelper::LegalizeResult
10478LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
10479 uint64_t KnownLen, Align Alignment,
10480 bool IsVolatile) {
10481 auto &MF = *MI.getParent()->getParent();
10482 const auto &TLI = *MF.getSubtarget().getTargetLowering();
10483 auto &DL = MF.getDataLayout();
10484 LLVMContext &C = MF.getFunction().getContext();
10485
10486 assert(KnownLen != 0 && "Have a zero length memset!");
10487
10488 bool DstAlignCanChange = false;
10489 MachineFrameInfo &MFI = MF.getFrameInfo();
10490 bool OptSize = shouldLowerMemFuncForSize(MF);
10491
10492 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
10493 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
10494 DstAlignCanChange = true;
10495
10496 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
10497 std::vector<LLT> MemOps;
10498
10499 const auto &DstMMO = **MI.memoperands_begin();
10500 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
10501
10502 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
10503 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
10504
10505 if (!findGISelOptimalMemOpLowering(MemOps, Limit,
10506 Op: MemOp::Set(Size: KnownLen, DstAlignCanChange,
10507 DstAlign: Alignment,
10508 /*IsZeroMemset=*/IsZeroVal,
10509 /*IsVolatile=*/IsVolatile),
10510 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: ~0u,
10511 FuncAttributes: MF.getFunction().getAttributes(), TLI))
10512 return UnableToLegalize;
10513
10514 if (DstAlignCanChange) {
10515 // Get an estimate of the type from the LLT.
10516 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
10517 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
10518 if (NewAlign > Alignment) {
10519 Alignment = NewAlign;
10520 unsigned FI = FIDef->getOperand(i: 1).getIndex();
10521 // Give the stack frame object a larger alignment if needed.
10522 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
10523 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
10524 }
10525 }
10526
10527 MachineIRBuilder MIB(MI);
10528 // Find the largest store and generate the bit pattern for it.
10529 LLT LargestTy = MemOps[0];
10530 for (unsigned i = 1; i < MemOps.size(); i++)
10531 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
10532 LargestTy = MemOps[i];
10533
10534 // The memset stored value is always defined as an s8, so in order to make it
10535 // work with larger store types we need to repeat the bit pattern across the
10536 // wider type.
10537 Register MemSetValue = getMemsetValue(Val, Ty: LargestTy, MIB);
10538
10539 if (!MemSetValue)
10540 return UnableToLegalize;
10541
10542 // Generate the stores. For each store type in the list, we generate the
10543 // matching store of that type to the destination address.
10544 LLT PtrTy = MRI.getType(Reg: Dst);
10545 unsigned DstOff = 0;
10546 unsigned Size = KnownLen;
10547 for (unsigned I = 0; I < MemOps.size(); I++) {
10548 LLT Ty = MemOps[I];
10549 unsigned TySize = Ty.getSizeInBytes();
10550 if (TySize > Size) {
10551 // Issuing an unaligned load / store pair that overlaps with the previous
10552 // pair. Adjust the offset accordingly.
10553 assert(I == MemOps.size() - 1 && I != 0);
10554 DstOff -= TySize - Size;
10555 }
10556
10557 // If this store is smaller than the largest store, see whether we can get
10558 // the smaller value for free with a truncate.
10559 Register Value = MemSetValue;
10560 if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
10561 MVT VT = getMVTForLLT(Ty);
10562 MVT LargestVT = getMVTForLLT(Ty: LargestTy);
10563 if (!LargestTy.isVector() && !Ty.isVector() &&
10564 TLI.isTruncateFree(FromVT: LargestVT, ToVT: VT))
10565 Value = MIB.buildTrunc(Res: Ty, Op: MemSetValue).getReg(Idx: 0);
10566 else
10567 Value = getMemsetValue(Val, Ty, MIB);
10568 if (!Value)
10569 return UnableToLegalize;
10570 }
10571
10572 auto *StoreMMO = MF.getMachineMemOperand(MMO: &DstMMO, Offset: DstOff, Ty);
10573
10574 Register Ptr = Dst;
10575 if (DstOff != 0) {
10576 auto Offset =
10577 MIB.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: DstOff);
10578 Ptr = MIB.buildObjectPtrOffset(Res: PtrTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
10579 }
10580
10581 MIB.buildStore(Val: Value, Addr: Ptr, MMO&: *StoreMMO);
10582 DstOff += Ty.getSizeInBytes();
10583 Size -= TySize;
10584 }
10585
10586 MI.eraseFromParent();
10587 return Legalized;
10588}
10589
10590LegalizerHelper::LegalizeResult
10591LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
10592 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
10593
10594 auto [Dst, Src, Len] = MI.getFirst3Regs();
10595
10596 const auto *MMOIt = MI.memoperands_begin();
10597 const MachineMemOperand *MemOp = *MMOIt;
10598 bool IsVolatile = MemOp->isVolatile();
10599
10600 // See if this is a constant length copy
10601 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
10602 // FIXME: support dynamically sized G_MEMCPY_INLINE
10603 assert(LenVRegAndVal &&
10604 "inline memcpy with dynamic size is not yet supported");
10605 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
10606 if (KnownLen == 0) {
10607 MI.eraseFromParent();
10608 return Legalized;
10609 }
10610
10611 const auto &DstMMO = **MI.memoperands_begin();
10612 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
10613 Align DstAlign = DstMMO.getBaseAlign();
10614 Align SrcAlign = SrcMMO.getBaseAlign();
10615
10616 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
10617 IsVolatile);
10618}
10619
10620LegalizerHelper::LegalizeResult
10621LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
10622 uint64_t KnownLen, Align DstAlign,
10623 Align SrcAlign, bool IsVolatile) {
10624 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
10625 return lowerMemcpy(MI, Dst, Src, KnownLen,
10626 Limit: std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
10627 IsVolatile);
10628}
10629
10630LegalizerHelper::LegalizeResult
10631LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
10632 uint64_t KnownLen, uint64_t Limit, Align DstAlign,
10633 Align SrcAlign, bool IsVolatile) {
10634 auto &MF = *MI.getParent()->getParent();
10635 const auto &TLI = *MF.getSubtarget().getTargetLowering();
10636 auto &DL = MF.getDataLayout();
10637 LLVMContext &C = MF.getFunction().getContext();
10638
10639 assert(KnownLen != 0 && "Have a zero length memcpy!");
10640
10641 bool DstAlignCanChange = false;
10642 MachineFrameInfo &MFI = MF.getFrameInfo();
10643 Align Alignment = std::min(a: DstAlign, b: SrcAlign);
10644
10645 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
10646 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
10647 DstAlignCanChange = true;
10648
10649 // FIXME: infer better src pointer alignment like SelectionDAG does here.
10650 // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
10651 // if the memcpy is in a tail call position.
10652
10653 std::vector<LLT> MemOps;
10654
10655 const auto &DstMMO = **MI.memoperands_begin();
10656 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
10657 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
10658 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
10659
10660 if (!findGISelOptimalMemOpLowering(
10661 MemOps, Limit,
10662 Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
10663 IsVolatile),
10664 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
10665 FuncAttributes: MF.getFunction().getAttributes(), TLI))
10666 return UnableToLegalize;
10667
10668 if (DstAlignCanChange) {
10669 // Get an estimate of the type from the LLT.
10670 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
10671 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
10672
10673 // Don't promote to an alignment that would require dynamic stack
10674 // realignment.
10675 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
10676 if (!TRI->hasStackRealignment(MF))
10677 if (MaybeAlign StackAlign = DL.getStackAlignment())
10678 NewAlign = std::min(a: NewAlign, b: *StackAlign);
10679
10680 if (NewAlign > Alignment) {
10681 Alignment = NewAlign;
10682 unsigned FI = FIDef->getOperand(i: 1).getIndex();
10683 // Give the stack frame object a larger alignment if needed.
10684 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
10685 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
10686 }
10687 }
10688
10689 LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
10690
10691 MachineIRBuilder MIB(MI);
10692 // Now we need to emit a pair of load and stores for each of the types we've
10693 // collected. I.e. for each type, generate a load from the source pointer of
10694 // that type width, and then generate a corresponding store to the dest buffer
10695 // of that value loaded. This can result in a sequence of loads and stores of
10696 // mixed types, depending on what the target specifies as good types to use.
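 // e.g. with MemOps = {s64, s32} for a 12-byte copy (types here illustrative,
 // the real list is target-chosen): an s64 load/store at offset 0 followed by
 // an s32 load/store at offset 8.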
10697 unsigned CurrOffset = 0;
10698 unsigned Size = KnownLen;
10699 for (auto CopyTy : MemOps) {
10700 // Issuing an unaligned load / store pair that overlaps with the previous
10701 // pair. Adjust the offset accordingly.
10702 if (CopyTy.getSizeInBytes() > Size)
10703 CurrOffset -= CopyTy.getSizeInBytes() - Size;
10704
10705 // Construct MMOs for the accesses.
10706 auto *LoadMMO =
10707 MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
10708 auto *StoreMMO =
10709 MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
10710
10711 // Create the load.
10712 Register LoadPtr = Src;
10713 Register Offset;
10714 if (CurrOffset != 0) {
10715 LLT SrcTy = MRI.getType(Reg: Src);
10716 Offset = MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset)
10717 .getReg(Idx: 0);
10718 LoadPtr = MIB.buildObjectPtrOffset(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
10719 }
10720 auto LdVal = MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO);
10721
10722 // Create the store.
10723 Register StorePtr = Dst;
10724 if (CurrOffset != 0) {
10725 LLT DstTy = MRI.getType(Reg: Dst);
10726 StorePtr = MIB.buildObjectPtrOffset(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
10727 }
10728 MIB.buildStore(Val: LdVal, Addr: StorePtr, MMO&: *StoreMMO);
10729 CurrOffset += CopyTy.getSizeInBytes();
10730 Size -= CopyTy.getSizeInBytes();
10731 }
10732
10733 MI.eraseFromParent();
10734 return Legalized;
10735}
10736
10737LegalizerHelper::LegalizeResult
10738LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
10739 uint64_t KnownLen, Align DstAlign, Align SrcAlign,
10740 bool IsVolatile) {
10741 auto &MF = *MI.getParent()->getParent();
10742 const auto &TLI = *MF.getSubtarget().getTargetLowering();
10743 auto &DL = MF.getDataLayout();
10744 LLVMContext &C = MF.getFunction().getContext();
10745
10746 assert(KnownLen != 0 && "Have a zero length memmove!");
10747
10748 bool DstAlignCanChange = false;
10749 MachineFrameInfo &MFI = MF.getFrameInfo();
10750 bool OptSize = shouldLowerMemFuncForSize(MF);
10751 Align Alignment = std::min(a: DstAlign, b: SrcAlign);
10752
10753 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
10754 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
10755 DstAlignCanChange = true;
10756
10757 unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
10758 std::vector<LLT> MemOps;
10759
10760 const auto &DstMMO = **MI.memoperands_begin();
10761 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
10762 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
10763 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
10764
10765 // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
10766 // to a bug in its findOptimalMemOpLowering implementation. For now do the
10767 // same thing here.
10768 if (!findGISelOptimalMemOpLowering(
10769 MemOps, Limit,
10770 Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
10771 /*IsVolatile*/ true),
10772 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
10773 FuncAttributes: MF.getFunction().getAttributes(), TLI))
10774 return UnableToLegalize;
10775
10776 if (DstAlignCanChange) {
10777 // Get an estimate of the type from the LLT.
10778 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
10779 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
10780
10781 // Don't promote to an alignment that would require dynamic stack
10782 // realignment.
10783 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
10784 if (!TRI->hasStackRealignment(MF))
10785 if (MaybeAlign StackAlign = DL.getStackAlignment())
10786 NewAlign = std::min(a: NewAlign, b: *StackAlign);
10787
10788 if (NewAlign > Alignment) {
10789 Alignment = NewAlign;
10790 unsigned FI = FIDef->getOperand(i: 1).getIndex();
10791 // Give the stack frame object a larger alignment if needed.
10792 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
10793 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
10794 }
10795 }
10796
10797 LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
10798
10799 MachineIRBuilder MIB(MI);
10800 // Memmove requires that we perform the loads first before issuing the stores.
10801 // Apart from that, this loop is pretty much doing the same thing as the
10802 // memcpy codegen function.
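 // Interleaving the loads and stores as memcpy does could clobber source
 // bytes that have not been read yet when the two ranges overlap.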
10803 unsigned CurrOffset = 0;
10804 SmallVector<Register, 16> LoadVals;
10805 for (auto CopyTy : MemOps) {
10806 // Construct MMO for the load.
10807 auto *LoadMMO =
10808 MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
10809
10810 // Create the load.
10811 Register LoadPtr = Src;
10812 if (CurrOffset != 0) {
10813 LLT SrcTy = MRI.getType(Reg: Src);
10814 auto Offset =
10815 MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset);
10816 LoadPtr = MIB.buildObjectPtrOffset(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
10817 }
10818 LoadVals.push_back(Elt: MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO).getReg(Idx: 0));
10819 CurrOffset += CopyTy.getSizeInBytes();
10820 }
10821
10822 CurrOffset = 0;
10823 for (unsigned I = 0; I < MemOps.size(); ++I) {
10824 LLT CopyTy = MemOps[I];
10825 // Now store the values loaded.
10826 auto *StoreMMO =
10827 MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
10828
10829 Register StorePtr = Dst;
10830 if (CurrOffset != 0) {
10831 LLT DstTy = MRI.getType(Reg: Dst);
10832 auto Offset =
10833 MIB.buildConstant(Res: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), Val: CurrOffset);
10834 StorePtr = MIB.buildObjectPtrOffset(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
10835 }
10836 MIB.buildStore(Val: LoadVals[I], Addr: StorePtr, MMO&: *StoreMMO);
10837 CurrOffset += CopyTy.getSizeInBytes();
10838 }
10839 MI.eraseFromParent();
10840 return Legalized;
10841}
10842
10843LegalizerHelper::LegalizeResult
10844LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
10845 const unsigned Opc = MI.getOpcode();
10846 // This lowering is fairly complex so it's not written with a separate
10847 // matcher function.
10848 assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
10849 Opc == TargetOpcode::G_MEMSET) &&
10850 "Expected memcpy like instruction");
10851
10852 auto MMOIt = MI.memoperands_begin();
10853 const MachineMemOperand *MemOp = *MMOIt;
10854
10855 Align DstAlign = MemOp->getBaseAlign();
10856 Align SrcAlign;
10857 auto [Dst, Src, Len] = MI.getFirst3Regs();
10858
10859 if (Opc != TargetOpcode::G_MEMSET) {
10860 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
10861 MemOp = *(++MMOIt);
10862 SrcAlign = MemOp->getBaseAlign();
10863 }
10864
10865 // See if this is a constant length copy
10866 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
10867 if (!LenVRegAndVal)
10868 return UnableToLegalize;
10869 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
10870
10871 if (KnownLen == 0) {
10872 MI.eraseFromParent();
10873 return Legalized;
10874 }
10875
10876 if (MaxLen && KnownLen > MaxLen)
10877 return UnableToLegalize;
10878
10879 bool IsVolatile = MemOp->isVolatile();
10880 if (Opc == TargetOpcode::G_MEMCPY) {
10881 auto &MF = *MI.getParent()->getParent();
10882 const auto &TLI = *MF.getSubtarget().getTargetLowering();
10883 bool OptSize = shouldLowerMemFuncForSize(MF);
10884 uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
10885 return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
10886 IsVolatile);
10887 }
10888 if (Opc == TargetOpcode::G_MEMMOVE)
10889 return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
10890 if (Opc == TargetOpcode::G_MEMSET)
10891 return lowerMemset(MI, Dst, Val: Src, KnownLen, Alignment: DstAlign, IsVolatile);
10892 return UnableToLegalize;
10893}
10894