1//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file implements the LegalizerHelper class to legalize
10/// individual instructions and the LegalizeMachineIR wrapper pass for the
11/// primary legalization.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16#include "llvm/CodeGen/GlobalISel/CallLowering.h"
17#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
19#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24#include "llvm/CodeGen/GlobalISel/Utils.h"
25#include "llvm/CodeGen/LowLevelTypeUtils.h"
26#include "llvm/CodeGen/MachineConstantPool.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/CodeGen/MachineRegisterInfo.h"
29#include "llvm/CodeGen/RuntimeLibcallUtil.h"
30#include "llvm/CodeGen/TargetFrameLowering.h"
31#include "llvm/CodeGen/TargetInstrInfo.h"
32#include "llvm/CodeGen/TargetLowering.h"
33#include "llvm/CodeGen/TargetOpcodes.h"
34#include "llvm/CodeGen/TargetSubtargetInfo.h"
35#include "llvm/IR/Instructions.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/MathExtras.h"
38#include "llvm/Support/raw_ostream.h"
39#include "llvm/Target/TargetMachine.h"
40#include <numeric>
41#include <optional>
42
43#define DEBUG_TYPE "legalizer"
44
45using namespace llvm;
46using namespace LegalizeActions;
47using namespace MIPatternMatch;
48
49/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
50///
51/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
52/// with any leftover piece as type \p LeftoverTy
53///
54/// Returns -1 in the first element of the pair if the breakdown is not
55/// satisfiable.
56static std::pair<int, int>
57getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
58 assert(!LeftoverTy.isValid() && "this is an out argument");
59
60 unsigned Size = OrigTy.getSizeInBits();
61 unsigned NarrowSize = NarrowTy.getSizeInBits();
62 unsigned NumParts = Size / NarrowSize;
63 unsigned LeftoverSize = Size - NumParts * NarrowSize;
64 assert(Size > NarrowSize);
65
66 if (LeftoverSize == 0)
67 return {NumParts, 0};
68
69 if (NarrowTy.isVector()) {
70 unsigned EltSize = OrigTy.getScalarSizeInBits();
71 if (LeftoverSize % EltSize != 0)
72 return {-1, -1};
73 LeftoverTy = OrigTy.changeElementCount(
74 EC: ElementCount::getFixed(MinVal: LeftoverSize / EltSize));
75 } else {
76 LeftoverTy = LLT::scalar(SizeInBits: LeftoverSize);
77 }
78
79 int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
80 return std::make_pair(x&: NumParts, y&: NumLeftover);
81}
82
83static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
84
85 if (!Ty.isScalar())
86 return nullptr;
87
88 switch (Ty.getSizeInBits()) {
89 case 16:
90 return Type::getHalfTy(C&: Ctx);
91 case 32:
92 return Type::getFloatTy(C&: Ctx);
93 case 64:
94 return Type::getDoubleTy(C&: Ctx);
95 case 80:
96 return Type::getX86_FP80Ty(C&: Ctx);
97 case 128:
98 return Type::getFP128Ty(C&: Ctx);
99 default:
100 return nullptr;
101 }
102}
103
/// Construct a helper that pulls the LegalizerInfo and TargetLowering from
/// \p MF's subtarget. \p Libcalls may be null, in which case libcall-based
/// legalizations report UnableToLegalize.
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder,
                                 const LibcallLoweringInfo *Libcalls)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), Libcalls(Libcalls) {}
111
/// Construct a helper with an explicitly supplied LegalizerInfo \p LI and an
/// optional GISelValueTracking \p VT (may be null). \p Libcalls may be null,
/// in which case libcall-based legalizations report UnableToLegalize.
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B,
                                 const LibcallLoweringInfo *Libcalls,
                                 GISelValueTracking *VT)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), Libcalls(Libcalls), VT(VT) {}
119
/// Perform one legalization step on \p MI: ask the LegalizerInfo which action
/// applies and dispatch to the corresponding transformation.
///
/// \returns AlreadyLegal if nothing needed to change, Legalized on a
/// successful transformation, or UnableToLegalize otherwise.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics are handled by a dedicated target hook rather than the
  // action table.
  if (isa<GIntrinsic>(Val: MI))
    return LI.legalizeIntrinsic(Helper&: *this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, TypeIdx: Step.TypeIdx, NarrowTy: Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, TypeIdx: Step.TypeIdx, WideTy: Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, TypeIdx: Step.TypeIdx, Ty: Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, TypeIdx: Step.TypeIdx, Ty: Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, TypeIdx: Step.TypeIdx, NarrowTy: Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, TypeIdx: Step.TypeIdx, MoreTy: Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(Helper&: *this, MI, LocObserver) ? Legalized
                                                           : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
164
/// Reassemble \p DstReg (of type \p ResultTy) from \p PartRegs pieces of type
/// \p PartTy, plus optional \p LeftoverRegs of type \p LeftoverTy covering
/// the remainder when the parts do not evenly divide the result.
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    // Even breakdown: a single merge/concat/build-vector suffices.
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: PartRegs);
      return;
    }

    // Vector result: concat sub-vectors, or build from scalar elements.
    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(Res: DstReg, Ops: PartRegs);
    else
      MIRBuilder.buildBuildVector(Res: DstReg, Ops: PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs);
    AllRegs.append(in_start: LeftoverRegs.begin(), in_end: LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, PartRegs: AllRegs);
  }

  // Scalar result with a leftover: split everything down to the GCD type,
  // pad/merge up to the LCM type, then extract the destination from it.
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(OrigTy: getGCDType(OrigTy: ResultTy, TargetTy: LeftoverTy), TargetTy: PartTy);
  for (auto PartReg : concat<const Register>(Ranges&: PartRegs, Ranges&: LeftoverRegs))
    extractGCDType(Parts&: GCDRegs, GCDTy, SrcReg: PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(DstTy: ResultTy, NarrowTy: LeftoverTy, GCDTy, VRegs&: GCDRegs);
  buildWidenedRemergeToDst(DstReg, LCMTy: ResultLCMTy, RemergeRegs: GCDRegs);
}
200
201void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
202 Register Reg) {
203 LLT Ty = MRI.getType(Reg);
204 SmallVector<Register, 8> RegElts;
205 extractParts(Reg, Ty: Ty.getScalarType(), NumParts: Ty.getNumElements(), VRegs&: RegElts,
206 MIRBuilder, MRI);
207 Elts.append(RHS: RegElts);
208}
209
210/// Merge \p PartRegs with different types into \p DstReg.
211void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
212 ArrayRef<Register> PartRegs) {
213 SmallVector<Register, 8> AllElts;
214 for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
215 appendVectorElts(Elts&: AllElts, Reg: PartRegs[i]);
216
217 Register Leftover = PartRegs[PartRegs.size() - 1];
218 if (!MRI.getType(Reg: Leftover).isVector())
219 AllElts.push_back(Elt: Leftover);
220 else
221 appendVectorElts(Elts&: AllElts, Reg: Leftover);
222
223 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: AllElts);
224}
225
226/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
227static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
228 const MachineInstr &MI) {
229 assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
230
231 const int StartIdx = Regs.size();
232 const int NumResults = MI.getNumOperands() - 1;
233 Regs.resize(N: Regs.size() + NumResults);
234 for (int I = 0; I != NumResults; ++I)
235 Regs[StartIdx + I] = MI.getOperand(i: I).getReg();
236}
237
238void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
239 LLT GCDTy, Register SrcReg) {
240 LLT SrcTy = MRI.getType(Reg: SrcReg);
241 if (SrcTy == GCDTy) {
242 // If the source already evenly divides the result type, we don't need to do
243 // anything.
244 Parts.push_back(Elt: SrcReg);
245 } else {
246 // Need to split into common type sized pieces.
247 auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
248 getUnmergeResults(Regs&: Parts, MI: *Unmerge);
249 }
250}
251
252LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
253 LLT NarrowTy, Register SrcReg) {
254 LLT SrcTy = MRI.getType(Reg: SrcReg);
255 LLT GCDTy = getGCDType(OrigTy: getGCDType(OrigTy: SrcTy, TargetTy: NarrowTy), TargetTy: DstTy);
256 extractGCDType(Parts, GCDTy, SrcReg);
257 return GCDTy;
258}
259
/// Merge the \p GCDTy-typed pieces in \p VRegs up to \p NarrowTy-sized
/// registers covering the least common multiple type of \p DstTy and
/// \p NarrowTy, padding with \p PadStrategy (G_ANYEXT / G_ZEXT / G_SEXT)
/// when the sources do not cover the LCM type. On return \p VRegs holds the
/// NarrowTy pieces; the LCM type is returned.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(OrigTy: DstTy, TargetTy: NarrowTy);

  // Total NarrowTy pieces needed, and GCDTy pieces per NarrowTy piece.
  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(Res: GCDTy, Val: 0).getReg(Idx: 0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(Dst: GCDTy, Src0: VRegs.back(), Src1: ShiftAmt).getReg(Idx: 0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      // Past the original sources: substitute the padding value.
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0).getReg(Idx: 0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: SubMerge).getReg(Idx: 0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  // Hand the NarrowTy pieces back to the caller through the in/out vector.
  VRegs = std::move(Remerge);
  return LCMTy;
}
350
351void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
352 ArrayRef<Register> RemergeRegs) {
353 LLT DstTy = MRI.getType(Reg: DstReg);
354
355 // Create the merge to the widened source, and extract the relevant bits into
356 // the result.
357
358 if (DstTy == LCMTy) {
359 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: RemergeRegs);
360 return;
361 }
362
363 auto Remerge = MIRBuilder.buildMergeLikeInstr(Res: LCMTy, Ops: RemergeRegs);
364 if (DstTy.isScalar() && LCMTy.isScalar()) {
365 MIRBuilder.buildTrunc(Res: DstReg, Op: Remerge);
366 return;
367 }
368
369 if (LCMTy.isVector()) {
370 unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
371 SmallVector<Register, 8> UnmergeDefs(NumDefs);
372 UnmergeDefs[0] = DstReg;
373 for (unsigned I = 1; I != NumDefs; ++I)
374 UnmergeDefs[I] = MRI.createGenericVirtualRegister(Ty: DstTy);
375
376 MIRBuilder.buildUnmerge(Res: UnmergeDefs,
377 Op: MIRBuilder.buildMergeLikeInstr(Res: LCMTy, Ops: RemergeRegs));
378 return;
379 }
380
381 llvm_unreachable("unhandled case");
382}
383
/// Map a generic opcode plus a scalar bit width \p Size to the matching
/// runtime library call enumerator. Integer opcodes accept sizes 32/64/128;
/// floating-point opcodes additionally accept 80 (x86 fp80). Asserts on any
/// other size; unreachable for opcodes not listed below.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
// Expands to the size-dispatched return for integer libcalls (no 80-bit
// variant exists for these).
#define RTLIBCASE_INT(LibcallPrefix) \
  do { \
    switch (Size) { \
    case 32: \
      return RTLIB::LibcallPrefix##32; \
    case 64: \
      return RTLIB::LibcallPrefix##64; \
    case 128: \
      return RTLIB::LibcallPrefix##128; \
    default: \
      llvm_unreachable("unexpected size"); \
    } \
  } while (0)

// Same as RTLIBCASE_INT but also handles the 80-bit FP case.
#define RTLIBCASE(LibcallPrefix) \
  do { \
    switch (Size) { \
    case 32: \
      return RTLIB::LibcallPrefix##32; \
    case 64: \
      return RTLIB::LibcallPrefix##64; \
    case 80: \
      return RTLIB::LibcallPrefix##80; \
    case 128: \
      return RTLIB::LibcallPrefix##128; \
    default: \
      llvm_unreachable("unexpected size"); \
    } \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_LROUND:
    RTLIBCASE(LROUND_F);
  case TargetOpcode::G_LLROUND:
    RTLIBCASE(LLROUND_F);
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FSINCOS:
    RTLIBCASE(SINCOS_F);
  case TargetOpcode::G_FMODF:
    RTLIBCASE(MODF_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FMINIMUMNUM:
    RTLIBCASE(FMINIMUM_NUM_F);
  case TargetOpcode::G_FMAXIMUMNUM:
    RTLIBCASE(FMAXIMUM_NUM_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}
519
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
///
/// Requires that the caller's return attributes are compatible with the
/// libcall's (modulo NoAlias/NonNull) and that \p MI is immediately followed
/// by a return, optionally through a single COPY of the returned value.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Val: Attribute::NoAlias)
          .removeAttribute(Val: Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Kind: Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Kind: Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(It: MI.getIterator(), End: MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    // G_BZERO returns void, so a COPY of its "result" can't be folded.
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(i: 0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(i: 1).getReg())
      return false;

    Register PReg = Next->getOperand(i: 0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(It: Next, End: MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    // The return must use exactly the physreg the COPY defined.
    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(i: 0).isReg() || PReg != Ret->getOperand(i: 0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(Inst: *Next) || !Next->isReturn())
    return false;

  return true;
}
585
/// Emit a call to the external symbol \p Name with result \p Result and
/// arguments \p Args, using calling convention \p CC. If \p MI is non-null
/// and the call is in tail position, it is lowered as a tail call and the
/// following return (and any COPY feeding it) is deleted.
LegalizerHelper::LegalizeResult LegalizerHelper::createLibcall(
    const char *Name, const CallLowering::ArgInfo &Result,
    ArrayRef<CallLowering::ArgInfo> Args, const CallingConv::ID CC,
    LostDebugLocObserver &LocObserver, MachineInstr *MI) const {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(SymName: Name);
  Info.OrigRet = Result;
  // Only consider a tail call when the libcall's return type matches the
  // caller's (or is void) and MI sits in tail position.
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, MI&: *MI, TII: MIRBuilder.getTII(),
                                MRI&: *MIRBuilder.getMRI());

  llvm::append_range(C&: Info.OrigArgs, R&: Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(CheckDebugLocs: true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(CheckDebugLocs: false);
  }
  return LegalizerHelper::Legalized;
}
630
631LegalizerHelper::LegalizeResult LegalizerHelper::createLibcall(
632 RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result,
633 ArrayRef<CallLowering::ArgInfo> Args, LostDebugLocObserver &LocObserver,
634 MachineInstr *MI) const {
635 if (!Libcalls)
636 return LegalizerHelper::UnableToLegalize;
637
638 RTLIB::LibcallImpl LibcallImpl = Libcalls->getLibcallImpl(Call: Libcall);
639 if (LibcallImpl == RTLIB::Unsupported)
640 return LegalizerHelper::UnableToLegalize;
641
642 StringRef Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: LibcallImpl);
643 const CallingConv::ID CC = Libcalls->getLibcallImplCallingConv(Call: LibcallImpl);
644 return createLibcall(Name: Name.data(), Result, Args, CC, LocObserver, MI);
645}
646
647// Useful for libcalls where all operands have the same type.
648LegalizerHelper::LegalizeResult
649LegalizerHelper::simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
650 unsigned Size, Type *OpType,
651 LostDebugLocObserver &LocObserver) const {
652 auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
653
654 // FIXME: What does the original arg index mean here?
655 SmallVector<CallLowering::ArgInfo, 3> Args;
656 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands()))
657 Args.push_back(Elt: {MO.getReg(), OpType, 0});
658 return createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), OpType, 0}, Args,
659 LocObserver, MI: &MI);
660}
661
/// Lower G_FSINCOS to a sincos libcall: the call takes the source plus two
/// stack out-pointers; the sin/cos results are then loaded back from the
/// stack temporaries into the instruction's two defs.
LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
    MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType,
    LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = *MI.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstSin = MI.getOperand(i: 0).getReg();
  Register DstCos = MI.getOperand(i: 1).getReg();
  Register Src = MI.getOperand(i: 2).getReg();
  LLT DstTy = MRI.getType(Reg: DstSin);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(Type: DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  // Two stack slots: one for the sin result, one for the cos result.
  // NOTE(review): PtrInfo is overwritten by the second createStackTemporary
  // call, so both loads below carry the cos slot's pointer info — confirm
  // this is intended (it affects alias info, not correctness of the values).
  Register StackPtrSin =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: MemSize), Alignment, PtrInfo)
          .getReg(Idx: 0);
  Register StackPtrCos =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: MemSize), Alignment, PtrInfo)
          .getReg(Idx: 0);

  // The libcall returns void; results come back through the out-pointers.
  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult = createLibcall(
      Libcall: getRTLibDesc(Opcode: MI.getOpcode(), Size), Result: {{0}, Type::getVoidTy(C&: Ctx), 0},
      Args: {{Src, OpType, 0},
       {StackPtrSin, PointerType::get(C&: Ctx, AddressSpace: AddrSpace), 1},
       {StackPtrCos, PointerType::get(C&: Ctx, AddressSpace: AddrSpace), 2}},
      LocObserver, MI: &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: MemSize, BaseAlignment: Alignment);
  MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: MemSize, BaseAlignment: Alignment);

  // Read the results back and replace the original instruction.
  MIRBuilder.buildLoad(Res: DstSin, Addr: StackPtrSin, MMO&: *LoadMMOSin);
  MIRBuilder.buildLoad(Res: DstCos, Addr: StackPtrCos, MMO&: *LoadMMOCos);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}
708
/// Lower G_FMODF to a modf libcall: the call returns the fractional part
/// directly and writes the integral part through a stack out-pointer, which
/// is then loaded into the instruction's second def.
LegalizerHelper::LegalizeResult
LegalizerHelper::emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
                                 unsigned Size, Type *OpType,
                                 LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = MIRBuilder.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstFrac = MI.getOperand(i: 0).getReg();
  Register DstInt = MI.getOperand(i: 1).getReg();
  Register Src = MI.getOperand(i: 2).getReg();
  LLT DstTy = MRI.getType(Reg: DstFrac);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(Type: DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  // Stack slot that receives the integral part from the callee.
  Register StackPtrInt =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: MemSize), Alignment, PtrInfo)
          .getReg(Idx: 0);

  // The fractional part is the libcall's direct return value.
  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult = createLibcall(
      Libcall: getRTLibDesc(Opcode: MI.getOpcode(), Size), Result: {DstFrac, OpType, 0},
      Args: {{Src, OpType, 0}, {StackPtrInt, PointerType::get(C&: Ctx, AddressSpace: AddrSpace), 1}},
      LocObserver, MI: &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOInt = MF.getMachineMemOperand(
      PtrInfo, F: MachineMemOperand::MOLoad, Size: MemSize, BaseAlignment: Alignment);

  // Read the integral part back and replace the original instruction.
  MIRBuilder.buildLoad(Res: DstInt, Addr: StackPtrInt, MMO&: *LoadMMOInt);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}
748
749static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
750 Type *FromType) {
751 auto ToMVT = MVT::getVT(Ty: ToType);
752 auto FromMVT = MVT::getVT(Ty: FromType);
753
754 switch (Opcode) {
755 case TargetOpcode::G_FPEXT:
756 return RTLIB::getFPEXT(OpVT: FromMVT, RetVT: ToMVT);
757 case TargetOpcode::G_FPTRUNC:
758 return RTLIB::getFPROUND(OpVT: FromMVT, RetVT: ToMVT);
759 case TargetOpcode::G_FPTOSI:
760 return RTLIB::getFPTOSINT(OpVT: FromMVT, RetVT: ToMVT);
761 case TargetOpcode::G_FPTOUI:
762 return RTLIB::getFPTOUINT(OpVT: FromMVT, RetVT: ToMVT);
763 case TargetOpcode::G_SITOFP:
764 return RTLIB::getSINTTOFP(OpVT: FromMVT, RetVT: ToMVT);
765 case TargetOpcode::G_UITOFP:
766 return RTLIB::getUINTTOFP(OpVT: FromMVT, RetVT: ToMVT);
767 }
768 llvm_unreachable("Unsupported libcall function");
769}
770
771LegalizerHelper::LegalizeResult LegalizerHelper::conversionLibcall(
772 MachineInstr &MI, Type *ToType, Type *FromType,
773 LostDebugLocObserver &LocObserver, bool IsSigned) const {
774 CallLowering::ArgInfo Arg = {MI.getOperand(i: 1).getReg(), FromType, 0};
775 if (FromType->isIntegerTy()) {
776 if (TLI.shouldSignExtendTypeInLibCall(Ty: FromType, IsSigned))
777 Arg.Flags[0].setSExt();
778 else
779 Arg.Flags[0].setZExt();
780 }
781
782 RTLIB::Libcall Libcall = getConvRTLibDesc(Opcode: MI.getOpcode(), ToType, FromType);
783 return createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), ToType, 0}, Args: Arg,
784 LocObserver, MI: &MI);
785}
786
/// Lower a G_MEMCPY/G_MEMMOVE/G_MEMSET/G_BZERO instruction \p MI to the
/// corresponding runtime library call, emitting a tail call when the
/// instruction's trailing 'tail' immediate is set and the position allows it.
LegalizerHelper::LegalizeResult
LegalizerHelper::createMemLibcall(MachineRegisterInfo &MRI, MachineInstr &MI,
                                  LostDebugLocObserver &LocObserver) const {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(C&: Ctx, AddressSpace: OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(C&: Ctx, NumBits: OpLLT.getSizeInBits());
    Args.push_back(Elt: {Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    // memcpy/memmove/memset return their dst argument; mark it 'returned'
    // so the COPY of the result can be folded for tail calls.
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }

  if (!Libcalls) // FIXME: Should be mandatory
    return LegalizerHelper::UnableToLegalize;

  RTLIB::LibcallImpl RTLibcallImpl = Libcalls->getLibcallImpl(Call: RTLibcall);

  // Unsupported libcall on the target.
  if (RTLibcallImpl == RTLIB::Unsupported) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = Libcalls->getLibcallImplCallingConv(Call: RTLibcallImpl);

  StringRef LibcallName =
      RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: RTLibcallImpl);
  Info.Callee = MachineOperand::CreateES(SymName: LibcallName.data());
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0);
  // Tail-call only when the instruction's 'tail' immediate requests it and
  // MI actually sits in tail position.
  Info.IsTailCall =
      MI.getOperand(i: MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Result: Info.OrigRet, MI, TII: MIRBuilder.getTII(), MRI);

  llvm::append_range(C&: Info.OrigArgs, R&: Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(CheckDebugLocs: true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(CheckDebugLocs: false);
  }

  return LegalizerHelper::Legalized;
}
881
/// Map a generic atomic instruction to the corresponding "outline atomics"
/// helper routine, selected by both the merged memory ordering and the
/// memory access size (1/2/4/8/16 bytes).
///
/// Returns RTLIB::UNKNOWN_LIBCALL when there is no helper for this opcode,
/// or when the memory type is a vector (never supported).
///
/// Note the bitwise RMW mappings: G_ATOMICRMW_AND uses LDCLR (clear bits)
/// and G_ATOMICRMW_SUB uses LDADD — createAtomicLibcall compensates by
/// inverting / negating the operand before emitting the call.
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(Val&: MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  // Outline atomic helpers only operate on scalar memory.
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

// LCALLS expands to the four ordering variants (relaxed/acquire/release/
// acq_rel) for one access size; LCALL5 stacks the five sizes, producing the
// 5x4 tables that getOutlineAtomicHelper indexes by size and ordering.
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A) \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    // SUB shares LDADD; the caller negates the operand.
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    // AND is expressed as LDCLR (bit clear); the caller inverts the operand.
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
928
/// Lower an atomic instruction (cmpxchg or RMW) into a call to the matching
/// outline-atomics helper routine.
///
/// Builds the return-register list and argument list from the instruction's
/// operands, rewriting the value operand where the helper's semantics differ
/// from the generic opcode (AND -> LDCLR needs the inverted value, SUB ->
/// LDADD needs the negated value), then emits the call via CallLowering.
/// The caller is responsible for erasing \p MI on success.
LegalizerHelper::LegalizeResult
LegalizerHelper::createAtomicLibcall(MachineInstr &MI) const {
  auto &Ctx = MIRBuilder.getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    // Assume the plain cmpxchg operand layout first...
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Elt: Ret);
    RetTy = IntegerType::get(C&: Ctx, NumBits: RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      // ...then re-read with the extra success def, which shifts every
      // subsequent operand by one. The result becomes {value, success}.
      std::tie(args&: Ret, args&: RetLLT, args&: Success, args&: SuccessLLT, args&: Mem, args&: MemLLT, args&: Cmp, args&: CmpLLT, args&: New,
               args&: NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Elt: Success);
      RetTy = StructType::get(
          Context&: Ctx, Elements: {RetTy, IntegerType::get(C&: Ctx, NumBits: SuccessLLT.getSizeInBits())});
    }
    Args.push_back(Elt: {Cmp, IntegerType::get(C&: Ctx, NumBits: CmpLLT.getSizeInBits()), 0});
    Args.push_back(Elt: {New, IntegerType::get(C&: Ctx, NumBits: NewLLT.getSizeInBits()), 0});
    Args.push_back(Elt: {Mem, PointerType::get(C&: Ctx, AddressSpace: MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Elt: Ret);
    RetTy = IntegerType::get(C&: Ctx, NumBits: RetLLT.getSizeInBits());
    // AND maps to the LDCLR (bit-clear) helper: pass ~Val.
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(Dst: ValLLT, Src0: MIRBuilder.buildConstant(Res: ValLLT, Val: -1), Src1: Val)
              .getReg(Idx: 0);
    // SUB maps to the LDADD helper: pass -Val.
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(Dst: ValLLT, Src0: MIRBuilder.buildConstant(Res: ValLLT, Val: 0), Src1: Val)
              .getReg(Idx: 0);
    Args.push_back(Elt: {Val, IntegerType::get(C&: Ctx, NumBits: ValLLT.getSizeInBits()), 0});
    Args.push_back(Elt: {Mem, PointerType::get(C&: Ctx, AddressSpace: MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  if (!Libcalls) // FIXME: Should be mandatory
    return LegalizerHelper::UnableToLegalize;

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  RTLIB::LibcallImpl RTLibcallImpl = Libcalls->getLibcallImpl(Call: RTLibcall);

  // Unsupported libcall on the target.
  if (RTLibcallImpl == RTLIB::Unsupported) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = Libcalls->getLibcallImplCallingConv(Call: RTLibcallImpl);

  StringRef LibcallName =
      RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: RTLibcallImpl);
  Info.Callee = MachineOperand::CreateES(SymName: LibcallName.data());
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  llvm::append_range(C&: Info.OrigArgs, R&: Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}
1011
1012static RTLIB::Libcall
1013getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
1014 RTLIB::Libcall RTLibcall;
1015 switch (MI.getOpcode()) {
1016 case TargetOpcode::G_GET_FPENV:
1017 RTLibcall = RTLIB::FEGETENV;
1018 break;
1019 case TargetOpcode::G_SET_FPENV:
1020 case TargetOpcode::G_RESET_FPENV:
1021 RTLibcall = RTLIB::FESETENV;
1022 break;
1023 case TargetOpcode::G_GET_FPMODE:
1024 RTLibcall = RTLIB::FEGETMODE;
1025 break;
1026 case TargetOpcode::G_SET_FPMODE:
1027 case TargetOpcode::G_RESET_FPMODE:
1028 RTLibcall = RTLIB::FESETMODE;
1029 break;
1030 default:
1031 llvm_unreachable("Unexpected opcode");
1032 }
1033 return RTLibcall;
1034}
1035
1036// Some library functions that read FP state (fegetmode, fegetenv) write the
1037// state into a region in memory. IR intrinsics that do the same operations
1038// (get_fpmode, get_fpenv) return the state as integer value. To implement these
1039// intrinsics via the library functions, we need to use temporary variable,
1040// for example:
1041//
1042// %0:_(s32) = G_GET_FPMODE
1043//
1044// is transformed to:
1045//
1046// %1:_(p0) = G_FRAME_INDEX %stack.0
1047// BL &fegetmode
1048// %0:_(s32) = G_LOAD % 1
1049//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will put the read state.
  // The temporary is sized and aligned for the instruction's result type.
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT StateTy = MRI.getType(Reg: Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(Type: StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(Bytes: StateSize, Alignment: TempAlign, PtrInfo&: TempPtrInfo);

  // Create a call to library function, with the temporary as an argument.
  // The callee returns void; the state is communicated through memory.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res = createLibcall(
      Libcall: RTLibcall, Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
      Args: CallLowering::ArgInfo({Temp.getReg(Idx: 0), StatePtrTy, 0}), LocObserver,
      MI: nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  // Read the state written by the callee back into the original def.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: TempPtrInfo, f: MachineMemOperand::MOLoad, MemTy: StateTy, base_alignment: TempAlign);
  MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: Dst, Addr: Temp, MMO&: *MMO);

  return LegalizerHelper::Legalized;
}
1084
1085// Similar to `createGetStateLibcall` the function calls a library function
1086// using transient space in stack. In this case the library function reads
1087// content of memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will get the new state.
  // Sized and aligned for the instruction's source operand type.
  Register Src = MI.getOperand(i: 0).getReg();
  LLT StateTy = MRI.getType(Reg: Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(Type: StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(Bytes: StateSize, Alignment: TempAlign, PtrInfo&: TempPtrInfo);

  // Put the new state into the temporary.
  // Spill the value so the callee can read it through the pointer argument.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: TempPtrInfo, f: MachineMemOperand::MOStore, MemTy: StateTy, base_alignment: TempAlign);
  MIRBuilder.buildStore(Val: Src, Addr: Temp, MMO&: *MMO);

  // Create a call to library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(Libcall: RTLibcall,
                       Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
                       Args: CallLowering::ArgInfo({Temp.getReg(Idx: 0), StatePtrTy, 0}),
                       LocObserver, MI: nullptr);
}
1118
1119/// Returns the corresponding libcall for the given Pred and
1120/// the ICMP predicate that should be generated to compare with #0
1121/// after the libcall.
1122static std::pair<RTLIB::Libcall, CmpInst::Predicate>
1123getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
1124#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred) \
1125 do { \
1126 switch (Size) { \
1127 case 32: \
1128 return {RTLIB::LibcallPrefix##32, ICmpPred}; \
1129 case 64: \
1130 return {RTLIB::LibcallPrefix##64, ICmpPred}; \
1131 case 128: \
1132 return {RTLIB::LibcallPrefix##128, ICmpPred}; \
1133 default: \
1134 llvm_unreachable("unexpected size"); \
1135 } \
1136 } while (0)
1137
1138 switch (Pred) {
1139 case CmpInst::FCMP_OEQ:
1140 RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
1141 case CmpInst::FCMP_UNE:
1142 RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
1143 case CmpInst::FCMP_OGE:
1144 RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
1145 case CmpInst::FCMP_OLT:
1146 RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
1147 case CmpInst::FCMP_OLE:
1148 RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
1149 case CmpInst::FCMP_OGT:
1150 RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
1151 case CmpInst::FCMP_UNO:
1152 RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
1153 default:
1154 return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
1155 }
1156}
1157
/// Lower a G_FCMP into soft-float comparison libcall(s) followed by integer
/// compares. Predicates with a direct libcall mapping become a single call +
/// ICMP against zero; the remaining predicates are synthesized from pairs of
/// calls (UEQ, ONE) or by inverting the mapped predicate (ULT/UGE/UGT/ULE/
/// ORD). Only 32/64/128-bit scalar operands are supported.
LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(Val: &MI);

  // Both operands must have the same supported scalar size.
  LLT OpLLT = MRI.getType(Reg: Cmp->getLHSReg());
  unsigned Size = OpLLT.getSizeInBits();
  if ((Size != 32 && Size != 64 && Size != 128) ||
      OpLLT != MRI.getType(Reg: Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, Ty: OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(Idx: 0);
  LLT DstTy = MRI.getType(Reg: DstReg);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP.
  // Returns the register holding the i1-ish result, or an invalid Register
  // if the libcall could not be created.
  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                const CmpInst::Predicate ICmpPred,
                                const DstOp &Res) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(SizeInBits: 32);
    Register Temp = MRI.createGenericVirtualRegister(Ty: TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        Libcall, Result: {Temp, Type::getInt32Ty(C&: Ctx), 0},
        Args: {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, MI: &MI);
    if (!Status)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(Pred: ICmpPred, Res, Op0: Temp, Op1: MIRBuilder.buildConstant(Res: TempLLT, Val: 0))
        .getReg(Idx: 0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Pred: Cond, Size);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_OEQ, Size);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_UNO, Size);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
    if (Oeq && Uno)
      MIRBuilder.buildOr(Dst: DstReg, Src0: Oeq, Src1: Uno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We inverse the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64 isel can even select two cmp into a single ccmp.
    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_OEQ, Size);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(pred: OeqPred), DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(Pred: CmpInst::FCMP_UNO, Size);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(pred: UnoPred), DstTy);

    if (NotOeq && NotUno)
      MIRBuilder.buildAnd(Dst: DstReg, Src0: NotOeq, Src1: NotUno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Pred))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to the following, but saves some instructions.
    //   MIRBuilder.buildNot(
    //       PredTy,
    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
    //                            Op1, Op2));
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(Pred: CmpInst::getInversePredicate(pred: Cond), Size);
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(pred: InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}
1281
1282// The function is used to legalize operations that set default environment
1283// state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
1284// On most targets supported in glibc FE_DFL_MODE is defined as
1285// `((const femode_t *) -1)`. Such assumption is used here. If for some target
1286// it is not true, the target must provide custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  // Materialize the FE_DFL_MODE/FE_DFL_ENV sentinel: an all-ones pointer
  // ((const femode_t *) -1), per the glibc convention noted above.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AS: AddrSpace);
  LLT MemTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: PtrSize);
  auto DefValue = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrSize), Val: -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(Ty: MemTy));
  MIRBuilder.buildIntToPtr(Dst: Dest, Src: DefValue);

  // Reset shares the fesetenv/fesetmode entry points with the "set" opcodes.
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(
      Libcall: RTLibcall, Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
      Args: CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}), LocObserver, MI: &MI);
}
1308
/// Legalize \p MI by replacing it with a runtime library call.
///
/// Dispatches on the opcode to the appropriate libcall-emission helper.
/// Control-flow note: cases that `break` fall through to the shared
/// MI.eraseFromParent() at the bottom; cases that `return` directly
/// (G_FCMP, mem ops, G_LROUND family, G_FSINCOS, G_FMODF) handle erasure
/// themselves or delegate it to their helper.
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  // Integer ops: libcall operates on an iN of the result width.
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(C&: Ctx, NumBits: Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Floating-point ops with a same-typed result: only 32/64/80/128-bit
  // scalars have libcall equivalents.
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Two-result FP ops delegate entirely (including erasure) to the emitter.
  case TargetOpcode::G_FSINCOS: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    return emitSincosLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
  }
  case TargetOpcode::G_FMODF: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    return emitModfLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
  }
  // FP -> integer rounding: size check is against the FP *source* operand.
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND:
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    Type *ITy = IntegerType::get(
        C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
    LegalizeResult Status =
        createLibcall(Libcall, Result: {MI.getOperand(i: 0).getReg(), ITy, 0},
                      Args: {{MI.getOperand(i: 1).getReg(), HLTy, 0}}, LocObserver, MI: &MI);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Legalized;
  }
  // FP op with a signed integer second operand (exponent); that argument
  // must be sign-extended per the libcall ABI.
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP: {
    LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
    Type *ITy = IntegerType::get(
        C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
    SmallVector<CallLowering::ArgInfo, 2> Args = {
        {MI.getOperand(i: 1).getReg(), HLTy, 0},
        {MI.getOperand(i: 2).getReg(), ITy, 1}};
    Args[1].Flags[0].setSExt();
    LegalizeResult Status = createLibcall(
        Libcall, Result: {MI.getOperand(i: 0).getReg(), HLTy, 0}, Args, LocObserver, MI: &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  // FP <-> FP and FP <-> integer conversions.
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, ToType: ToTy, FromType: FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FCMP: {
    LegalizeResult Status = createFCMPLibcall(MI, LocObserver);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Status;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    Type *FromTy =
        getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
    unsigned ToSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, ToType: Type::getIntNTy(C&: Ctx, N: ToSize),
                                              FromType: FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    unsigned FromSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
    Type *ToTy =
        getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
      return UnableToLegalize;
    bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
    LegalizeResult Status = conversionLibcall(
        MI, ToType: ToTy, FromType: Type::getIntNTy(C&: Ctx, N: FromSize), LocObserver, IsSigned);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Atomics go through the outline-atomics helper routines.
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MRI&: *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  // FP environment/mode state manipulation via fegetenv/fesetenv and
  // friends, using a stack temporary to bridge value <-> memory.
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result = createResetStateLibcall(MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  // Shared cleanup for the cases that broke out of the switch.
  MI.eraseFromParent();
  return Legalized;
}
1537
1538LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1539 unsigned TypeIdx,
1540 LLT NarrowTy) {
1541 uint64_t SizeOp0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1542 uint64_t NarrowSize = NarrowTy.getSizeInBits();
1543
1544 switch (MI.getOpcode()) {
1545 default:
1546 return UnableToLegalize;
1547 case TargetOpcode::G_IMPLICIT_DEF: {
1548 Register DstReg = MI.getOperand(i: 0).getReg();
1549 LLT DstTy = MRI.getType(Reg: DstReg);
1550
1551 // If SizeOp0 is not an exact multiple of NarrowSize, emit
1552 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1553 // FIXME: Although this would also be legal for the general case, it causes
1554 // a lot of regressions in the emitted code (superfluous COPYs, artifact
1555 // combines not being hit). This seems to be a problem related to the
1556 // artifact combiner.
1557 if (SizeOp0 % NarrowSize != 0) {
1558 LLT ImplicitTy = DstTy.changeElementType(NewEltTy: NarrowTy);
1559 Register ImplicitReg = MIRBuilder.buildUndef(Res: ImplicitTy).getReg(Idx: 0);
1560 MIRBuilder.buildAnyExt(Res: DstReg, Op: ImplicitReg);
1561
1562 MI.eraseFromParent();
1563 return Legalized;
1564 }
1565
1566 int NumParts = SizeOp0 / NarrowSize;
1567
1568 SmallVector<Register, 2> DstRegs;
1569 for (int i = 0; i < NumParts; ++i)
1570 DstRegs.push_back(Elt: MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0));
1571
1572 if (DstTy.isVector())
1573 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
1574 else
1575 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
1576 MI.eraseFromParent();
1577 return Legalized;
1578 }
1579 case TargetOpcode::G_CONSTANT: {
1580 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1581 const APInt &Val = MI.getOperand(i: 1).getCImm()->getValue();
1582 unsigned TotalSize = Ty.getSizeInBits();
1583 unsigned NarrowSize = NarrowTy.getSizeInBits();
1584 int NumParts = TotalSize / NarrowSize;
1585
1586 SmallVector<Register, 4> PartRegs;
1587 for (int I = 0; I != NumParts; ++I) {
1588 unsigned Offset = I * NarrowSize;
1589 auto K = MIRBuilder.buildConstant(Res: NarrowTy,
1590 Val: Val.lshr(shiftAmt: Offset).trunc(width: NarrowSize));
1591 PartRegs.push_back(Elt: K.getReg(Idx: 0));
1592 }
1593
1594 LLT LeftoverTy;
1595 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1596 SmallVector<Register, 1> LeftoverRegs;
1597 if (LeftoverBits != 0) {
1598 LeftoverTy = LLT::scalar(SizeInBits: LeftoverBits);
1599 auto K = MIRBuilder.buildConstant(
1600 Res: LeftoverTy,
1601 Val: Val.lshr(shiftAmt: NumParts * NarrowSize).trunc(width: LeftoverBits));
1602 LeftoverRegs.push_back(Elt: K.getReg(Idx: 0));
1603 }
1604
1605 insertParts(DstReg: MI.getOperand(i: 0).getReg(),
1606 ResultTy: Ty, PartTy: NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1607
1608 MI.eraseFromParent();
1609 return Legalized;
1610 }
1611 case TargetOpcode::G_SEXT:
1612 case TargetOpcode::G_ZEXT:
1613 case TargetOpcode::G_ANYEXT:
1614 return narrowScalarExt(MI, TypeIdx, Ty: NarrowTy);
1615 case TargetOpcode::G_TRUNC: {
1616 if (TypeIdx != 1)
1617 return UnableToLegalize;
1618
1619 uint64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1620 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1621 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1622 return UnableToLegalize;
1623 }
1624
1625 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
1626 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: Unmerge.getReg(Idx: 0));
1627 MI.eraseFromParent();
1628 return Legalized;
1629 }
1630 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1631 case TargetOpcode::G_FREEZE: {
1632 if (TypeIdx != 0)
1633 return UnableToLegalize;
1634
1635 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1636 // Should widen scalar first
1637 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1638 return UnableToLegalize;
1639
1640 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1).getReg());
1641 SmallVector<Register, 8> Parts;
1642 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1643 Parts.push_back(
1644 Elt: MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy}, SrcOps: {Unmerge.getReg(Idx: i)})
1645 .getReg(Idx: 0));
1646 }
1647
1648 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: Parts);
1649 MI.eraseFromParent();
1650 return Legalized;
1651 }
1652 case TargetOpcode::G_ADD:
1653 case TargetOpcode::G_SUB:
1654 case TargetOpcode::G_SADDO:
1655 case TargetOpcode::G_SSUBO:
1656 case TargetOpcode::G_SADDE:
1657 case TargetOpcode::G_SSUBE:
1658 case TargetOpcode::G_UADDO:
1659 case TargetOpcode::G_USUBO:
1660 case TargetOpcode::G_UADDE:
1661 case TargetOpcode::G_USUBE:
1662 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1663 case TargetOpcode::G_MUL:
1664 case TargetOpcode::G_UMULH:
1665 return narrowScalarMul(MI, Ty: NarrowTy);
1666 case TargetOpcode::G_EXTRACT:
1667 return narrowScalarExtract(MI, TypeIdx, Ty: NarrowTy);
1668 case TargetOpcode::G_INSERT:
1669 return narrowScalarInsert(MI, TypeIdx, Ty: NarrowTy);
1670 case TargetOpcode::G_LOAD: {
1671 auto &LoadMI = cast<GLoad>(Val&: MI);
1672 Register DstReg = LoadMI.getDstReg();
1673 LLT DstTy = MRI.getType(Reg: DstReg);
1674 if (DstTy.isVector())
1675 return UnableToLegalize;
1676
1677 if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1678 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1679 MIRBuilder.buildLoad(Res: TmpReg, Addr: LoadMI.getPointerReg(), MMO&: LoadMI.getMMO());
1680 MIRBuilder.buildAnyExt(Res: DstReg, Op: TmpReg);
1681 LoadMI.eraseFromParent();
1682 return Legalized;
1683 }
1684
1685 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx, NarrowTy);
1686 }
1687 case TargetOpcode::G_ZEXTLOAD:
1688 case TargetOpcode::G_SEXTLOAD: {
1689 auto &LoadMI = cast<GExtLoad>(Val&: MI);
1690 Register DstReg = LoadMI.getDstReg();
1691 Register PtrReg = LoadMI.getPointerReg();
1692
1693 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1694 auto &MMO = LoadMI.getMMO();
1695 unsigned MemSize = MMO.getSizeInBits().getValue();
1696
1697 if (MemSize == NarrowSize) {
1698 MIRBuilder.buildLoad(Res: TmpReg, Addr: PtrReg, MMO);
1699 } else if (MemSize < NarrowSize) {
1700 MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: TmpReg, Addr: PtrReg, MMO);
1701 } else if (MemSize > NarrowSize) {
1702 // FIXME: Need to split the load.
1703 return UnableToLegalize;
1704 }
1705
1706 if (isa<GZExtLoad>(Val: LoadMI))
1707 MIRBuilder.buildZExt(Res: DstReg, Op: TmpReg);
1708 else
1709 MIRBuilder.buildSExt(Res: DstReg, Op: TmpReg);
1710
1711 LoadMI.eraseFromParent();
1712 return Legalized;
1713 }
1714 case TargetOpcode::G_STORE: {
1715 auto &StoreMI = cast<GStore>(Val&: MI);
1716
1717 Register SrcReg = StoreMI.getValueReg();
1718 LLT SrcTy = MRI.getType(Reg: SrcReg);
1719 if (SrcTy.isVector())
1720 return UnableToLegalize;
1721
1722 int NumParts = SizeOp0 / NarrowSize;
1723 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1724 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1725 if (SrcTy.isVector() && LeftoverBits != 0)
1726 return UnableToLegalize;
1727
1728 if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1729 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1730 MIRBuilder.buildTrunc(Res: TmpReg, Op: SrcReg);
1731 MIRBuilder.buildStore(Val: TmpReg, Addr: StoreMI.getPointerReg(), MMO&: StoreMI.getMMO());
1732 StoreMI.eraseFromParent();
1733 return Legalized;
1734 }
1735
1736 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy);
1737 }
1738 case TargetOpcode::G_SELECT:
1739 return narrowScalarSelect(MI, TypeIdx, Ty: NarrowTy);
1740 case TargetOpcode::G_AND:
1741 case TargetOpcode::G_OR:
1742 case TargetOpcode::G_XOR: {
1743 // Legalize bitwise operation:
1744 // A = BinOp<Ty> B, C
1745 // into:
1746 // B1, ..., BN = G_UNMERGE_VALUES B
1747 // C1, ..., CN = G_UNMERGE_VALUES C
1748 // A1 = BinOp<Ty/N> B1, C2
1749 // ...
1750 // AN = BinOp<Ty/N> BN, CN
1751 // A = G_MERGE_VALUES A1, ..., AN
1752 return narrowScalarBasic(MI, TypeIdx, Ty: NarrowTy);
1753 }
1754 case TargetOpcode::G_SHL:
1755 case TargetOpcode::G_LSHR:
1756 case TargetOpcode::G_ASHR:
1757 return narrowScalarShift(MI, TypeIdx, Ty: NarrowTy);
1758 case TargetOpcode::G_CTLZ:
1759 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1760 case TargetOpcode::G_CTTZ:
1761 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1762 case TargetOpcode::G_CTLS:
1763 case TargetOpcode::G_CTPOP:
1764 if (TypeIdx == 1)
1765 switch (MI.getOpcode()) {
1766 case TargetOpcode::G_CTLZ:
1767 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1768 return narrowScalarCTLZ(MI, TypeIdx, Ty: NarrowTy);
1769 case TargetOpcode::G_CTTZ:
1770 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1771 return narrowScalarCTTZ(MI, TypeIdx, Ty: NarrowTy);
1772 case TargetOpcode::G_CTPOP:
1773 return narrowScalarCTPOP(MI, TypeIdx, Ty: NarrowTy);
1774 case TargetOpcode::G_CTLS:
1775 return narrowScalarCTLS(MI, TypeIdx, Ty: NarrowTy);
1776 default:
1777 return UnableToLegalize;
1778 }
1779
1780 Observer.changingInstr(MI);
1781 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1782 Observer.changedInstr(MI);
1783 return Legalized;
1784 case TargetOpcode::G_INTTOPTR:
1785 if (TypeIdx != 1)
1786 return UnableToLegalize;
1787
1788 Observer.changingInstr(MI);
1789 narrowScalarSrc(MI, NarrowTy, OpIdx: 1);
1790 Observer.changedInstr(MI);
1791 return Legalized;
1792 case TargetOpcode::G_PTRTOINT:
1793 if (TypeIdx != 0)
1794 return UnableToLegalize;
1795
1796 Observer.changingInstr(MI);
1797 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1798 Observer.changedInstr(MI);
1799 return Legalized;
1800 case TargetOpcode::G_PHI: {
1801 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1802 // NarrowSize.
1803 if (SizeOp0 % NarrowSize != 0)
1804 return UnableToLegalize;
1805
1806 unsigned NumParts = SizeOp0 / NarrowSize;
1807 SmallVector<Register, 2> DstRegs(NumParts);
1808 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1809 Observer.changingInstr(MI);
1810 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1811 MachineBasicBlock &OpMBB = *MI.getOperand(i: i + 1).getMBB();
1812 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
1813 extractParts(Reg: MI.getOperand(i).getReg(), Ty: NarrowTy, NumParts,
1814 VRegs&: SrcRegs[i / 2], MIRBuilder, MRI);
1815 }
1816 MachineBasicBlock &MBB = *MI.getParent();
1817 MIRBuilder.setInsertPt(MBB, II: MI);
1818 for (unsigned i = 0; i < NumParts; ++i) {
1819 DstRegs[i] = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1820 MachineInstrBuilder MIB =
1821 MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI).addDef(RegNo: DstRegs[i]);
1822 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1823 MIB.addUse(RegNo: SrcRegs[j / 2][i]).add(MO: MI.getOperand(i: j + 1));
1824 }
1825 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
1826 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
1827 Observer.changedInstr(MI);
1828 MI.eraseFromParent();
1829 return Legalized;
1830 }
1831 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1832 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1833 if (TypeIdx != 2)
1834 return UnableToLegalize;
1835
1836 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1837 Observer.changingInstr(MI);
1838 narrowScalarSrc(MI, NarrowTy, OpIdx);
1839 Observer.changedInstr(MI);
1840 return Legalized;
1841 }
1842 case TargetOpcode::G_ICMP: {
1843 Register LHS = MI.getOperand(i: 2).getReg();
1844 LLT SrcTy = MRI.getType(Reg: LHS);
1845 CmpInst::Predicate Pred =
1846 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
1847
1848 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1849 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1850 if (!extractParts(Reg: LHS, RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy, VRegs&: LHSPartRegs,
1851 LeftoverVRegs&: LHSLeftoverRegs, MIRBuilder, MRI))
1852 return UnableToLegalize;
1853
1854 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1855 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1856 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy&: Unused,
1857 VRegs&: RHSPartRegs, LeftoverVRegs&: RHSLeftoverRegs, MIRBuilder, MRI))
1858 return UnableToLegalize;
1859
1860 // We now have the LHS and RHS of the compare split into narrow-type
1861 // registers, plus potentially some leftover type.
1862 Register Dst = MI.getOperand(i: 0).getReg();
1863 LLT ResTy = MRI.getType(Reg: Dst);
1864 if (ICmpInst::isEquality(P: Pred)) {
1865 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1866 // them together. For each equal part, the result should be all 0s. For
1867 // each non-equal part, we'll get at least one 1.
1868 auto Zero = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0);
1869 SmallVector<Register, 4> Xors;
1870 for (auto LHSAndRHS : zip(t&: LHSPartRegs, u&: RHSPartRegs)) {
1871 auto LHS = std::get<0>(t&: LHSAndRHS);
1872 auto RHS = std::get<1>(t&: LHSAndRHS);
1873 auto Xor = MIRBuilder.buildXor(Dst: NarrowTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1874 Xors.push_back(Elt: Xor);
1875 }
1876
1877 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1878 // to the desired narrow type so that we can OR them together later.
1879 SmallVector<Register, 4> WidenedXors;
1880 for (auto LHSAndRHS : zip(t&: LHSLeftoverRegs, u&: RHSLeftoverRegs)) {
1881 auto LHS = std::get<0>(t&: LHSAndRHS);
1882 auto RHS = std::get<1>(t&: LHSAndRHS);
1883 auto Xor = MIRBuilder.buildXor(Dst: LeftoverTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1884 LLT GCDTy = extractGCDType(Parts&: WidenedXors, DstTy: NarrowTy, NarrowTy: LeftoverTy, SrcReg: Xor);
1885 buildLCMMergePieces(DstTy: LeftoverTy, NarrowTy, GCDTy, VRegs&: WidenedXors,
1886 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1887 llvm::append_range(C&: Xors, R&: WidenedXors);
1888 }
1889
1890 // Now, for each part we broke up, we know if they are equal/not equal
1891 // based off the G_XOR. We can OR these all together and compare against
1892 // 0 to get the result.
1893 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1894 auto Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Xors[0], Src1: Xors[1]);
1895 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1896 Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Or, Src1: Xors[I]);
1897 MIRBuilder.buildICmp(Pred, Res: Dst, Op0: Or, Op1: Zero);
1898 } else {
1899 Register CmpIn;
1900 for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1901 Register CmpOut;
1902 CmpInst::Predicate PartPred;
1903
1904 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1905 PartPred = Pred;
1906 CmpOut = Dst;
1907 } else {
1908 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1909 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1910 }
1911
1912 if (!CmpIn) {
1913 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSPartRegs[I],
1914 Op1: RHSPartRegs[I]);
1915 } else {
1916 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSPartRegs[I],
1917 Op1: RHSPartRegs[I]);
1918 auto CmpEq = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1919 Op0: LHSPartRegs[I], Op1: RHSPartRegs[I]);
1920 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1921 }
1922
1923 CmpIn = CmpOut;
1924 }
1925
1926 for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1927 Register CmpOut;
1928 CmpInst::Predicate PartPred;
1929
1930 if (I == E - 1) {
1931 PartPred = Pred;
1932 CmpOut = Dst;
1933 } else {
1934 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1935 CmpOut = MRI.createGenericVirtualRegister(Ty: ResTy);
1936 }
1937
1938 if (!CmpIn) {
1939 MIRBuilder.buildICmp(Pred: PartPred, Res: CmpOut, Op0: LHSLeftoverRegs[I],
1940 Op1: RHSLeftoverRegs[I]);
1941 } else {
1942 auto Cmp = MIRBuilder.buildICmp(Pred: PartPred, Res: ResTy, Op0: LHSLeftoverRegs[I],
1943 Op1: RHSLeftoverRegs[I]);
1944 auto CmpEq =
1945 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy,
1946 Op0: LHSLeftoverRegs[I], Op1: RHSLeftoverRegs[I]);
1947 MIRBuilder.buildSelect(Res: CmpOut, Tst: CmpEq, Op0: CmpIn, Op1: Cmp);
1948 }
1949
1950 CmpIn = CmpOut;
1951 }
1952 }
1953 MI.eraseFromParent();
1954 return Legalized;
1955 }
1956 case TargetOpcode::G_FCMP:
1957 if (TypeIdx != 0)
1958 return UnableToLegalize;
1959
1960 Observer.changingInstr(MI);
1961 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1962 Observer.changedInstr(MI);
1963 return Legalized;
1964
1965 case TargetOpcode::G_SEXT_INREG: {
1966 if (TypeIdx != 0)
1967 return UnableToLegalize;
1968
1969 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
1970
1971 // So long as the new type has more bits than the bits we're extending we
1972 // don't need to break it apart.
1973 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1974 Observer.changingInstr(MI);
1975 // We don't lose any non-extension bits by truncating the src and
1976 // sign-extending the dst.
1977 MachineOperand &MO1 = MI.getOperand(i: 1);
1978 auto TruncMIB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO1);
1979 MO1.setReg(TruncMIB.getReg(Idx: 0));
1980
1981 MachineOperand &MO2 = MI.getOperand(i: 0);
1982 Register DstExt = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1983 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1984 MIRBuilder.buildSExt(Res: MO2, Op: DstExt);
1985 MO2.setReg(DstExt);
1986 Observer.changedInstr(MI);
1987 return Legalized;
1988 }
1989
1990 // Break it apart. Components below the extension point are unmodified. The
1991 // component containing the extension point becomes a narrower SEXT_INREG.
1992 // Components above it are ashr'd from the component containing the
1993 // extension point.
1994 if (SizeOp0 % NarrowSize != 0)
1995 return UnableToLegalize;
1996 int NumParts = SizeOp0 / NarrowSize;
1997
1998 // List the registers where the destination will be scattered.
1999 SmallVector<Register, 2> DstRegs;
2000 // List the registers where the source will be split.
2001 SmallVector<Register, 2> SrcRegs;
2002
2003 // Create all the temporary registers.
2004 for (int i = 0; i < NumParts; ++i) {
2005 Register SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2006
2007 SrcRegs.push_back(Elt: SrcReg);
2008 }
2009
2010 // Explode the big arguments into smaller chunks.
2011 MIRBuilder.buildUnmerge(Res: SrcRegs, Op: MI.getOperand(i: 1));
2012
2013 Register AshrCstReg =
2014 MIRBuilder.buildConstant(Res: NarrowTy, Val: NarrowTy.getScalarSizeInBits() - 1)
2015 .getReg(Idx: 0);
2016 Register FullExtensionReg;
2017 Register PartialExtensionReg;
2018
2019 // Do the operation on each small part.
2020 for (int i = 0; i < NumParts; ++i) {
2021 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
2022 DstRegs.push_back(Elt: SrcRegs[i]);
2023 PartialExtensionReg = DstRegs.back();
2024 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
2025 assert(PartialExtensionReg &&
2026 "Expected to visit partial extension before full");
2027 if (FullExtensionReg) {
2028 DstRegs.push_back(Elt: FullExtensionReg);
2029 continue;
2030 }
2031 DstRegs.push_back(
2032 Elt: MIRBuilder.buildAShr(Dst: NarrowTy, Src0: PartialExtensionReg, Src1: AshrCstReg)
2033 .getReg(Idx: 0));
2034 FullExtensionReg = DstRegs.back();
2035 } else {
2036 DstRegs.push_back(
2037 Elt: MIRBuilder
2038 .buildInstr(
2039 Opc: TargetOpcode::G_SEXT_INREG, DstOps: {NarrowTy},
2040 SrcOps: {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
2041 .getReg(Idx: 0));
2042 PartialExtensionReg = DstRegs.back();
2043 }
2044 }
2045
2046 // Gather the destination registers into the final destination.
2047 Register DstReg = MI.getOperand(i: 0).getReg();
2048 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
2049 MI.eraseFromParent();
2050 return Legalized;
2051 }
2052 case TargetOpcode::G_BSWAP:
2053 case TargetOpcode::G_BITREVERSE: {
2054 if (SizeOp0 % NarrowSize != 0)
2055 return UnableToLegalize;
2056
2057 Observer.changingInstr(MI);
2058 SmallVector<Register, 2> SrcRegs, DstRegs;
2059 unsigned NumParts = SizeOp0 / NarrowSize;
2060 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
2061 MIRBuilder, MRI);
2062
2063 for (unsigned i = 0; i < NumParts; ++i) {
2064 auto DstPart = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
2065 SrcOps: {SrcRegs[NumParts - 1 - i]});
2066 DstRegs.push_back(Elt: DstPart.getReg(Idx: 0));
2067 }
2068
2069 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
2070
2071 Observer.changedInstr(MI);
2072 MI.eraseFromParent();
2073 return Legalized;
2074 }
2075 case TargetOpcode::G_PTR_ADD:
2076 case TargetOpcode::G_PTRMASK: {
2077 if (TypeIdx != 1)
2078 return UnableToLegalize;
2079 Observer.changingInstr(MI);
2080 narrowScalarSrc(MI, NarrowTy, OpIdx: 2);
2081 Observer.changedInstr(MI);
2082 return Legalized;
2083 }
2084 case TargetOpcode::G_FPTOUI:
2085 case TargetOpcode::G_FPTOSI:
2086 case TargetOpcode::G_FPTOUI_SAT:
2087 case TargetOpcode::G_FPTOSI_SAT:
2088 return narrowScalarFPTOI(MI, TypeIdx, Ty: NarrowTy);
2089 case TargetOpcode::G_FPEXT:
2090 if (TypeIdx != 0)
2091 return UnableToLegalize;
2092 Observer.changingInstr(MI);
2093 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_FPEXT);
2094 Observer.changedInstr(MI);
2095 return Legalized;
2096 case TargetOpcode::G_FLDEXP:
2097 case TargetOpcode::G_STRICT_FLDEXP:
2098 return narrowScalarFLDEXP(MI, TypeIdx, Ty: NarrowTy);
2099 case TargetOpcode::G_VSCALE: {
2100 Register Dst = MI.getOperand(i: 0).getReg();
2101 LLT Ty = MRI.getType(Reg: Dst);
2102
2103 // Assume VSCALE(1) fits into a legal integer
2104 const APInt One(NarrowTy.getSizeInBits(), 1);
2105 auto VScaleBase = MIRBuilder.buildVScale(Res: NarrowTy, MinElts: One);
2106 auto ZExt = MIRBuilder.buildZExt(Res: Ty, Op: VScaleBase);
2107 auto C = MIRBuilder.buildConstant(Res: Ty, Val: *MI.getOperand(i: 1).getCImm());
2108 MIRBuilder.buildMul(Dst, Src0: ZExt, Src1: C);
2109
2110 MI.eraseFromParent();
2111 return Legalized;
2112 }
2113 }
2114}
2115
2116Register LegalizerHelper::coerceToScalar(Register Val) {
2117 LLT Ty = MRI.getType(Reg: Val);
2118 if (Ty.isScalar())
2119 return Val;
2120
2121 const DataLayout &DL = MIRBuilder.getDataLayout();
2122 LLT NewTy = LLT::scalar(SizeInBits: Ty.getSizeInBits());
2123 if (Ty.isPointer()) {
2124 if (DL.isNonIntegralAddressSpace(AddrSpace: Ty.getAddressSpace()))
2125 return Register();
2126 return MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Val).getReg(Idx: 0);
2127 }
2128
2129 Register NewVal = Val;
2130
2131 assert(Ty.isVector());
2132 if (Ty.isPointerVector())
2133 NewVal = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2134 return MIRBuilder.buildBitcast(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
2135}
2136
2137void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2138 unsigned OpIdx, unsigned ExtOpcode) {
2139 MachineOperand &MO = MI.getOperand(i: OpIdx);
2140 auto ExtB = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MO});
2141 MO.setReg(ExtB.getReg(Idx: 0));
2142}
2143
2144void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2145 unsigned OpIdx) {
2146 MachineOperand &MO = MI.getOperand(i: OpIdx);
2147 auto ExtB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO);
2148 MO.setReg(ExtB.getReg(Idx: 0));
2149}
2150
2151void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2152 unsigned OpIdx, unsigned TruncOpcode) {
2153 MachineOperand &MO = MI.getOperand(i: OpIdx);
2154 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2155 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2156 MIRBuilder.buildInstr(Opc: TruncOpcode, DstOps: {MO}, SrcOps: {DstExt});
2157 MO.setReg(DstExt);
2158}
2159
2160void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2161 unsigned OpIdx, unsigned ExtOpcode) {
2162 MachineOperand &MO = MI.getOperand(i: OpIdx);
2163 Register DstTrunc = MRI.createGenericVirtualRegister(Ty: NarrowTy);
2164 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2165 MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {MO}, SrcOps: {DstTrunc});
2166 MO.setReg(DstTrunc);
2167}
2168
2169void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2170 unsigned OpIdx) {
2171 MachineOperand &MO = MI.getOperand(i: OpIdx);
2172 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2173 Register Dst = MO.getReg();
2174 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2175 MO.setReg(DstExt);
2176 MIRBuilder.buildDeleteTrailingVectorElements(Res: Dst, Op0: DstExt);
2177}
2178
2179void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2180 unsigned OpIdx) {
2181 MachineOperand &MO = MI.getOperand(i: OpIdx);
2182 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO).getReg(Idx: 0));
2183}
2184
2185void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2186 MachineOperand &Op = MI.getOperand(i: OpIdx);
2187 Op.setReg(MIRBuilder.buildBitcast(Dst: CastTy, Src: Op).getReg(Idx: 0));
2188}
2189
2190void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2191 MachineOperand &MO = MI.getOperand(i: OpIdx);
2192 Register CastDst = MRI.createGenericVirtualRegister(Ty: CastTy);
2193 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2194 MIRBuilder.buildBitcast(Dst: MO, Src: CastDst);
2195 MO.setReg(CastDst);
2196}
2197
// Widen the source operands (type index 1) of a G_MERGE_VALUES to WideTy.
// Two strategies are used: if WideTy covers the whole destination, the
// sources are zext'd, shifted and OR'd directly into one wide register;
// otherwise the sources are unmerged to the GCD of the source and wide sizes
// and remerged into WideTy-sized pieces (padding with undef as needed).
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  // Only the source type index can be widened here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
  if (DstTy.isVector())
    return UnableToLegalize;

  LLT SrcTy = MRI.getType(Reg: Src1Reg);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  // Number of WideTy pieces needed to cover the destination, rounded up.
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    // Seed the accumulator with the zero-extended first source; each later
    // source is zext'd, shifted to its bit offset, and OR'd in.
    Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src1Reg).getReg(Idx: 0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(i: I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);

      // The final OR can write straight into DstReg when no truncate or
      // inttoptr fix-up will be needed.
      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(Ty: WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
      auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
      MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
      ResultReg = NextResult;
    }

    // Fix up the accumulated wide value into the destination type.
    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(Res: DstReg, Op: ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = std::gcd(m: SrcSize, n: WideSize);
  LLT GCDTy = LLT::scalar(SizeInBits: GCD);

  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(SizeInBits: NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
    Register SrcReg = MO.getReg();
    if (GCD == SrcSize) {
      // Each source already is exactly one GCD-sized piece.
      Unmerges.push_back(Elt: SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Elt: Unmerge.getReg(Idx: J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  // NOTE(review): this compares the GCD-piece *count* against
  // NumMerge * WideSize, which is a *bit* count; the slicing below consumes
  // only NumMerge * (WideSize / GCD) pieces. The surplus undef entries are
  // never read, so this merely over-pads — confirm before tightening.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(Elt: UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(N: PartsPerGCD)) {
    auto Merge =
        MIRBuilder.buildMergeLikeInstr(Res: WideTy, Ops: Slicer.take_front(N: PartsPerGCD));
    NewMergeRegs.push_back(Elt: Merge.getReg(Idx: 0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMergeLikeInstr(Res: WideDstTy, Ops: NewMergeRegs);
    MIRBuilder.buildTrunc(Res: DstReg, Op: FinalMerge.getReg(Idx: 0));
  }

  MI.eraseFromParent();
  return Legalized;
}
2317
// Widen the scalar results (type index 0) of a G_UNMERGE_VALUES. If WideTy
// covers the whole source, the pieces are extracted with shifts and
// truncates; otherwise the source is anyext'd to the LCM of the source and
// wide types, unmerged to WideTy, and the pieces are re-unmerged/remerged to
// the original destination types, padding with dead defs as needed.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  // Only the result type index is handled here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(i: NumDst).getReg();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      // Pointers in non-integral address spaces must not be reinterpreted
      // as integers.
      if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(Dst: SrcTy, Src: SrcReg).getReg(Idx: 0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
    }

    // Theres no unmerge type to target. Directly extract the bits from the
    // source type
    unsigned DstSize = DstTy.getSizeInBits();

    // Result I lives at bit offset DstSize * I: shift it down and truncate.
    MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: DstSize * I);
      auto Shr = MIRBuilder.buildLShr(Dst: SrcTy, Src0: SrcReg, Src1: ShiftAmt);
      MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(OrigTy: SrcTy, TargetTy: WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(Res: LCMTy, Op: WideSrc).getReg(Idx: 0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(Res: WideTy, Op: WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  // %4:_(s192) = G_ANYEXT %0:_(s96)
  // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  // ; unpack to GCD type, with extra dead defs
  // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  // dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
  // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination
  // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(OrigTy: WideTy, TargetTy: DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(RegNo: MI.getOperand(i: Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(RegNo: MRI.createGenericVirtualRegister(Ty: DstTy));
        }
      }

      MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
    }
  } else {
    // Unpack every WideTy piece into GCD-sized parts, then remerge
    // PartsPerRemerge consecutive parts into each original destination.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, SrcReg: Unmerge.getReg(Idx: J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Args&: Parts[Idx]);
      }

      MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: I).getReg(), Ops: RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
2446
// Widen a G_EXTRACT. For TypeIdx 0 (the scalar result) the extract is lowered
// to a right-shift of the (possibly widened) source followed by a truncate.
// For TypeIdx 1 (the source) the source operand is anyext'd and, for vector
// sources, the bit offset is rescaled to the widened type.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(i: 2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(Dst: SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(Res: DstReg,
                            Op: MIRBuilder.buildAnyExtOrTrunc(Res: WideTy, Op: Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(Res: WideTy, Op: Src);
      ShiftTy = WideTy;
    }

    // Shift the requested bits down to bit 0, then truncate to the result.
    auto LShr = MIRBuilder.buildLShr(
        Dst: ShiftTy, Src0: Src, Src1: MIRBuilder.buildConstant(Res: ShiftTy, Val: Offset));
    MIRBuilder.buildTrunc(Res: DstReg, Op: LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    // Scalar source: anyext it; the offset stays valid because the low bits
    // are unchanged.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  // Vector source: only whole-element extracts of the element type are
  // supported.
  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);

  // Rescale the bit offset proportionally to the widened element size.
  MI.getOperand(i: 2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                           Offset);
  widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0);
  Observer.changedInstr(MI);
  return Legalized;
}
2520
2521LegalizerHelper::LegalizeResult
2522LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2523 LLT WideTy) {
2524 if (TypeIdx != 0 || WideTy.isVector())
2525 return UnableToLegalize;
2526 Observer.changingInstr(MI);
2527 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2528 widenScalarDst(MI, WideTy);
2529 Observer.changedInstr(MI);
2530 return Legalized;
2531}
2532
2533LegalizerHelper::LegalizeResult
2534LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2535 LLT WideTy) {
2536 unsigned Opcode;
2537 unsigned ExtOpcode;
2538 std::optional<Register> CarryIn;
2539 switch (MI.getOpcode()) {
2540 default:
2541 llvm_unreachable("Unexpected opcode!");
2542 case TargetOpcode::G_SADDO:
2543 Opcode = TargetOpcode::G_ADD;
2544 ExtOpcode = TargetOpcode::G_SEXT;
2545 break;
2546 case TargetOpcode::G_SSUBO:
2547 Opcode = TargetOpcode::G_SUB;
2548 ExtOpcode = TargetOpcode::G_SEXT;
2549 break;
2550 case TargetOpcode::G_UADDO:
2551 Opcode = TargetOpcode::G_ADD;
2552 ExtOpcode = TargetOpcode::G_ZEXT;
2553 break;
2554 case TargetOpcode::G_USUBO:
2555 Opcode = TargetOpcode::G_SUB;
2556 ExtOpcode = TargetOpcode::G_ZEXT;
2557 break;
2558 case TargetOpcode::G_SADDE:
2559 Opcode = TargetOpcode::G_UADDE;
2560 ExtOpcode = TargetOpcode::G_SEXT;
2561 CarryIn = MI.getOperand(i: 4).getReg();
2562 break;
2563 case TargetOpcode::G_SSUBE:
2564 Opcode = TargetOpcode::G_USUBE;
2565 ExtOpcode = TargetOpcode::G_SEXT;
2566 CarryIn = MI.getOperand(i: 4).getReg();
2567 break;
2568 case TargetOpcode::G_UADDE:
2569 Opcode = TargetOpcode::G_UADDE;
2570 ExtOpcode = TargetOpcode::G_ZEXT;
2571 CarryIn = MI.getOperand(i: 4).getReg();
2572 break;
2573 case TargetOpcode::G_USUBE:
2574 Opcode = TargetOpcode::G_USUBE;
2575 ExtOpcode = TargetOpcode::G_ZEXT;
2576 CarryIn = MI.getOperand(i: 4).getReg();
2577 break;
2578 }
2579
2580 if (TypeIdx == 1) {
2581 unsigned BoolExtOp = MIRBuilder.getBoolExtOp(IsVec: WideTy.isVector(), IsFP: false);
2582
2583 Observer.changingInstr(MI);
2584 if (CarryIn)
2585 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: BoolExtOp);
2586 widenScalarDst(MI, WideTy, OpIdx: 1);
2587
2588 Observer.changedInstr(MI);
2589 return Legalized;
2590 }
2591
2592 auto LHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
2593 auto RHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 3)});
2594 // Do the arithmetic in the larger type.
2595 Register NewOp;
2596 if (CarryIn) {
2597 LLT CarryOutTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
2598 NewOp = MIRBuilder
2599 .buildInstr(Opc: Opcode, DstOps: {WideTy, CarryOutTy},
2600 SrcOps: {LHSExt, RHSExt, *CarryIn})
2601 .getReg(Idx: 0);
2602 } else {
2603 NewOp = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {WideTy}, SrcOps: {LHSExt, RHSExt}).getReg(Idx: 0);
2604 }
2605 LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
2606 auto TruncOp = MIRBuilder.buildTrunc(Res: OrigTy, Op: NewOp);
2607 auto ExtOp = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {TruncOp});
2608 // There is no overflow if the ExtOp is the same as NewOp.
2609 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 1), Op0: NewOp, Op1: ExtOp);
2610 // Now trunc the NewOp to the original result.
2611 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0), Op: NewOp);
2612 MI.eraseFromParent();
2613 return Legalized;
2614}
2615
2616LegalizerHelper::LegalizeResult
2617LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2618 LLT WideTy) {
2619 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2620 MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2621 MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2622 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2623 MI.getOpcode() == TargetOpcode::G_USHLSAT;
2624 // We can convert this to:
2625 // 1. Any extend iN to iM
2626 // 2. SHL by M-N
2627 // 3. [US][ADD|SUB|SHL]SAT
2628 // 4. L/ASHR by M-N
2629 //
2630 // It may be more efficient to lower this to a min and a max operation in
2631 // the higher precision arithmetic if the promoted operation isn't legal,
2632 // but this decision is up to the target's lowering request.
2633 Register DstReg = MI.getOperand(i: 0).getReg();
2634
2635 unsigned NewBits = WideTy.getScalarSizeInBits();
2636 unsigned SHLAmount = NewBits - MRI.getType(Reg: DstReg).getScalarSizeInBits();
2637
2638 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2639 // must not left shift the RHS to preserve the shift amount.
2640 auto LHS = MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 1));
2641 auto RHS = IsShift ? MIRBuilder.buildZExt(Res: WideTy, Op: MI.getOperand(i: 2))
2642 : MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 2));
2643 auto ShiftK = MIRBuilder.buildConstant(Res: WideTy, Val: SHLAmount);
2644 auto ShiftL = MIRBuilder.buildShl(Dst: WideTy, Src0: LHS, Src1: ShiftK);
2645 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(Dst: WideTy, Src0: RHS, Src1: ShiftK);
2646
2647 auto WideInst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {WideTy},
2648 SrcOps: {ShiftL, ShiftR}, Flags: MI.getFlags());
2649
2650 // Use a shift that will preserve the number of sign bits when the trunc is
2651 // folded away.
2652 auto Result = IsSigned ? MIRBuilder.buildAShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK)
2653 : MIRBuilder.buildLShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK);
2654
2655 MIRBuilder.buildTrunc(Res: DstReg, Op: Result);
2656 MI.eraseFromParent();
2657 return Legalized;
2658}
2659
2660LegalizerHelper::LegalizeResult
2661LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2662 LLT WideTy) {
2663 if (TypeIdx == 1) {
2664 Observer.changingInstr(MI);
2665 widenScalarDst(MI, WideTy, OpIdx: 1);
2666 Observer.changedInstr(MI);
2667 return Legalized;
2668 }
2669
2670 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2671 auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2672 LLT SrcTy = MRI.getType(Reg: LHS);
2673 LLT OverflowTy = MRI.getType(Reg: OriginalOverflow);
2674 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2675
2676 // To determine if the result overflowed in the larger type, we extend the
2677 // input to the larger type, do the multiply (checking if it overflows),
2678 // then also check the high bits of the result to see if overflow happened
2679 // there.
2680 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2681 auto LeftOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {LHS});
2682 auto RightOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {RHS});
2683
2684 // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2685 // so we don't need to check the overflow result of larger type Mulo.
2686 bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2687
2688 unsigned MulOpc =
2689 WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2690
2691 MachineInstrBuilder Mulo;
2692 if (WideMulCanOverflow)
2693 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy, OverflowTy},
2694 SrcOps: {LeftOperand, RightOperand});
2695 else
2696 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy}, SrcOps: {LeftOperand, RightOperand});
2697
2698 auto Mul = Mulo->getOperand(i: 0);
2699 MIRBuilder.buildTrunc(Res: Result, Op: Mul);
2700
2701 MachineInstrBuilder ExtResult;
2702 // Overflow occurred if it occurred in the larger type, or if the high part
2703 // of the result does not zero/sign-extend the low part. Check this second
2704 // possibility first.
2705 if (IsSigned) {
2706 // For signed, overflow occurred when the high part does not sign-extend
2707 // the low part.
2708 ExtResult = MIRBuilder.buildSExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2709 } else {
2710 // Unsigned overflow occurred when the high part does not zero-extend the
2711 // low part.
2712 ExtResult = MIRBuilder.buildZExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2713 }
2714
2715 if (WideMulCanOverflow) {
2716 auto Overflow =
2717 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OverflowTy, Op0: Mul, Op1: ExtResult);
2718 // Finally check if the multiplication in the larger type itself overflowed.
2719 MIRBuilder.buildOr(Dst: OriginalOverflow, Src0: Mulo->getOperand(i: 1), Src1: Overflow);
2720 } else {
2721 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OriginalOverflow, Op0: Mul, Op1: ExtResult);
2722 }
2723 MI.eraseFromParent();
2724 return Legalized;
2725}
2726
2727LegalizerHelper::LegalizeResult
2728LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2729 unsigned Opcode = MI.getOpcode();
2730 switch (Opcode) {
2731 default:
2732 return UnableToLegalize;
2733 case TargetOpcode::G_ATOMICRMW_XCHG:
2734 case TargetOpcode::G_ATOMICRMW_ADD:
2735 case TargetOpcode::G_ATOMICRMW_SUB:
2736 case TargetOpcode::G_ATOMICRMW_AND:
2737 case TargetOpcode::G_ATOMICRMW_OR:
2738 case TargetOpcode::G_ATOMICRMW_XOR:
2739 case TargetOpcode::G_ATOMICRMW_MIN:
2740 case TargetOpcode::G_ATOMICRMW_MAX:
2741 case TargetOpcode::G_ATOMICRMW_UMIN:
2742 case TargetOpcode::G_ATOMICRMW_UMAX:
2743 assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2744 Observer.changingInstr(MI);
2745 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2746 widenScalarDst(MI, WideTy, OpIdx: 0);
2747 Observer.changedInstr(MI);
2748 return Legalized;
2749 case TargetOpcode::G_ATOMIC_CMPXCHG:
2750 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2751 Observer.changingInstr(MI);
2752 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2753 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2754 widenScalarDst(MI, WideTy, OpIdx: 0);
2755 Observer.changedInstr(MI);
2756 return Legalized;
2757 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2758 if (TypeIdx == 0) {
2759 Observer.changingInstr(MI);
2760 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2761 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: TargetOpcode::G_ANYEXT);
2762 widenScalarDst(MI, WideTy, OpIdx: 0);
2763 Observer.changedInstr(MI);
2764 return Legalized;
2765 }
2766 assert(TypeIdx == 1 &&
2767 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2768 Observer.changingInstr(MI);
2769 widenScalarDst(MI, WideTy, OpIdx: 1);
2770 Observer.changedInstr(MI);
2771 return Legalized;
2772 case TargetOpcode::G_EXTRACT:
2773 return widenScalarExtract(MI, TypeIdx, WideTy);
2774 case TargetOpcode::G_INSERT:
2775 return widenScalarInsert(MI, TypeIdx, WideTy);
2776 case TargetOpcode::G_MERGE_VALUES:
2777 return widenScalarMergeValues(MI, TypeIdx, WideTy);
2778 case TargetOpcode::G_UNMERGE_VALUES:
2779 return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2780 case TargetOpcode::G_SADDO:
2781 case TargetOpcode::G_SSUBO:
2782 case TargetOpcode::G_UADDO:
2783 case TargetOpcode::G_USUBO:
2784 case TargetOpcode::G_SADDE:
2785 case TargetOpcode::G_SSUBE:
2786 case TargetOpcode::G_UADDE:
2787 case TargetOpcode::G_USUBE:
2788 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2789 case TargetOpcode::G_UMULO:
2790 case TargetOpcode::G_SMULO:
2791 return widenScalarMulo(MI, TypeIdx, WideTy);
2792 case TargetOpcode::G_SADDSAT:
2793 case TargetOpcode::G_SSUBSAT:
2794 case TargetOpcode::G_SSHLSAT:
2795 case TargetOpcode::G_UADDSAT:
2796 case TargetOpcode::G_USUBSAT:
2797 case TargetOpcode::G_USHLSAT:
2798 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2799 case TargetOpcode::G_CTTZ:
2800 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2801 case TargetOpcode::G_CTLZ:
2802 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2803 case TargetOpcode::G_CTLS:
2804 case TargetOpcode::G_CTPOP: {
2805 if (TypeIdx == 0) {
2806 Observer.changingInstr(MI);
2807 widenScalarDst(MI, WideTy, OpIdx: 0);
2808 Observer.changedInstr(MI);
2809 return Legalized;
2810 }
2811
2812 Register SrcReg = MI.getOperand(i: 1).getReg();
2813
2814 // First extend the input.
2815 unsigned ExtOpc;
2816 switch (Opcode) {
2817 case TargetOpcode::G_CTTZ:
2818 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2819 case TargetOpcode::G_CTLZ_ZERO_UNDEF: // undef bits shifted out below
2820 ExtOpc = TargetOpcode::G_ANYEXT;
2821 break;
2822 case TargetOpcode::G_CTLS:
2823 ExtOpc = TargetOpcode::G_SEXT;
2824 break;
2825 default:
2826 ExtOpc = TargetOpcode::G_ZEXT;
2827 }
2828
2829 auto MIBSrc = MIRBuilder.buildInstr(Opc: ExtOpc, DstOps: {WideTy}, SrcOps: {SrcReg});
2830 LLT CurTy = MRI.getType(Reg: SrcReg);
2831 unsigned NewOpc = Opcode;
2832 if (NewOpc == TargetOpcode::G_CTTZ) {
2833 // The count is the same in the larger type except if the original
2834 // value was zero. This can be handled by setting the bit just off
2835 // the top of the original type.
2836 auto TopBit =
2837 APInt::getOneBitSet(numBits: WideTy.getSizeInBits(), BitNo: CurTy.getSizeInBits());
2838 MIBSrc = MIRBuilder.buildOr(
2839 Dst: WideTy, Src0: MIBSrc, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: TopBit));
2840 // Now we know the operand is non-zero, use the more relaxed opcode.
2841 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2842 }
2843
2844 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2845
2846 if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2847 // An optimization where the result is the CTLZ after the left shift by
2848 // (Difference in widety and current ty), that is,
2849 // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
2850 // Result = ctlz MIBSrc
2851 MIBSrc = MIRBuilder.buildShl(Dst: WideTy, Src0: MIBSrc,
2852 Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2853 }
2854
2855 // Perform the operation at the larger size.
2856 auto MIBNewOp = MIRBuilder.buildInstr(Opc: NewOpc, DstOps: {WideTy}, SrcOps: {MIBSrc});
2857 // This is already the correct result for CTPOP and CTTZs
2858 if (Opcode == TargetOpcode::G_CTLZ || Opcode == TargetOpcode::G_CTLS) {
2859 // The correct result is NewOp - (Difference in widety and current ty).
2860 MIBNewOp = MIRBuilder.buildSub(
2861 Dst: WideTy, Src0: MIBNewOp, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2862 }
2863
2864 MIRBuilder.buildZExtOrTrunc(Res: MI.getOperand(i: 0), Op: MIBNewOp);
2865 MI.eraseFromParent();
2866 return Legalized;
2867 }
2868 case TargetOpcode::G_BSWAP: {
2869 Observer.changingInstr(MI);
2870 Register DstReg = MI.getOperand(i: 0).getReg();
2871
2872 Register ShrReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2873 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2874 Register ShiftAmtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2875 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2876
2877 MI.getOperand(i: 0).setReg(DstExt);
2878
2879 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2880
2881 LLT Ty = MRI.getType(Reg: DstReg);
2882 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2883 MIRBuilder.buildConstant(Res: ShiftAmtReg, Val: DiffBits);
2884 MIRBuilder.buildLShr(Dst: ShrReg, Src0: DstExt, Src1: ShiftAmtReg);
2885
2886 MIRBuilder.buildTrunc(Res: DstReg, Op: ShrReg);
2887 Observer.changedInstr(MI);
2888 return Legalized;
2889 }
2890 case TargetOpcode::G_BITREVERSE: {
2891 Observer.changingInstr(MI);
2892
2893 Register DstReg = MI.getOperand(i: 0).getReg();
2894 LLT Ty = MRI.getType(Reg: DstReg);
2895 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2896
2897 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2898 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2899 MI.getOperand(i: 0).setReg(DstExt);
2900 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2901
2902 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: DiffBits);
2903 auto Shift = MIRBuilder.buildLShr(Dst: WideTy, Src0: DstExt, Src1: ShiftAmt);
2904 MIRBuilder.buildTrunc(Res: DstReg, Op: Shift);
2905 Observer.changedInstr(MI);
2906 return Legalized;
2907 }
2908 case TargetOpcode::G_FREEZE:
2909 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
2910 Observer.changingInstr(MI);
2911 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2912 widenScalarDst(MI, WideTy);
2913 Observer.changedInstr(MI);
2914 return Legalized;
2915
2916 case TargetOpcode::G_ABS:
2917 Observer.changingInstr(MI);
2918 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2919 widenScalarDst(MI, WideTy);
2920 Observer.changedInstr(MI);
2921 return Legalized;
2922
2923 case TargetOpcode::G_ADD:
2924 case TargetOpcode::G_AND:
2925 case TargetOpcode::G_MUL:
2926 case TargetOpcode::G_OR:
2927 case TargetOpcode::G_XOR:
2928 case TargetOpcode::G_SUB:
2929 case TargetOpcode::G_SHUFFLE_VECTOR:
    // Perform operation at larger width (any extension is fine here, high bits
2931 // don't affect the result) and then truncate the result back to the
2932 // original type.
2933 Observer.changingInstr(MI);
2934 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2935 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2936 widenScalarDst(MI, WideTy);
2937 Observer.changedInstr(MI);
2938 return Legalized;
2939
2940 case TargetOpcode::G_SBFX:
2941 case TargetOpcode::G_UBFX:
2942 Observer.changingInstr(MI);
2943
2944 if (TypeIdx == 0) {
2945 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2946 widenScalarDst(MI, WideTy);
2947 } else {
2948 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2949 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2950 }
2951
2952 Observer.changedInstr(MI);
2953 return Legalized;
2954
2955 case TargetOpcode::G_SHL:
2956 Observer.changingInstr(MI);
2957
2958 if (TypeIdx == 0) {
2959 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2960 widenScalarDst(MI, WideTy);
2961 } else {
2962 assert(TypeIdx == 1);
2963 // The "number of bits to shift" operand must preserve its value as an
2964 // unsigned integer:
2965 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2966 }
2967
2968 Observer.changedInstr(MI);
2969 return Legalized;
2970
2971 case TargetOpcode::G_ROTR:
2972 case TargetOpcode::G_ROTL:
2973 if (TypeIdx != 1)
2974 return UnableToLegalize;
2975
2976 Observer.changingInstr(MI);
2977 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2978 Observer.changedInstr(MI);
2979 return Legalized;
2980
2981 case TargetOpcode::G_SDIV:
2982 case TargetOpcode::G_SREM:
2983 case TargetOpcode::G_SMIN:
2984 case TargetOpcode::G_SMAX:
2985 case TargetOpcode::G_ABDS:
2986 Observer.changingInstr(MI);
2987 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2988 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2989 widenScalarDst(MI, WideTy);
2990 Observer.changedInstr(MI);
2991 return Legalized;
2992
2993 case TargetOpcode::G_SDIVREM:
2994 Observer.changingInstr(MI);
2995 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2996 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
2997 widenScalarDst(MI, WideTy);
2998 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
2999 widenScalarDst(MI, WideTy, OpIdx: 1);
3000 Observer.changedInstr(MI);
3001 return Legalized;
3002
3003 case TargetOpcode::G_ASHR:
3004 case TargetOpcode::G_LSHR:
3005 Observer.changingInstr(MI);
3006
3007 if (TypeIdx == 0) {
3008 unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
3009 : TargetOpcode::G_ZEXT;
3010
3011 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: CvtOp);
3012 widenScalarDst(MI, WideTy);
3013 } else {
3014 assert(TypeIdx == 1);
3015 // The "number of bits to shift" operand must preserve its value as an
3016 // unsigned integer:
3017 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3018 }
3019
3020 Observer.changedInstr(MI);
3021 return Legalized;
3022 case TargetOpcode::G_UDIV:
3023 case TargetOpcode::G_UREM:
3024 case TargetOpcode::G_ABDU:
3025 Observer.changingInstr(MI);
3026 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3027 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3028 widenScalarDst(MI, WideTy);
3029 Observer.changedInstr(MI);
3030 return Legalized;
3031 case TargetOpcode::G_UDIVREM:
3032 Observer.changingInstr(MI);
3033 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3034 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
3035 widenScalarDst(MI, WideTy);
3036 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3037 widenScalarDst(MI, WideTy, OpIdx: 1);
3038 Observer.changedInstr(MI);
3039 return Legalized;
3040 case TargetOpcode::G_UMIN:
3041 case TargetOpcode::G_UMAX: {
3042 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3043
3044 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3045 unsigned ExtOpc =
3046 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty, Ctx),
3047 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx))
3048 ? TargetOpcode::G_SEXT
3049 : TargetOpcode::G_ZEXT;
3050
3051 Observer.changingInstr(MI);
3052 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: ExtOpc);
3053 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: ExtOpc);
3054 widenScalarDst(MI, WideTy);
3055 Observer.changedInstr(MI);
3056 return Legalized;
3057 }
3058
3059 case TargetOpcode::G_SELECT:
3060 Observer.changingInstr(MI);
3061 if (TypeIdx == 0) {
3062 // Perform operation at larger width (any extension is fine here, high
3063 // bits don't affect the result) and then truncate the result back to the
3064 // original type.
3065 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3066 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
3067 widenScalarDst(MI, WideTy);
3068 } else {
3069 bool IsVec = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector();
3070 // Explicit extension is required here since high bits affect the result.
3071 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec, IsFP: false));
3072 }
3073 Observer.changedInstr(MI);
3074 return Legalized;
3075
3076 case TargetOpcode::G_FPEXT:
3077 if (TypeIdx != 1)
3078 return UnableToLegalize;
3079
3080 Observer.changingInstr(MI);
3081 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3082 Observer.changedInstr(MI);
3083 return Legalized;
3084 case TargetOpcode::G_FPTOSI:
3085 case TargetOpcode::G_FPTOUI:
3086 case TargetOpcode::G_INTRINSIC_LRINT:
3087 case TargetOpcode::G_INTRINSIC_LLRINT:
3088 case TargetOpcode::G_IS_FPCLASS:
3089 Observer.changingInstr(MI);
3090
3091 if (TypeIdx == 0)
3092 widenScalarDst(MI, WideTy);
3093 else
3094 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3095
3096 Observer.changedInstr(MI);
3097 return Legalized;
3098 case TargetOpcode::G_SITOFP:
3099 Observer.changingInstr(MI);
3100
3101 if (TypeIdx == 0)
3102 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3103 else
3104 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
3105
3106 Observer.changedInstr(MI);
3107 return Legalized;
3108 case TargetOpcode::G_UITOFP:
3109 Observer.changingInstr(MI);
3110
3111 if (TypeIdx == 0)
3112 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3113 else
3114 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3115
3116 Observer.changedInstr(MI);
3117 return Legalized;
3118 case TargetOpcode::G_FPTOSI_SAT:
3119 case TargetOpcode::G_FPTOUI_SAT:
3120 Observer.changingInstr(MI);
3121
3122 if (TypeIdx == 0) {
3123 Register OldDst = MI.getOperand(i: 0).getReg();
3124 LLT Ty = MRI.getType(Reg: OldDst);
3125 Register ExtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
3126 Register NewDst;
3127 MI.getOperand(i: 0).setReg(ExtReg);
3128 uint64_t ShortBits = Ty.getScalarSizeInBits();
3129 uint64_t WideBits = WideTy.getScalarSizeInBits();
3130 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
3131 if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
3132 // z = i16 fptosi_sat(a)
3133 // ->
3134 // x = i32 fptosi_sat(a)
3135 // y = smin(x, 32767)
3136 // z = smax(y, -32768)
3137 auto MaxVal = MIRBuilder.buildConstant(
3138 Res: WideTy, Val: APInt::getSignedMaxValue(numBits: ShortBits).sext(width: WideBits));
3139 auto MinVal = MIRBuilder.buildConstant(
3140 Res: WideTy, Val: APInt::getSignedMinValue(numBits: ShortBits).sext(width: WideBits));
3141 Register MidReg =
3142 MIRBuilder.buildSMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3143 NewDst = MIRBuilder.buildSMax(Dst: WideTy, Src0: MidReg, Src1: MinVal).getReg(Idx: 0);
3144 } else {
3145 // z = i16 fptoui_sat(a)
3146 // ->
3147 // x = i32 fptoui_sat(a)
3148 // y = smin(x, 65535)
3149 auto MaxVal = MIRBuilder.buildConstant(
3150 Res: WideTy, Val: APInt::getAllOnes(numBits: ShortBits).zext(width: WideBits));
3151 NewDst = MIRBuilder.buildUMin(Dst: WideTy, Src0: ExtReg, Src1: MaxVal).getReg(Idx: 0);
3152 }
3153 MIRBuilder.buildTrunc(Res: OldDst, Op: NewDst);
3154 } else
3155 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3156
3157 Observer.changedInstr(MI);
3158 return Legalized;
3159 case TargetOpcode::G_LOAD:
3160 case TargetOpcode::G_SEXTLOAD:
3161 case TargetOpcode::G_ZEXTLOAD:
3162 Observer.changingInstr(MI);
3163 widenScalarDst(MI, WideTy);
3164 Observer.changedInstr(MI);
3165 return Legalized;
3166
3167 case TargetOpcode::G_STORE: {
3168 if (TypeIdx != 0)
3169 return UnableToLegalize;
3170
3171 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3172 assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
3173 if (!Ty.isScalar()) {
3174 // We need to widen the vector element type.
3175 Observer.changingInstr(MI);
3176 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ANYEXT);
3177 // We also need to adjust the MMO to turn this into a truncating store.
3178 MachineMemOperand &MMO = **MI.memoperands_begin();
3179 MachineFunction &MF = MIRBuilder.getMF();
3180 auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty);
3181 MI.setMemRefs(MF, MemRefs: {NewMMO});
3182 Observer.changedInstr(MI);
3183 return Legalized;
3184 }
3185
3186 Observer.changingInstr(MI);
3187
3188 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
3189 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
3190 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: ExtType);
3191
3192 Observer.changedInstr(MI);
3193 return Legalized;
3194 }
3195 case TargetOpcode::G_CONSTANT: {
3196 MachineOperand &SrcMO = MI.getOperand(i: 1);
3197 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3198 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
3199 SmallTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
3200 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
3201 ExtOpc == TargetOpcode::G_ANYEXT) &&
3202 "Illegal Extend");
3203 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3204 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
3205 ? SrcVal.sext(width: WideTy.getSizeInBits())
3206 : SrcVal.zext(width: WideTy.getSizeInBits());
3207 Observer.changingInstr(MI);
3208 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3209
3210 widenScalarDst(MI, WideTy);
3211 Observer.changedInstr(MI);
3212 return Legalized;
3213 }
3214 case TargetOpcode::G_FCONSTANT: {
3215 // To avoid changing the bits of the constant due to extension to a larger
3216 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
3217 MachineOperand &SrcMO = MI.getOperand(i: 1);
3218 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
3219 MIRBuilder.setInstrAndDebugLoc(MI);
3220 auto IntCst = MIRBuilder.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val);
3221 widenScalarDst(MI&: *IntCst, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3222 MI.eraseFromParent();
3223 return Legalized;
3224 }
3225 case TargetOpcode::G_IMPLICIT_DEF: {
3226 Observer.changingInstr(MI);
3227 widenScalarDst(MI, WideTy);
3228 Observer.changedInstr(MI);
3229 return Legalized;
3230 }
3231 case TargetOpcode::G_BRCOND:
3232 Observer.changingInstr(MI);
3233 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec: false, IsFP: false));
3234 Observer.changedInstr(MI);
3235 return Legalized;
3236
3237 case TargetOpcode::G_FCMP:
3238 Observer.changingInstr(MI);
3239 if (TypeIdx == 0)
3240 widenScalarDst(MI, WideTy);
3241 else {
3242 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3243 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_FPEXT);
3244 }
3245 Observer.changedInstr(MI);
3246 return Legalized;
3247
3248 case TargetOpcode::G_ICMP:
3249 Observer.changingInstr(MI);
3250 if (TypeIdx == 0)
3251 widenScalarDst(MI, WideTy);
3252 else {
3253 LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 2).getReg());
3254 CmpInst::Predicate Pred =
3255 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
3256
3257 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3258 unsigned ExtOpcode =
3259 (CmpInst::isSigned(predicate: Pred) ||
3260 TLI.isSExtCheaperThanZExt(FromTy: getApproximateEVTForLLT(Ty: SrcTy, Ctx),
3261 ToTy: getApproximateEVTForLLT(Ty: WideTy, Ctx)))
3262 ? TargetOpcode::G_SEXT
3263 : TargetOpcode::G_ZEXT;
3264 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode);
3265 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode);
3266 }
3267 Observer.changedInstr(MI);
3268 return Legalized;
3269
3270 case TargetOpcode::G_PTR_ADD:
3271 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
3272 Observer.changingInstr(MI);
3273 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3274 Observer.changedInstr(MI);
3275 return Legalized;
3276
3277 case TargetOpcode::G_PHI: {
3278 assert(TypeIdx == 0 && "Expecting only Idx 0");
3279
3280 Observer.changingInstr(MI);
3281 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
3282 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
3283 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
3284 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3285 }
3286
3287 MachineBasicBlock &MBB = *MI.getParent();
3288 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
3289 widenScalarDst(MI, WideTy);
3290 Observer.changedInstr(MI);
3291 return Legalized;
3292 }
3293 case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
3294 if (TypeIdx == 0) {
3295 Register VecReg = MI.getOperand(i: 1).getReg();
3296 LLT VecTy = MRI.getType(Reg: VecReg);
3297 Observer.changingInstr(MI);
3298
3299 widenScalarSrc(
3300 MI,
3301 WideTy: VecTy.changeVectorElementType(NewEltTy: LLT::scalar(SizeInBits: WideTy.getSizeInBits())), OpIdx: 1,
3302 ExtOpcode: TargetOpcode::G_ANYEXT);
3303
3304 widenScalarDst(MI, WideTy, OpIdx: 0);
3305 Observer.changedInstr(MI);
3306 return Legalized;
3307 }
3308
3309 if (TypeIdx != 2)
3310 return UnableToLegalize;
3311 Observer.changingInstr(MI);
3312 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3313 Observer.changedInstr(MI);
3314 return Legalized;
3315 }
3316 case TargetOpcode::G_INSERT_VECTOR_ELT: {
3317 if (TypeIdx == 0) {
3318 Observer.changingInstr(MI);
3319 const LLT WideEltTy = WideTy.getElementType();
3320
3321 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3322 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3323 widenScalarDst(MI, WideTy, OpIdx: 0);
3324 Observer.changedInstr(MI);
3325 return Legalized;
3326 }
3327
3328 if (TypeIdx == 1) {
3329 Observer.changingInstr(MI);
3330
3331 Register VecReg = MI.getOperand(i: 1).getReg();
3332 LLT VecTy = MRI.getType(Reg: VecReg);
3333 LLT WideVecTy = VecTy.changeVectorElementType(NewEltTy: WideTy);
3334
3335 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3336 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
3337 widenScalarDst(MI, WideTy: WideVecTy, OpIdx: 0);
3338 Observer.changedInstr(MI);
3339 return Legalized;
3340 }
3341
3342 if (TypeIdx == 2) {
3343 Observer.changingInstr(MI);
3344 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
3345 Observer.changedInstr(MI);
3346 return Legalized;
3347 }
3348
3349 return UnableToLegalize;
3350 }
3351 case TargetOpcode::G_FADD:
3352 case TargetOpcode::G_FMUL:
3353 case TargetOpcode::G_FSUB:
3354 case TargetOpcode::G_FMA:
3355 case TargetOpcode::G_FMAD:
3356 case TargetOpcode::G_FNEG:
3357 case TargetOpcode::G_FABS:
3358 case TargetOpcode::G_FCANONICALIZE:
3359 case TargetOpcode::G_FMINNUM:
3360 case TargetOpcode::G_FMAXNUM:
3361 case TargetOpcode::G_FMINNUM_IEEE:
3362 case TargetOpcode::G_FMAXNUM_IEEE:
3363 case TargetOpcode::G_FMINIMUM:
3364 case TargetOpcode::G_FMAXIMUM:
3365 case TargetOpcode::G_FMINIMUMNUM:
3366 case TargetOpcode::G_FMAXIMUMNUM:
3367 case TargetOpcode::G_FDIV:
3368 case TargetOpcode::G_FREM:
3369 case TargetOpcode::G_FCEIL:
3370 case TargetOpcode::G_FFLOOR:
3371 case TargetOpcode::G_FCOS:
3372 case TargetOpcode::G_FSIN:
3373 case TargetOpcode::G_FTAN:
3374 case TargetOpcode::G_FACOS:
3375 case TargetOpcode::G_FASIN:
3376 case TargetOpcode::G_FATAN:
3377 case TargetOpcode::G_FATAN2:
3378 case TargetOpcode::G_FCOSH:
3379 case TargetOpcode::G_FSINH:
3380 case TargetOpcode::G_FTANH:
3381 case TargetOpcode::G_FLOG10:
3382 case TargetOpcode::G_FLOG:
3383 case TargetOpcode::G_FLOG2:
3384 case TargetOpcode::G_FRINT:
3385 case TargetOpcode::G_FNEARBYINT:
3386 case TargetOpcode::G_FSQRT:
3387 case TargetOpcode::G_FEXP:
3388 case TargetOpcode::G_FEXP2:
3389 case TargetOpcode::G_FEXP10:
3390 case TargetOpcode::G_FPOW:
3391 case TargetOpcode::G_INTRINSIC_TRUNC:
3392 case TargetOpcode::G_INTRINSIC_ROUND:
3393 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
3394 assert(TypeIdx == 0);
3395 Observer.changingInstr(MI);
3396
3397 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3398 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_FPEXT);
3399
3400 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3401 Observer.changedInstr(MI);
3402 return Legalized;
3403 case TargetOpcode::G_FMODF: {
3404 Observer.changingInstr(MI);
3405 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3406
3407 widenScalarDst(MI, WideTy, OpIdx: 1, TruncOpcode: TargetOpcode::G_FPTRUNC);
3408 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: --MIRBuilder.getInsertPt());
3409 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3410 Observer.changedInstr(MI);
3411 return Legalized;
3412 }
3413 case TargetOpcode::G_FPOWI:
3414 case TargetOpcode::G_FLDEXP:
3415 case TargetOpcode::G_STRICT_FLDEXP: {
3416 if (TypeIdx == 0) {
3417 if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
3418 return UnableToLegalize;
3419
3420 Observer.changingInstr(MI);
3421 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3422 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3423 Observer.changedInstr(MI);
3424 return Legalized;
3425 }
3426
3427 if (TypeIdx == 1) {
3428 // For some reason SelectionDAG tries to promote to a libcall without
3429 // actually changing the integer type for promotion.
3430 Observer.changingInstr(MI);
3431 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
3432 Observer.changedInstr(MI);
3433 return Legalized;
3434 }
3435
3436 return UnableToLegalize;
3437 }
3438 case TargetOpcode::G_FFREXP: {
3439 Observer.changingInstr(MI);
3440
3441 if (TypeIdx == 0) {
3442 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
3443 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3444 } else {
3445 widenScalarDst(MI, WideTy, OpIdx: 1);
3446 }
3447
3448 Observer.changedInstr(MI);
3449 return Legalized;
3450 }
3451 case TargetOpcode::G_LROUND:
3452 case TargetOpcode::G_LLROUND:
3453 Observer.changingInstr(MI);
3454
3455 if (TypeIdx == 0)
3456 widenScalarDst(MI, WideTy);
3457 else
3458 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3459
3460 Observer.changedInstr(MI);
3461 return Legalized;
3462
3463 case TargetOpcode::G_INTTOPTR:
3464 if (TypeIdx != 1)
3465 return UnableToLegalize;
3466
3467 Observer.changingInstr(MI);
3468 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
3469 Observer.changedInstr(MI);
3470 return Legalized;
3471 case TargetOpcode::G_PTRTOINT:
3472 if (TypeIdx != 0)
3473 return UnableToLegalize;
3474
3475 Observer.changingInstr(MI);
3476 widenScalarDst(MI, WideTy, OpIdx: 0);
3477 Observer.changedInstr(MI);
3478 return Legalized;
3479 case TargetOpcode::G_BUILD_VECTOR: {
3480 Observer.changingInstr(MI);
3481
3482 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3483 for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3484 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
3485
3486 // Avoid changing the result vector type if the source element type was
3487 // requested.
3488 if (TypeIdx == 1) {
3489 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::G_BUILD_VECTOR_TRUNC));
3490 } else {
3491 widenScalarDst(MI, WideTy, OpIdx: 0);
3492 }
3493
3494 Observer.changedInstr(MI);
3495 return Legalized;
3496 }
3497 case TargetOpcode::G_SEXT_INREG:
3498 if (TypeIdx != 0)
3499 return UnableToLegalize;
3500
3501 Observer.changingInstr(MI);
3502 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3503 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3504 Observer.changedInstr(MI);
3505 return Legalized;
3506 case TargetOpcode::G_PTRMASK: {
3507 if (TypeIdx != 1)
3508 return UnableToLegalize;
3509 Observer.changingInstr(MI);
3510 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
3511 Observer.changedInstr(MI);
3512 return Legalized;
3513 }
3514 case TargetOpcode::G_VECREDUCE_ADD: {
3515 if (TypeIdx != 1)
3516 return UnableToLegalize;
3517 Observer.changingInstr(MI);
3518 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3519 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
3520 Observer.changedInstr(MI);
3521 return Legalized;
3522 }
3523 case TargetOpcode::G_VECREDUCE_FADD:
3524 case TargetOpcode::G_VECREDUCE_FMUL:
3525 case TargetOpcode::G_VECREDUCE_FMIN:
3526 case TargetOpcode::G_VECREDUCE_FMAX:
3527 case TargetOpcode::G_VECREDUCE_FMINIMUM:
3528 case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
3529 if (TypeIdx != 0)
3530 return UnableToLegalize;
3531 Observer.changingInstr(MI);
3532 Register VecReg = MI.getOperand(i: 1).getReg();
3533 LLT VecTy = MRI.getType(Reg: VecReg);
3534 LLT WideVecTy = VecTy.changeElementType(NewEltTy: WideTy);
3535 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
3536 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
3537 Observer.changedInstr(MI);
3538 return Legalized;
3539 }
3540 case TargetOpcode::G_VSCALE: {
3541 MachineOperand &SrcMO = MI.getOperand(i: 1);
3542 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3543 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3544 // The CImm is always a signed value
3545 const APInt Val = SrcVal.sext(width: WideTy.getSizeInBits());
3546 Observer.changingInstr(MI);
3547 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
3548 widenScalarDst(MI, WideTy);
3549 Observer.changedInstr(MI);
3550 return Legalized;
3551 }
3552 case TargetOpcode::G_SPLAT_VECTOR: {
3553 if (TypeIdx != 1)
3554 return UnableToLegalize;
3555
3556 Observer.changingInstr(MI);
3557 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
3558 Observer.changedInstr(MI);
3559 return Legalized;
3560 }
3561 case TargetOpcode::G_INSERT_SUBVECTOR: {
3562 if (TypeIdx != 0)
3563 return UnableToLegalize;
3564
3565 GInsertSubvector &IS = cast<GInsertSubvector>(Val&: MI);
3566 Register BigVec = IS.getBigVec();
3567 Register SubVec = IS.getSubVec();
3568
3569 LLT SubVecTy = MRI.getType(Reg: SubVec);
3570 LLT SubVecWideTy = SubVecTy.changeElementType(NewEltTy: WideTy.getElementType());
3571
3572 // Widen the G_INSERT_SUBVECTOR
3573 auto BigZExt = MIRBuilder.buildZExt(Res: WideTy, Op: BigVec);
3574 auto SubZExt = MIRBuilder.buildZExt(Res: SubVecWideTy, Op: SubVec);
3575 auto WideInsert = MIRBuilder.buildInsertSubvector(Res: WideTy, Src0: BigZExt, Src1: SubZExt,
3576 Index: IS.getIndexImm());
3577
3578 // Truncate back down
3579 auto SplatZero = MIRBuilder.buildSplatVector(
3580 Res: WideTy, Val: MIRBuilder.buildConstant(Res: WideTy.getElementType(), Val: 0));
3581 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: IS.getReg(Idx: 0), Op0: WideInsert,
3582 Op1: SplatZero);
3583
3584 MI.eraseFromParent();
3585
3586 return Legalized;
3587 }
3588 }
3589}
3590
3591static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3592 MachineIRBuilder &B, Register Src, LLT Ty) {
3593 auto Unmerge = B.buildUnmerge(Res: Ty, Op: Src);
3594 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3595 Pieces.push_back(Elt: Unmerge.getReg(Idx: I));
3596}
3597
3598static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3599 MachineIRBuilder &MIRBuilder) {
3600 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3601 MachineFunction &MF = MIRBuilder.getMF();
3602 const DataLayout &DL = MIRBuilder.getDataLayout();
3603 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3604 LLT AddrPtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
3605 LLT DstLLT = MRI.getType(Reg: DstReg);
3606
3607 Align Alignment(DL.getABITypeAlign(Ty: ConstVal->getType()));
3608
3609 auto Addr = MIRBuilder.buildConstantPool(
3610 Res: AddrPtrTy,
3611 Idx: MF.getConstantPool()->getConstantPoolIndex(C: ConstVal, Alignment));
3612
3613 MachineMemOperand *MMO =
3614 MF.getMachineMemOperand(PtrInfo: MachinePointerInfo::getConstantPool(MF),
3615 f: MachineMemOperand::MOLoad, MemTy: DstLLT, base_alignment: Alignment);
3616
3617 MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: DstReg, Addr, MMO&: *MMO);
3618}
3619
3620LegalizerHelper::LegalizeResult
3621LegalizerHelper::lowerConstant(MachineInstr &MI) {
3622 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3623 const Constant *ConstantVal = ConstOperand.getCImm();
3624
3625 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3626 MI.eraseFromParent();
3627
3628 return Legalized;
3629}
3630
3631LegalizerHelper::LegalizeResult
3632LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3633 const MachineOperand &ConstOperand = MI.getOperand(i: 1);
3634 const Constant *ConstantVal = ConstOperand.getFPImm();
3635
3636 emitLoadFromConstantPool(DstReg: MI.getOperand(i: 0).getReg(), ConstVal: ConstantVal, MIRBuilder);
3637 MI.eraseFromParent();
3638
3639 return Legalized;
3640}
3641
/// Lower a G_BITCAST involving at least one vector type by unmerging the
/// source into pieces, bitcasting the pieces when the element sizes differ,
/// and reassembling them with a merge-like instruction. Scalar-to-scalar
/// bitcasts are not handled here (UnableToLegalize).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = DstTy.changeVectorElementCount(
            EC: ElementCount::getFixed(MinVal: NumDstElt / NumSrcElt));
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = SrcTy.changeVectorElementCount(
            EC: ElementCount::getFixed(MinVal: NumSrcElt / NumDstElt));
        DstCastTy = DstEltTy;
      }

      // Unmerge into SrcPartTy pieces, then (if the part type differs from the
      // intermediate cast type) bitcast each piece in place.
      getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcPartTy);
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(Dst: DstCastTy, Src: SrcReg).getReg(Idx: 0);
    } else
      // Vector -> scalar: unmerge into the source's own elements and merge.
      getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcEltTy);

    MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    // Scalar -> vector: split the scalar into destination-element-sized pieces
    // and build the result vector from them.
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: DstTy.getElementType());
    MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3707
3708/// Figure out the bit offset into a register when coercing a vector index for
3709/// the wide element type. This is only for the case when promoting vector to
3710/// one with larger elements.
3711//
3712///
3713/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3714/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3715static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3716 Register Idx,
3717 unsigned NewEltSize,
3718 unsigned OldEltSize) {
3719 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3720 LLT IdxTy = B.getMRI()->getType(Reg: Idx);
3721
3722 // Now figure out the amount we need to shift to get the target bits.
3723 auto OffsetMask = B.buildConstant(
3724 Res: IdxTy, Val: ~(APInt::getAllOnes(numBits: IdxTy.getSizeInBits()) << Log2EltRatio));
3725 auto OffsetIdx = B.buildAnd(Dst: IdxTy, Src0: Idx, Src1: OffsetMask);
3726 return B.buildShl(Dst: IdxTy, Src0: OffsetIdx,
3727 Src1: B.buildConstant(Res: IdxTy, Val: Log2_32(Value: OldEltSize))).getReg(Idx: 0);
3728}
3729
3730/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3731/// is casting to a vector with a smaller element size, perform multiple element
3732/// extracts and merge the results. If this is coercing to a vector with larger
3733/// elements, index the bitcasted vector and extract the target element with bit
3734/// operations. This is intended to force the indexing in the native register
3735/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  // Only the source-vector type (type index 1) may be bitcast here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();

  LLT SrcEltTy = SrcVecTy.getElementType();
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  // Reinterpret the source vector in the target's preferred element type.
  LLT NewEltTy = CastTy.getScalarType();
  Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
    //
    // Each old element must correspond to a whole number of new elements.
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        CastTy.changeElementCount(EC: ElementCount::getFixed(MinVal: NewEltsPerOldElt));

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(Res: IdxTy, Val: NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    // First narrow element belonging to the requested wide element.
    auto NewBaseIdx = MIRBuilder.buildMul(Dst: IdxTy, Src0: Idx, Src1: NewEltsPerOldEltK);

    // Extract the consecutive narrow elements that together make up the
    // requested wide element.
    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
      auto TmpIdx = MIRBuilder.buildAdd(Dst: IdxTy, Src0: NewBaseIdx, Src1: IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec, Idx: TmpIdx);
      NewOps[I] = Elt.getReg(Idx: 0);
    }

    // Reassemble the pieces and bitcast back to the original result type.
    auto NewVec = MIRBuilder.buildBuildVector(Res: MidTy, Ops: NewOps);
    MIRBuilder.buildBitcast(Dst, Src: NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    // The new element must hold a whole number of old elements.
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);

    // If the cast type is a scalar, the "vector" is just that one wide value.
    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
                                                     Idx: ScaledIdx).getReg(Idx: 0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        B&: MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(Dst: NewEltTy, Src0: WideElt, Src1: OffsetBits);
    MIRBuilder.buildTrunc(Res: Dst, Op: ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  // Same element count: nothing useful to do here.
  return UnableToLegalize;
}
3838
/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits in \p
/// TargetReg, while preserving other bits in \p TargetReg.
///
/// (InsertReg << Offset) | (TargetReg & ~(LowBitsSet(InsertReg.size()) << Offset))
3843static Register buildBitFieldInsert(MachineIRBuilder &B,
3844 Register TargetReg, Register InsertReg,
3845 Register OffsetBits) {
3846 LLT TargetTy = B.getMRI()->getType(Reg: TargetReg);
3847 LLT InsertTy = B.getMRI()->getType(Reg: InsertReg);
3848 auto ZextVal = B.buildZExt(Res: TargetTy, Op: InsertReg);
3849 auto ShiftedInsertVal = B.buildShl(Dst: TargetTy, Src0: ZextVal, Src1: OffsetBits);
3850
3851 // Produce a bitmask of the value to insert
3852 auto EltMask = B.buildConstant(
3853 Res: TargetTy, Val: APInt::getLowBitsSet(numBits: TargetTy.getSizeInBits(),
3854 loBitsSet: InsertTy.getSizeInBits()));
3855 // Shift it into position
3856 auto ShiftedMask = B.buildShl(Dst: TargetTy, Src0: EltMask, Src1: OffsetBits);
3857 auto InvShiftedMask = B.buildNot(Dst: TargetTy, Src0: ShiftedMask);
3858
3859 // Clear out the bits in the wide element
3860 auto MaskedOldElt = B.buildAnd(Dst: TargetTy, Src0: TargetReg, Src1: InvShiftedMask);
3861
3862 // The value to insert has all zeros already, so stick it into the masked
3863 // wide element.
3864 return B.buildOr(Dst: TargetTy, Src0: MaskedOldElt, Src1: ShiftedInsertVal).getReg(Idx: 0);
3865}
3866
3867/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3868/// is increasing the element size, perform the indexing in the target element
3869/// type, and use bit operations to insert at the element position. This is
3870/// intended for architectures that can dynamically index the register file and
3871/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  // Only the result/source-vector type (type index 0) may be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
      MI.getFirst4RegLLTs();
  LLT VecTy = DstTy;

  LLT VecEltTy = VecTy.getElementType();
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  // Reinterpret the vector in the target's preferred element type. Only the
  // element-widening direction (fewer, larger elements) is implemented.
  Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
  if (NewNumElts < OldNumElts) {
    // The new element must hold a whole number of old elements.
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);

    // If the cast type is a scalar, the whole register is the "element".
    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
                                                          Idx: ScaledIdx).getReg(Idx: 0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        B&: MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Splice the narrow value into the wide element with bit operations,
    // preserving the surrounding old elements packed in the same register.
    Register InsertedElt = buildBitFieldInsert(B&: MIRBuilder, TargetReg: ExtractedElt,
                                               InsertReg: Val, OffsetBits);
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
        Res: CastTy, Val: CastVec, Elt: InsertedElt, Idx: ScaledIdx).getReg(Idx: 0);
    }

    // Bitcast back to the original vector type.
    MIRBuilder.buildBitcast(Dst, Src: InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Element-narrowing (or same-size) casts are not handled here.
  return UnableToLegalize;
}
3931
3932// This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly
3933// those that have smaller than legal operands.
3934//
3935// <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3936//
3937// ===>
3938//
3939// s32 = G_BITCAST <4 x s8>
3940// s32 = G_BITCAST <4 x s8>
3941// s32 = G_BITCAST <4 x s8>
3942// s32 = G_BITCAST <4 x s8>
3943// <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3944// <16 x s8> = G_BITCAST <4 x s32>
3945LegalizerHelper::LegalizeResult
3946LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3947 LLT CastTy) {
3948 // Convert it to CONCAT instruction
3949 auto ConcatMI = dyn_cast<GConcatVectors>(Val: &MI);
3950 if (!ConcatMI) {
3951 return UnableToLegalize;
3952 }
3953
3954 // Check if bitcast is Legal
3955 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3956 LLT SrcScalTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
3957
3958 // Check if the build vector is Legal
3959 if (!LI.isLegal(Query: {TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3960 return UnableToLegalize;
3961 }
3962
3963 // Bitcast the sources
3964 SmallVector<Register> BitcastRegs;
3965 for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3966 BitcastRegs.push_back(
3967 Elt: MIRBuilder.buildBitcast(Dst: SrcScalTy, Src: ConcatMI->getSourceReg(I: i))
3968 .getReg(Idx: 0));
3969 }
3970
3971 // Build the scalar values into a vector
3972 Register BuildReg =
3973 MIRBuilder.buildBuildVector(Res: CastTy, Ops: BitcastRegs).getReg(Idx: 0);
3974 MIRBuilder.buildBitcast(Dst: DstReg, Src: BuildReg);
3975
3976 MI.eraseFromParent();
3977 return Legalized;
3978}
3979
3980// This bitcasts a shuffle vector to a different type currently of the same
3981// element size. Mostly used to legalize ptr vectors, where ptrtoint/inttoptr
3982// will be used instead.
3983//
// <16 x p0> = G_SHUFFLE_VECTOR <4 x p0>, <4 x p0>, mask
// ===>
// <4 x s64> = G_PTRTOINT <4 x p0>
// <4 x s64> = G_PTRTOINT <4 x p0>
// <16 x s64> = G_SHUFFLE_VECTOR <4 x s64>, <4 x s64>, mask
3989// <16 x p0> = G_INTTOPTR <16 x s64>
3990LegalizerHelper::LegalizeResult
3991LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3992 LLT CastTy) {
3993 auto ShuffleMI = cast<GShuffleVector>(Val: &MI);
3994 LLT DstTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 0));
3995 LLT SrcTy = MRI.getType(Reg: ShuffleMI->getReg(Idx: 1));
3996
3997 // We currently only handle vectors of the same size.
3998 if (TypeIdx != 0 ||
3999 CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
4000 CastTy.getElementCount() != DstTy.getElementCount())
4001 return UnableToLegalize;
4002
4003 LLT NewSrcTy = SrcTy.changeElementType(NewEltTy: CastTy.getScalarType());
4004
4005 auto Inp1 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 1));
4006 auto Inp2 = MIRBuilder.buildCast(Dst: NewSrcTy, Src: ShuffleMI->getReg(Idx: 2));
4007 auto Shuf =
4008 MIRBuilder.buildShuffleVector(Res: CastTy, Src1: Inp1, Src2: Inp2, Mask: ShuffleMI->getMask());
4009 MIRBuilder.buildCast(Dst: ShuffleMI->getReg(Idx: 0), Src: Shuf);
4010
4011 MI.eraseFromParent();
4012 return Legalized;
4013}
4014
4015/// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
4016///
4017/// <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
4018///
4019/// ===>
4020///
4021/// <vscale x 2 x i1> = G_BITCAST <vscale x 16 x i1>
4022/// <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i1>, N / 8
4023/// <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  auto ES = cast<GExtractSubvector>(Val: &MI);

  if (!CastTy.isVector())
    return UnableToLegalize;

  // Only the result type (type index 0) may be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(Idx: 0);
  Register Src = ES->getSrcVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Reg: Dst);
  LLT SrcTy = MRI.getType(Reg: Src);
  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount SrcTyEC = SrcTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto SrcTyMinElts = SrcTyEC.getKnownMinValue();

  // Nothing to do if the requested cast is a no-op.
  if (DstTy == CastTy)
    return Legalized;

  // The bitcast must not change the total number of bits.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  // Only widen elements; AdjustAmt is how many old elements fit in one new
  // (cast) element.
  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // The subvector index and both element counts must fall on new-element
  // boundaries so the extraction can be re-expressed in the wider type.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      SrcTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  // Retype the source with AdjustAmt-bit elements and extract there.
  // NOTE(review): using AdjustAmt as the new element *size* only preserves the
  // total bit width when the original element is 1 bit wide (the i1 case in
  // the doc comment) — confirm if wider elements ever reach this path.
  Idx /= AdjustAmt;
  SrcTy = LLT::vector(EC: SrcTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
  auto CastVec = MIRBuilder.buildBitcast(Dst: SrcTy, Src);
  auto PromotedES = MIRBuilder.buildExtractSubvector(Res: CastTy, Src: CastVec, Index: Idx);
  MIRBuilder.buildBitcast(Dst, Src: PromotedES);

  ES->eraseFromParent();
  return Legalized;
}
4073
4074/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
4075///
4076/// <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
4077/// <vscale x 8 x i1>,
4078/// N
4079///
4080/// ===>
4081///
4082/// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
4083/// <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
4084/// <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
4085/// <vscale x 1 x i8>, N / 8
4086/// <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  auto ES = cast<GInsertSubvector>(Val: &MI);

  if (!CastTy.isVector())
    return UnableToLegalize;

  // Only the result type (type index 0) may be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(Idx: 0);
  Register BigVec = ES->getBigVec();
  Register SubVec = ES->getSubVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Reg: Dst);
  LLT BigVecTy = MRI.getType(Reg: BigVec);
  LLT SubVecTy = MRI.getType(Reg: SubVec);

  // Nothing to do if the requested cast is a no-op.
  if (DstTy == CastTy)
    return Legalized;

  // The bitcast must not change the total number of bits.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount BigVecTyEC = BigVecTy.getElementCount();
  ElementCount SubVecTyEC = SubVecTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
  auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();

  // Only widen elements; AdjustAmt is how many old elements fit in one new
  // (cast) element.
  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // The insertion index and all three element counts must fall on new-element
  // boundaries so the insertion can be re-expressed in the wider type.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  // Retype both operands with AdjustAmt-bit elements and insert there.
  // NOTE(review): using AdjustAmt as the new element *size* only preserves the
  // total bit width when the original element is 1 bit wide (the i1 case in
  // the doc comment) — confirm if wider elements ever reach this path.
  Idx /= AdjustAmt;
  BigVecTy = LLT::vector(EC: BigVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
  SubVecTy = LLT::vector(EC: SubVecTyEC.divideCoefficientBy(RHS: AdjustAmt), ScalarSizeInBits: AdjustAmt);
  auto CastBigVec = MIRBuilder.buildBitcast(Dst: BigVecTy, Src: BigVec);
  auto CastSubVec = MIRBuilder.buildBitcast(Dst: SubVecTy, Src: SubVec);
  auto PromotedIS =
      MIRBuilder.buildInsertSubvector(Res: CastTy, Src0: CastBigVec, Src1: CastSubVec, Index: Idx);
  MIRBuilder.buildBitcast(Dst, Src: PromotedIS);

  ES->eraseFromParent();
  return Legalized;
}
4144
/// Lower a G_LOAD / G_SEXTLOAD / G_ZEXTLOAD that the target cannot handle
/// directly. Two cases are handled here:
///  * a non-byte-sized memory type is widened to the next whole-byte width
///    and the result fixed up with sext-in-reg / assert-zext / trunc;
///  * otherwise the access is split into two smaller power-of-2 loads whose
///    results are recombined with shift + or (little-endian only).
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(Reg: DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Case 1: the memory type is not a whole number of bytes.
  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(SizeInBits: MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(Ty: WideMemTy);
    }

    if (isa<GSExtLoad>(Val: LoadMI)) {
      // Re-sign-extend from the original (narrow) memory width.
      auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
      MIRBuilder.buildSExtInReg(Res: LoadReg, Op: NewLoad, ImmOp: MemSizeInBits);
    } else if (isa<GZExtLoad>(Val: LoadMI) || WideMemTy == LoadTy) {
      auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way. A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(Res: LoadReg, Op: NewLoad, Size: MemSizeInBits);
    } else {
      MIRBuilder.buildLoad(Res: LoadReg, Addr: PtrReg, MMO&: *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(Res: DstReg, Op: LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(Value: MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(Value: MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    Align Alignment = LoadMI.getAlign();
    // Given an alignment larger than the size of the memory, we can increase
    // the size of the load without needing to scalarize it.
    if (Alignment.value() * 8 > MemSizeInBits &&
        isPowerOf2_64(Value: DstTy.getScalarSizeInBits())) {
      LLT MoreTy = DstTy.changeVectorElementCount(
          EC: ElementCount::getFixed(MinVal: NextPowerOf2(A: DstTy.getNumElements())));
      MachineMemOperand *NewMMO = MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: MoreTy);
      auto NewLoad = MIRBuilder.buildLoad(Res: MoreTy, Addr: PtrReg, MMO&: *NewMMO);
      MIRBuilder.buildDeleteTrailingVectorElements(Res: LoadMI.getReg(Idx: 0),
                                                   Op0: NewLoad.getReg(Idx: 0));
      LoadMI.eraseFromParent();
      return Legalized;
    }

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx: 0, NarrowTy: DstTy.getElementType());
  }

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(Reg: PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(A: DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(SizeInBits: AnyExtSize);
  // Low part: zext load so the high bits are known-zero for the final or.
  auto LargeLoad = MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_ZEXTLOAD, Res: AnyExtTy,
                                             Addr: PtrReg, MMO&: *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()),
                                            Val: LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(Ty: PtrTy);
  auto SmallPtr = MIRBuilder.buildObjectPtrOffset(Res: PtrAddReg, Op0: PtrReg, Op1: OffsetCst);
  // High part: keep the original (any/sign/zero) extension semantics.
  auto SmallLoad = MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: AnyExtTy,
                                             Addr: SmallPtr, MMO&: *SmallMMO);

  auto ShiftAmt = MIRBuilder.buildConstant(Res: AnyExtTy, Val: LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(Dst: AnyExtTy, Src0: SmallLoad, Src1: ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(Dst: DstReg, Src0: Shift, Src1: LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
    MIRBuilder.buildTrunc(Res: DstReg, Op: {Or});
  } else {
    // Same size but different kind: must be a pointer destination.
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
4296
/// Lower a G_STORE the target cannot handle directly. Non-byte-sized scalar
/// stores are widened (with the extra bits zeroed); otherwise the store is
/// split into two smaller power-of-2 truncating stores, obtaining the high
/// part by shifting the extended value right.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Case 1: scalar store whose memory type is not a whole number of bytes.
  if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes. For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(SizeInBits: StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
      SrcTy = WideTy;
    }

    // Zero the padding bits so the widened store has a defined value.
    auto ZextInReg = MIRBuilder.buildZExtInReg(Res: SrcTy, Op: SrcReg, ImmOp: StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideTy);
    MIRBuilder.buildStore(Val: ZextInReg, Addr: PtrReg, MMO&: *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // Truncating vector store (e.g. storing a vector of booleans).
    if (MemTy != SrcTy)
      return scalarizeVectorBooleanStore(MI&: StoreMI);

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy: SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(Value: MemSizeInBits)) {
    LargeSplitSize = llvm::bit_floor<uint64_t>(Value: MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    // Power-of-2 size: assume this is an unaligned access we must split.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(A: MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(SizeInBits: AnyExtSize);

  if (SrcTy.isPointer()) {
    // Reinterpret a pointer source as an integer before shifting.
    const LLT IntPtrTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(Dst: IntPtrTy, Src: SrcReg).getReg(Idx: 0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(Res: NewSrcTy, Op: SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(Res: NewSrcTy, Val: LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(Dst: NewSrcTy, Src0: ExtVal, Src1: ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(Reg: PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
      Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: LargeSplitSize / 8);
  auto SmallPtr = MIRBuilder.buildObjectPtrOffset(Res: PtrTy, Op0: PtrReg, Op1: OffsetCst);

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
  MIRBuilder.buildStore(Val: ExtVal, Addr: PtrReg, MMO&: *LargeMMO);
  MIRBuilder.buildStore(Val: SmallVal, Addr: SmallPtr, MMO&: *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
4390
/// Lower a truncating vector store whose element memory type is not
/// byte-sized (e.g. storing a vector of i1) by packing the truncated
/// elements into a single integer of the memory type's total bit width and
/// emitting one scalar store of that integer.
LegalizerHelper::LegalizeResult
LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();
  LLT MemScalarTy = MemTy.getElementType();
  MachineFunction &MF = MIRBuilder.getMF();

  assert(SrcTy.isVector() && "Expect a vector store type");

  if (!MemScalarTy.isByteSized()) {
    // We need to build an integer scalar of the vector bit pattern.
    // It's not legal for us to add padding when storing a vector.
    unsigned NumBits = MemTy.getSizeInBits();
    LLT IntTy = LLT::scalar(SizeInBits: NumBits);
    auto CurrVal = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
    LLT IdxTy = TLI.getVectorIdxLLT(DL: MF.getDataLayout());

    // For each element: extract, truncate to the memory element width,
    // zero-extend into the accumulator type, shift into its bit position,
    // and or it into the running value.
    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
      auto Elt = MIRBuilder.buildExtractVectorElement(
          Res: SrcTy.getElementType(), Val: SrcReg, Idx: MIRBuilder.buildConstant(Res: IdxTy, Val: I));
      auto Trunc = MIRBuilder.buildTrunc(Res: MemScalarTy, Op: Elt);
      auto ZExt = MIRBuilder.buildZExt(Res: IntTy, Op: Trunc);
      // On big-endian targets element 0 occupies the most significant bits.
      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
                                  ? (MemTy.getNumElements() - 1) - I
                                  : I;
      auto ShiftAmt = MIRBuilder.buildConstant(
          Res: IntTy, Val: ShiftIntoIdx * MemScalarTy.getSizeInBits());
      auto Shifted = MIRBuilder.buildShl(Dst: IntTy, Src0: ZExt, Src1: ShiftAmt);
      CurrVal = MIRBuilder.buildOr(Dst: IntTy, Src0: CurrVal, Src1: Shifted);
    }
    auto PtrInfo = MMO.getPointerInfo();
    auto *NewMMO = MF.getMachineMemOperand(MMO: &MMO, PtrInfo, Ty: IntTy);
    MIRBuilder.buildStore(Val: CurrVal, Addr: PtrReg, MMO&: *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  // TODO: implement simple scalarization.
  return UnableToLegalize;
}
4434
/// Legalize \p MI via the Bitcast action: reinterpret the operand(s) at
/// \p TypeIdx as \p CastTy (same total size, different representation) and
/// rewrite the instruction to operate on the cast type. Dispatches per
/// opcode; unhandled opcodes return UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of an extending load.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastDst(MI, CastTy, OpIdx: 0);
    MMO.setType(CastTy);
    // The range metadata is no longer valid when reinterpreted as a different
    // type.
    MMO.clearRanges();
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of a truncating store.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, OpIdx: 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SELECT: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    // A vector select would require casting the (vector) condition too,
    // which is not implemented.
    if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector()) {
      LLVM_DEBUG(
          dbgs() << "bitcast action not implemented for vector select\n");
      return UnableToLegalize;
    }

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, OpIdx: 2);
    bitcastSrc(MI, CastTy, OpIdx: 3);
    bitcastDst(MI, CastTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Bitwise ops are insensitive to the value's interpretation; cast all
    // three operands uniformly.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, OpIdx: 1);
    bitcastSrc(MI, CastTy, OpIdx: 2);
    bitcastDst(MI, CastTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_CONCAT_VECTORS:
    return bitcastConcatVector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return bitcastShuffleVector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_EXTRACT_SUBVECTOR:
    return bitcastExtractSubvector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_SUBVECTOR:
    return bitcastInsertSubvector(MI, TypeIdx, CastTy);
  default:
    return UnableToLegalize;
  }
}
4515
// Legalize an instruction by changing the opcode in place, keeping all
// operands unchanged. The change observer is notified before and after the
// mutation so dependent analyses stay in sync.
void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
  Observer.changingInstr(MI);
  MI.setDesc(MIRBuilder.getTII().get(Opcode: NewOpcode));
  Observer.changedInstr(MI);
}
4522
/// Expand \p MI into simpler generic instructions when the legalization rule
/// selected the Lower action. Dispatches per opcode, either expanding inline
/// or delegating to a dedicated lower* helper. Note: the \p TypeIdx and
/// \p LowerHintTy parameters are not consulted by the expansions visible
/// here.
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
  using namespace TargetOpcode;

  switch(MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_FCONSTANT:
    return lowerFConstant(MI);
  case TargetOpcode::G_BITCAST:
    return lowerBitcast(MI);
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM: {
    // Expand x % y as x - (x / y) * y.
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    auto Quot =
        MIRBuilder.buildInstr(Opc: MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, DstOps: {Ty},
                              SrcOps: {MI.getOperand(i: 1), MI.getOperand(i: 2)});

    auto Prod = MIRBuilder.buildMul(Dst: Ty, Src0: Quot, Src1: MI.getOperand(i: 2));
    MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: Prod);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
    return lowerSADDO_SSUBO(MI);
  case TargetOpcode::G_SADDE:
    return lowerSADDE(MI);
  case TargetOpcode::G_SSUBE:
    return lowerSSUBE(MI);
  case TargetOpcode::G_UMULH:
  case TargetOpcode::G_SMULH:
    return lowerSMULH_UMULH(MI);
  case TargetOpcode::G_SMULO:
  case TargetOpcode::G_UMULO: {
    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
    // result.
    auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
    LLT Ty = MRI.getType(Reg: Res);

    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
                          ? TargetOpcode::G_SMULH
                          : TargetOpcode::G_UMULH;

    // Reuse MI itself as the G_MUL by dropping the overflow def.
    Observer.changingInstr(MI);
    const auto &TII = MIRBuilder.getTII();
    MI.setDesc(TII.get(Opcode: TargetOpcode::G_MUL));
    MI.removeOperand(OpNo: 1);
    Observer.changedInstr(MI);

    auto HiPart = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {Ty}, SrcOps: {LHS, RHS});
    auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);

    // Move insert point forward so we can use the Res register if needed.
    MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());

    // For *signed* multiply, overflow is detected by checking:
    // (hi != (lo >> bitwidth-1))
    if (Opcode == TargetOpcode::G_SMULH) {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: Ty.getSizeInBits() - 1);
      auto Shifted = MIRBuilder.buildAShr(Dst: Ty, Src0: Res, Src1: ShiftAmt);
      MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Shifted);
    } else {
      // Unsigned overflow iff the high half is nonzero.
      MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Zero);
    }
    return Legalized;
  }
  case TargetOpcode::G_FNEG: {
    // Implement fneg by flipping the sign bit with an xor.
    auto [Res, SubByReg] = MI.getFirst2Regs();
    LLT Ty = MRI.getType(Reg: Res);

    auto SignMask = MIRBuilder.buildConstant(
        Res: Ty, Val: APInt::getSignMask(BitWidth: Ty.getScalarSizeInBits()));
    MIRBuilder.buildXor(Dst: Res, Src0: SubByReg, Src1: SignMask);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_STRICT_FSUB: {
    auto [Res, LHS, RHS] = MI.getFirst3Regs();
    LLT Ty = MRI.getType(Reg: Res);

    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
    auto Neg = MIRBuilder.buildFNeg(Dst: Ty, Src0: RHS);

    if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
      MIRBuilder.buildStrictFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
    else
      MIRBuilder.buildFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FMAD:
    return lowerFMad(MI);
  case TargetOpcode::G_FFLOOR:
    return lowerFFloor(MI);
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND: {
    // lround/llround == round to nearest, then convert with fptosi.
    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg = MI.getOperand(i: 1).getReg();
    LLT SrcTy = MRI.getType(Reg: SrcReg);
    auto Round = MIRBuilder.buildInstr(Opc: TargetOpcode::G_INTRINSIC_ROUND, DstOps: {SrcTy},
                                       SrcOps: {SrcReg});
    MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_INTRINSIC_ROUND:
    return lowerIntrinsicRound(MI);
  case TargetOpcode::G_FRINT: {
    // Since round even is the assumed rounding mode for unconstrained FP
    // operations, rint and roundeven are the same operation.
    changeOpcode(MI, NewOpcode: TargetOpcode::G_INTRINSIC_ROUNDEVEN);
    return Legalized;
  }
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    // lrint/llrint == rint, then convert with fptosi.
    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg = MI.getOperand(i: 1).getReg();
    LLT SrcTy = MRI.getType(Reg: SrcReg);
    auto Round =
        MIRBuilder.buildInstr(Opc: TargetOpcode::G_FRINT, DstOps: {SrcTy}, SrcOps: {SrcReg});
    MIRBuilder.buildFPTOSI(Dst: DstReg, Src0: Round);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    // Expand into a plain cmpxchg plus an explicit compare of the old value
    // against the expected value to produce the success flag.
    auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
    Register NewOldValRes = MRI.cloneVirtualRegister(VReg: OldValRes);
    MIRBuilder.buildAtomicCmpXchg(OldValRes: NewOldValRes, Addr, CmpVal, NewVal,
                                  MMO&: **MI.memoperands_begin());
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: SuccessRes, Op0: NewOldValRes, Op1: CmpVal);
    MIRBuilder.buildCopy(Res: OldValRes, Op: NewOldValRes);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(LoadMI&: cast<GAnyLoad>(Val&: MI));
  case TargetOpcode::G_STORE:
    return lowerStore(StoreMI&: cast<GStore>(Val&: MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
  case TargetOpcode::G_CTLS:
    return lowerBitCount(MI);
  case G_UADDO: {
    // Unsigned add overflow: carry out iff the result wrapped below an
    // operand.
    auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();

    Register NewRes = MRI.cloneVirtualRegister(VReg: Res);

    MIRBuilder.buildAdd(Dst: NewRes, Src0: LHS, Src1: RHS);
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CarryOut, Op0: NewRes, Op1: RHS);

    MIRBuilder.buildCopy(Res, Op: NewRes);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UADDE: {
    auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(Reg: CarryOut);
    const LLT Ty = MRI.getType(Reg: Res);

    Register NewRes = MRI.cloneVirtualRegister(VReg: Res);

    // Initial add of the two operands.
    auto TmpRes = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);

    // Initial check for carry.
    auto Carry = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CondTy, Op0: TmpRes, Op1: LHS);

    // Add the sum and the carry.
    auto ZExtCarryIn = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
    MIRBuilder.buildAdd(Dst: NewRes, Src0: TmpRes, Src1: ZExtCarryIn);

    // Second check for carry. We can only carry if the initial sum is all 1s
    // and the carry is set, resulting in a new sum of 0.
    auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
    auto ResEqZero =
        MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: NewRes, Op1: Zero);
    auto Carry2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: ResEqZero, Src1: CarryIn);
    MIRBuilder.buildOr(Dst: CarryOut, Src0: Carry, Src1: Carry2);

    MIRBuilder.buildCopy(Res, Op: NewRes);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBO: {
    // Unsigned sub overflow: borrow out iff LHS < RHS.
    auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();

    MIRBuilder.buildSub(Dst: Res, Src0: LHS, Src1: RHS);
    MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: BorrowOut, Op0: LHS, Op1: RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBE: {
    auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(Reg: BorrowOut);
    const LLT Ty = MRI.getType(Reg: Res);

    // Initial subtract of the two operands.
    auto TmpRes = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS);

    // Initial check for borrow.
    auto Borrow = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: CondTy, Op0: TmpRes, Op1: LHS);

    // Subtract the borrow from the first subtract.
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Res: Ty, Op: BorrowIn);
    MIRBuilder.buildSub(Dst: Res, Src0: TmpRes, Src1: ZExtBorrowIn);

    // Second check for borrow. We can only borrow if the initial difference is
    // 0 and the borrow is set, resulting in a new difference of all 1s.
    auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
    auto TmpResEqZero =
        MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: TmpRes, Op1: Zero);
    auto Borrow2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: TmpResEqZero, Src1: BorrowIn);
    MIRBuilder.buildOr(Dst: BorrowOut, Src0: Borrow, Src1: Borrow2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UITOFP:
    return lowerUITOFP(MI);
  case G_SITOFP:
    return lowerSITOFP(MI);
  case G_FPTOUI:
    return lowerFPTOUI(MI);
  case G_FPTOSI:
    return lowerFPTOSI(MI);
  case G_FPTOUI_SAT:
  case G_FPTOSI_SAT:
    return lowerFPTOINT_SAT(MI);
  case G_FPTRUNC:
    return lowerFPTRUNC(MI);
  case G_FPOWI:
    return lowerFPOWI(MI);
  case G_FMODF:
    return lowerFMODF(MI);
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
    return lowerMinMax(MI);
  case G_SCMP:
  case G_UCMP:
    return lowerThreewayCompare(MI);
  case G_FCOPYSIGN:
    return lowerFCopySign(MI);
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
    return lowerFMinNumMaxNum(MI);
  case G_FMINIMUM:
  case G_FMAXIMUM:
    return lowerFMinimumMaximum(MI);
  case G_MERGE_VALUES:
    return lowerMergeValues(MI);
  case G_UNMERGE_VALUES:
    return lowerUnmergeValues(MI);
  case TargetOpcode::G_SEXT_INREG: {
    // Expand sext_inreg as shl + ashr by (width - SizeInBits).
    assert(MI.getOperand(2).isImm() && "Expected immediate");
    int64_t SizeInBits = MI.getOperand(i: 2).getImm();

    auto [DstReg, SrcReg] = MI.getFirst2Regs();
    LLT DstTy = MRI.getType(Reg: DstReg);
    Register TmpRes = MRI.createGenericVirtualRegister(Ty: DstTy);

    auto MIBSz = MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - SizeInBits);
    MIRBuilder.buildShl(Dst: TmpRes, Src0: SrcReg, Src1: MIBSz->getOperand(i: 0));
    MIRBuilder.buildAShr(Dst: DstReg, Src0: TmpRes, Src1: MIBSz->getOperand(i: 0));
    MI.eraseFromParent();
    return Legalized;
  }
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return lowerExtractInsertVectorElt(MI);
  case G_SHUFFLE_VECTOR:
    return lowerShuffleVector(MI);
  case G_VECTOR_COMPRESS:
    return lowerVECTOR_COMPRESS(MI);
  case G_DYN_STACKALLOC:
    return lowerDynStackAlloc(MI);
  case G_STACKSAVE:
    return lowerStackSave(MI);
  case G_STACKRESTORE:
    return lowerStackRestore(MI);
  case G_EXTRACT:
    return lowerExtract(MI);
  case G_INSERT:
    return lowerInsert(MI);
  case G_BSWAP:
    return lowerBswap(MI);
  case G_BITREVERSE:
    return lowerBitreverse(MI);
  case G_READ_REGISTER:
  case G_WRITE_REGISTER:
    return lowerReadWriteRegister(MI);
  case G_UADDSAT:
  case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this with custom lowering and calling the
    // implementation functions.
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    if (LI.isLegalOrCustom(Query: {G_UMIN, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SADDSAT:
  case G_SSUBSAT: {
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());

    // FIXME: It would probably make more sense to see if G_SADDO is preferred,
    // since it's a shorter expansion. However, we would need to figure out the
    // preferred boolean type for the carry out for the query.
    if (LI.isLegalOrCustom(Query: {G_SMIN, Ty}) && LI.isLegalOrCustom(Query: {G_SMAX, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SSHLSAT:
  case G_USHLSAT:
    return lowerShlSat(MI);
  case G_ABS:
    return lowerAbsToAddXor(MI);
  case G_ABDS:
  case G_ABDU: {
    // Prefer the min/max expansion of abs-diff when those ops are legal.
    bool IsSigned = MI.getOpcode() == G_ABDS;
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    if ((IsSigned && LI.isLegal(Query: {G_SMIN, Ty}) && LI.isLegal(Query: {G_SMAX, Ty})) ||
        (!IsSigned && LI.isLegal(Query: {G_UMIN, Ty}) && LI.isLegal(Query: {G_UMAX, Ty}))) {
      return lowerAbsDiffToMinMax(MI);
    }
    return lowerAbsDiffToSelect(MI);
  }
  case G_FABS:
    return lowerFAbs(MI);
  case G_SELECT:
    return lowerSelect(MI);
  case G_IS_FPCLASS:
    return lowerISFPCLASS(MI);
  case G_SDIVREM:
  case G_UDIVREM:
    return lowerDIVREM(MI);
  case G_FSHL:
  case G_FSHR:
    return lowerFunnelShift(MI);
  case G_ROTL:
  case G_ROTR:
    return lowerRotate(MI);
  case G_MEMSET:
  case G_MEMCPY:
  case G_MEMMOVE:
    return lowerMemCpyFamily(MI);
  case G_MEMCPY_INLINE:
    return lowerMemcpyInline(MI);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
    return lowerEXT(MI);
  case G_TRUNC:
    return lowerTRUNC(MI);
  GISEL_VECREDUCE_CASES_NONSEQ
    return lowerVectorReduction(MI);
  case G_VAARG:
    return lowerVAArg(MI);
  case G_ATOMICRMW_SUB: {
    // atomicrmw sub x  ==>  atomicrmw add (-x).
    auto [Ret, Mem, Val] = MI.getFirst3Regs();
    const LLT ValTy = MRI.getType(Reg: Val);
    MachineMemOperand *MMO = *MI.memoperands_begin();

    auto VNeg = MIRBuilder.buildNeg(Dst: ValTy, Src0: Val);
    MIRBuilder.buildAtomicRMW(Opcode: G_ATOMICRMW_ADD, OldValRes: Ret, Addr: Mem, Val: VNeg, MMO&: *MMO);
    MI.eraseFromParent();
    return Legalized;
  }
  }
}
4907
4908Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4909 Align MinAlign) const {
4910 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4911 // datalayout for the preferred alignment. Also there should be a target hook
4912 // for this to allow targets to reduce the alignment and ignore the
4913 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4914 // the type.
4915 return std::max(a: Align(PowerOf2Ceil(A: Ty.getSizeInBytes())), b: MinAlign);
4916}
4917
4918MachineInstrBuilder
4919LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4920 MachinePointerInfo &PtrInfo) {
4921 MachineFunction &MF = MIRBuilder.getMF();
4922 const DataLayout &DL = MIRBuilder.getDataLayout();
4923 int FrameIdx = MF.getFrameInfo().CreateStackObject(Size: Bytes, Alignment, isSpillSlot: false);
4924
4925 unsigned AddrSpace = DL.getAllocaAddrSpace();
4926 LLT FramePtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
4927
4928 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI: FrameIdx);
4929 return MIRBuilder.buildFrameIndex(Res: FramePtrTy, Idx: FrameIdx);
4930}
4931
4932MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4933 const SrcOp &Val) {
4934 LLT SrcTy = Val.getLLTTy(MRI);
4935 Align StackTypeAlign =
4936 std::max(a: getStackTemporaryAlignment(Ty: SrcTy),
4937 b: getStackTemporaryAlignment(Ty: Res.getLLTTy(MRI)));
4938 MachinePointerInfo PtrInfo;
4939 auto StackTemp =
4940 createStackTemporary(Bytes: SrcTy.getSizeInBytes(), Alignment: StackTypeAlign, PtrInfo);
4941
4942 MIRBuilder.buildStore(Val, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4943 return MIRBuilder.buildLoad(Res, Addr: StackTemp, PtrInfo, Alignment: StackTypeAlign);
4944}
4945
4946static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4947 LLT VecTy) {
4948 LLT IdxTy = B.getMRI()->getType(Reg: IdxReg);
4949 unsigned NElts = VecTy.getNumElements();
4950
4951 int64_t IdxVal;
4952 if (mi_match(R: IdxReg, MRI: *B.getMRI(), P: m_ICst(Cst&: IdxVal))) {
4953 if (IdxVal < VecTy.getNumElements())
4954 return IdxReg;
4955 // If a constant index would be out of bounds, clamp it as well.
4956 }
4957
4958 if (isPowerOf2_32(Value: NElts)) {
4959 APInt Imm = APInt::getLowBitsSet(numBits: IdxTy.getSizeInBits(), loBitsSet: Log2_32(Value: NElts));
4960 return B.buildAnd(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: Imm)).getReg(Idx: 0);
4961 }
4962
4963 return B.buildUMin(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: NElts - 1))
4964 .getReg(Idx: 0);
4965}
4966
/// Return a pointer to element \p Index of the vector of type \p VecTy stored
/// in memory at \p VecPtr. The index is clamped into bounds first so the
/// resulting pointer stays inside the vector's storage.
Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
                                                  Register Index) {
  LLT EltTy = VecTy.getElementType();

  // Calculate the element offset and add it to the pointer.
  unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
  assert(EltSize * 8 == EltTy.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  // Keep the index in range so the pointer arithmetic below cannot step
  // outside the allocated vector storage.
  Index = clampVectorIndex(B&: MIRBuilder, IdxReg: Index, VecTy);

  // Convert index to the correct size for the address space.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AS = MRI.getType(Reg: VecPtr).getAddressSpace();
  unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
  LLT IdxTy = MRI.getType(Reg: Index).changeElementSize(NewEltSize: IndexSizeInBits);
  if (IdxTy != MRI.getType(Reg: Index))
    Index = MIRBuilder.buildSExtOrTrunc(Res: IdxTy, Op: Index).getReg(Idx: 0);

  // Scale the (clamped, resized) index by the element size in bytes.
  auto Mul = MIRBuilder.buildMul(Dst: IdxTy, Src0: Index,
                                 Src1: MIRBuilder.buildConstant(Res: IdxTy, Val: EltSize));

  LLT PtrTy = MRI.getType(Reg: VecPtr);
  return MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VecPtr, Op1: Mul).getReg(Idx: 0);
}
4992
4993#ifndef NDEBUG
4994/// Check that all vector operands have same number of elements. Other operands
4995/// should be listed in NonVecOp.
4996static bool hasSameNumEltsOnAllVectorOperands(
4997 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4998 std::initializer_list<unsigned> NonVecOpIndices) {
4999 if (MI.getNumMemOperands() != 0)
5000 return false;
5001
5002 LLT VecTy = MRI.getType(MI.getReg(0));
5003 if (!VecTy.isVector())
5004 return false;
5005 unsigned NumElts = VecTy.getNumElements();
5006
5007 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
5008 MachineOperand &Op = MI.getOperand(OpIdx);
5009 if (!Op.isReg()) {
5010 if (!is_contained(NonVecOpIndices, OpIdx))
5011 return false;
5012 continue;
5013 }
5014
5015 LLT Ty = MRI.getType(Op.getReg());
5016 if (!Ty.isVector()) {
5017 if (!is_contained(NonVecOpIndices, OpIdx))
5018 return false;
5019 continue;
5020 }
5021
5022 if (Ty.getNumElements() != NumElts)
5023 return false;
5024 }
5025
5026 return true;
5027}
5028#endif
5029
5030/// Fill \p DstOps with DstOps that have same number of elements combined as
5031/// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
5032/// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
5033/// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
5034static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
5035 unsigned NumElts) {
5036 LLT LeftoverTy;
5037 assert(Ty.isVector() && "Expected vector type");
5038 LLT NarrowTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NumElts));
5039 int NumParts, NumLeftover;
5040 std::tie(args&: NumParts, args&: NumLeftover) =
5041 getNarrowTypeBreakDown(OrigTy: Ty, NarrowTy, LeftoverTy);
5042
5043 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
5044 for (int i = 0; i < NumParts; ++i) {
5045 DstOps.push_back(Elt: NarrowTy);
5046 }
5047
5048 if (LeftoverTy.isValid()) {
5049 assert(NumLeftover == 1 && "expected exactly one leftover");
5050 DstOps.push_back(Elt: LeftoverTy);
5051 }
5052}
5053
5054/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
5055/// made from \p Op depending on operand type.
5056static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
5057 MachineOperand &Op) {
5058 for (unsigned i = 0; i < N; ++i) {
5059 if (Op.isReg())
5060 Ops.push_back(Elt: Op.getReg());
5061 else if (Op.isImm())
5062 Ops.push_back(Elt: Op.getImm());
5063 else if (Op.isPredicate())
5064 Ops.push_back(Elt: static_cast<CmpInst::Predicate>(Op.getPredicate()));
5065 else
5066 llvm_unreachable("Unsupported type");
5067 }
5068}
5069
// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different element
// type.
//
// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases, e.g.
// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//               s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
    GenericMachineInstr &MI, unsigned NumElts,
    std::initializer_list<unsigned> NonVecOpIndices) {
  assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
         "Non-compatible opcode or not specified non-vector operands");
  unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps to use instruction found by CSE directly.
  // CSE copies found instruction into given vreg when building with vreg dest.
  SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
  // Output registers will be taken from created instructions.
  SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
  for (unsigned i = 0; i < NumDefs; ++i) {
    makeDstOps(DstOps&: OutputOpsPieces[i], Ty: MRI.getType(Reg: MI.getReg(Idx: i)), NumElts);
  }

  // Split vector input operands into sub-vectors with NumElts elts + Leftover.
  // Operands listed in NonVecOpIndices will be used as is without splitting;
  // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
  // scalar condition (op 1), immediate in sext_inreg (op 2).
  SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       ++UseIdx, ++UseNo) {
    if (is_contained(Set: NonVecOpIndices, Element: UseIdx)) {
      // Non-vector operand: replicate it once per output piece.
      broadcastSrcOp(Ops&: InputOpsPieces[UseNo], N: OutputOpsPieces[0].size(),
                     Op&: MI.getOperand(i: UseIdx));
    } else {
      // Vector operand: split into NumElts-sized pieces (plus leftover).
      SmallVector<Register, 8> SplitPieces;
      extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: SplitPieces, MIRBuilder,
                         MRI);
      llvm::append_range(C&: InputOpsPieces[UseNo], R&: SplitPieces);
    }
  }

  // A remainder piece exists iff NumElts does not evenly divide OrigNumElts.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;

  // Take i-th piece of each input operand split and build sub-vector/scalar
  // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    SmallVector<DstOp, 2> Defs;
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      Defs.push_back(Elt: OutputOpsPieces[DstNo][i]);

    SmallVector<SrcOp, 3> Uses;
    for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
      Uses.push_back(Elt: InputOpsPieces[InputNo][i]);

    // Rebuild the original opcode on the narrow pieces, preserving MI flags.
    auto I = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: Defs, SrcOps: Uses, Flags: MI.getFlags());
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      OutputRegs[DstNo].push_back(Elt: I.getReg(Idx: DstNo));
  }

  // Merge small outputs into MI's output for each def operand.
  if (NumLeftovers) {
    // Mixed piece sizes require the special merge path.
    for (unsigned i = 0; i < NumDefs; ++i)
      mergeMixedSubvectors(DstReg: MI.getReg(Idx: i), PartRegs: OutputRegs[i]);
  } else {
    for (unsigned i = 0; i < NumDefs; ++i)
      MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: i), Ops: OutputRegs[i]);
  }

  MI.eraseFromParent();
  return Legalized;
}
5151
/// Split a vector G_PHI into several PHIs with fewer elements. Incoming
/// values are split in their predecessor blocks (just before the terminators),
/// and the narrow PHI results are merged back together after the PHI group in
/// this block.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
                                        unsigned NumElts) {
  unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();

  // PHI operands come in (value, MBB) pairs after the single def.
  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  SmallVector<DstOp, 8> OutputOpsPieces;
  SmallVector<Register, 8> OutputRegs;
  makeDstOps(DstOps&: OutputOpsPieces, Ty: MRI.getType(Reg: MI.getReg(Idx: 0)), NumElts);

  // Instructions that perform register split will be inserted in basic block
  // where register is defined (basic block is in the next operand).
  SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       UseIdx += 2, ++UseNo) {
    MachineBasicBlock &OpMBB = *MI.getOperand(i: UseIdx + 1).getMBB();
    MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
    extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: InputOpsPieces[UseNo],
                       MIRBuilder, MRI);
  }

  // Build PHIs with fewer elements.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
  MIRBuilder.setInsertPt(MBB&: *MI.getParent(), II: MI);
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    auto Phi = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI);
    Phi.addDef(
        RegNo: MRI.createGenericVirtualRegister(Ty: OutputOpsPieces[i].getLLTTy(MRI)));
    OutputRegs.push_back(Elt: Phi.getReg(Idx: 0));

    // For each incoming edge, use the i-th split piece and the original MBB.
    for (unsigned j = 0; j < NumInputs / 2; ++j) {
      Phi.addUse(RegNo: InputOpsPieces[j][i]);
      Phi.add(MO: MI.getOperand(i: 1 + j * 2 + 1));
    }
  }

  // Set the insert point after the existing PHIs
  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());

  // Merge small outputs into MI's def.
  if (NumLeftovers) {
    mergeMixedSubvectors(DstReg: MI.getReg(Idx: 0), PartRegs: OutputRegs);
  } else {
    MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: 0), Ops: OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}
5204
/// Narrow a G_UNMERGE_VALUES by splitting its source into NarrowTy pieces
/// first and then unmerging each piece into the original destinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(i: NumDst).getReg();
  LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  LLT SrcTy = MRI.getType(Reg: SrcReg);

  // Only the source type (index 1) can be narrowed here, and narrowing to the
  // destination type would be a no-op handled elsewhere.
  if (TypeIdx != 1 || NarrowTy == DstTy)
    return UnableToLegalize;

  // Requires compatible types. Otherwise SrcReg should have been defined by
  // merge-like instruction that would get artifact combined. Most likely
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
  assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");

  // NarrowTy must evenly divide SrcTy, and DstTy must evenly divide NarrowTy,
  // so the two-level unmerge below produces exactly NumDst results.
  if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
      (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely DstTy (smaller than register size) packed in SrcTy
  // (larger than register size) and since unmerge was not combined it will be
  // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
  // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.

  // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
  //
  // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
  // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
  // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
  auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  // Second level: each NarrowTy piece feeds PartsPerUnmerge original defs.
  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(RegNo: MI.getOperand(i: I * PartsPerUnmerge + J).getReg());
    MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
  }

  MI.eraseFromParent();
  return Legalized;
}
5253
/// Narrow a merge-like instruction (G_MERGE_VALUES / G_BUILD_VECTOR /
/// G_CONCAT_VECTORS) by first combining sources into NarrowTy pieces and then
/// merging the pieces into the original destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise user of DstReg did not perform unmerge
  // that should have been artifact combined. Most likely instruction that uses
  // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
  // is for old mir tests. Since the changes to more/fewer elements it should no
  // longer be possible to generate MIR like this when starting from llvm-ir
  // because LCMTy approach was replaced with merge/unmerge to vector elements.
  if (TypeIdx == 1) {
    assert(SrcTy.isVector() && "Expected vector types");
    assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
    // NarrowTy must evenly divide DstTy and be strictly smaller than SrcTy.
    if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
        (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
      return UnableToLegalize;
    // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
    //
    // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
    // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
    // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
    // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
    // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
    // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11

    // Flatten every source vector into a single list of scalar elements.
    SmallVector<Register, 8> Elts;
    LLT EltTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getScalarType();
    for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
      auto Unmerge = MIRBuilder.buildUnmerge(Res: EltTy, Op: MI.getOperand(i).getReg());
      for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
        Elts.push_back(Elt: Unmerge.getReg(Idx: j));
    }

    // Regroup the scalars into NarrowTy-sized vectors.
    SmallVector<Register, 8> NarrowTyElts;
    unsigned NumNarrowTyElts = NarrowTy.getNumElements();
    unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
    for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
         ++i, Offset += NumNarrowTyElts) {
      ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
      NarrowTyElts.push_back(
          Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Pieces).getReg(Idx: 0));
    }

    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
    MI.eraseFromParent();
    return Legalized;
  }

  assert(TypeIdx == 0 && "Bad type index");
  // SrcTy must evenly divide NarrowTy, and NarrowTy must evenly divide DstTy.
  if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
      (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely SrcTy (smaller than register size) packed in DstTy
  // (larger than register size) and since merge was not combined it will be
  // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
  // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.

  // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
  //
  // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
  // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
  // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
  SmallVector<Register, 8> NarrowTyElts;
  unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
  unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
  unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
  for (unsigned i = 0; i < NumParts; ++i) {
    SmallVector<Register, 8> Sources;
    for (unsigned j = 0; j < NumElts; ++j)
      Sources.push_back(Elt: MI.getOperand(i: 1 + i * NumElts + j).getReg());
    NarrowTyElts.push_back(
        Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Sources).getReg(Idx: 0));
  }

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
  MI.eraseFromParent();
  return Legalized;
}
5339
/// Narrow the vector operand of G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT.
/// With a constant index the operation is rewritten to act on the single
/// NarrowVecTy piece that contains the element; with a variable index the
/// operation is lowered through a stack temporary instead.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  auto [DstReg, SrcVec] = MI.getFirst2Regs();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(i: 2).getReg();

  // The index is always the last operand for both opcodes.
  Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
  LLT VecTy = MRI.getType(Reg: SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  auto MaybeCst = getIConstantVRegValWithLookThrough(VReg: Idx, MRI);
  if (MaybeCst) {
    uint64_t IdxVal = MaybeCst->Value.getZExtValue();
    // Avoid out of bounds indexing the pieces.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(Res: DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    if (!NarrowVecTy.isVector()) {
      // Fully scalarized: split into individual elements and pick/replace the
      // one addressed by the constant index.
      SmallVector<Register, 8> SplitPieces;
      extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowVecTy,
                   NumParts: VecTy.getNumElements(), VRegs&: SplitPieces, MIRBuilder, MRI);
      if (IsInsert) {
        SplitPieces[IdxVal] = InsertVal;
        MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: SplitPieces);
      } else {
        MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: SplitPieces[IdxVal]);
      }
    } else {
      SmallVector<Register, 8> VecParts;
      LLT GCDTy = extractGCDType(Parts&: VecParts, DstTy: VecTy, NarrowTy: NarrowVecTy, SrcReg: SrcVec);

      // Build a sequence of NarrowTy pieces in VecParts for this operand.
      LLT LCMTy = buildLCMMergePieces(DstTy: VecTy, NarrowTy: NarrowVecTy, GCDTy, VRegs&: VecParts,
                                      PadStrategy: TargetOpcode::G_ANYEXT);

      unsigned NewNumElts = NarrowVecTy.getNumElements();

      // Which sub-vector holds the element, and the element's index within it.
      LLT IdxTy = MRI.getType(Reg: Idx);
      int64_t PartIdx = IdxVal / NewNumElts;
      auto NewIdx =
          MIRBuilder.buildConstant(Res: IdxTy, Val: IdxVal - NewNumElts * PartIdx);

      if (IsInsert) {
        LLT PartTy = MRI.getType(Reg: VecParts[PartIdx]);

        // Use the adjusted index to insert into one of the subvectors.
        auto InsertPart = MIRBuilder.buildInsertVectorElement(
            Res: PartTy, Val: VecParts[PartIdx], Elt: InsertVal, Idx: NewIdx);
        VecParts[PartIdx] = InsertPart.getReg(Idx: 0);

        // Recombine the inserted subvector with the others to reform the result
        // vector.
        buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: VecParts);
      } else {
        MIRBuilder.buildExtractVectorElement(Res: DstReg, Val: VecParts[PartIdx], Idx: NewIdx);
      }
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
5419
/// Break a wide G_LOAD / G_STORE into NarrowTy-sized memory accesses (plus a
/// leftover piece when the value type is not an even multiple of NarrowTy).
/// Loads are recombined into the original value with insertParts.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // Each narrowed access needs a byte-addressable size.
  if (!NarrowTy.isByteSized()) {
    LLVM_DEBUG(dbgs() << "Can't narrow load/store to non-byte-sized type\n");
    return UnableToLegalize;
  }

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (LdStMI.isAtomic())
    return UnableToLegalize;

  bool IsLoad = isa<GLoad>(Val: LdStMI);
  Register ValReg = LdStMI.getReg(Idx: 0);
  Register AddrReg = LdStMI.getPointerReg();
  LLT ValTy = MRI.getType(Reg: ValReg);

  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    // For loads only the breakdown counts are needed; the part registers are
    // created by the loads themselves below.
    std::tie(args&: NumParts, args&: NumLeftover) = getNarrowTypeBreakDown(OrigTy: ValTy, NarrowTy, LeftoverTy);
  } else {
    // For stores the value must be split up-front so each piece can be stored.
    if (extractParts(Reg: ValReg, RegTy: ValTy, MainTy: NarrowTy, LeftoverTy, VRegs&: NarrowRegs,
                     LeftoverVRegs&: NarrowLeftoverRegs, MIRBuilder, MRI)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(Reg: AddrReg);
  const LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each elements
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned NumParts, unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      // Address of this piece = base pointer + constant byte offset.
      MIRBuilder.materializeObjectPtrOffset(Res&: NewAddrReg, Op0: AddrReg, ValueTy: OffsetTy,
                                            Value: ByteOffset);

      // Derive a memory operand for the piece from the original access.
      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(MMO: &MMO, Offset: ByteOffset, Ty: PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(Ty: PartTy);
        ValRegs.push_back(Elt: Dst);
        MIRBuilder.buildLoad(Res: Dst, Addr: NewAddrReg, MMO&: *NewMMO);
      } else {
        MIRBuilder.buildStore(Val: ValRegs[Idx], Addr: NewAddrReg, MMO&: *NewMMO);
      }
      // Big-endian walks from the high end of the value towards offset 0.
      Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
    }

    return Offset;
  };

  unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
  unsigned HandledOffset =
      splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);

  // Reassemble the original wide value from the loaded pieces.
  if (IsLoad) {
    insertParts(DstReg: ValReg, ResultTy: ValTy, PartTy: NarrowTy, PartRegs: NarrowRegs,
                LeftoverTy, LeftoverRegs: NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}
5520
/// Top-level dispatch for the FewerElements legalize action: route \p MI to
/// the appropriate vector-splitting helper based on its opcode.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;
  GenericMachineInstr &GMI = cast<GenericMachineInstr>(Val&: MI);
  // NumElts == 1 means full scalarization.
  unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  switch (MI.getOpcode()) {
  // Element-wise operations where every vector operand shares the element
  // count: split uniformly via fewerElementsVectorMultiEltType.
  case G_IMPLICIT_DEF:
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FEXP10:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FLDEXP:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_LRINT:
  case G_INTRINSIC_LLRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_LROUND:
  case G_LLROUND:
  case G_INTRINSIC_TRUNC:
  case G_FMODF:
  case G_FCOS:
  case G_FSIN:
  case G_FTAN:
  case G_FACOS:
  case G_FASIN:
  case G_FATAN:
  case G_FATAN2:
  case G_FCOSH:
  case G_FSINH:
  case G_FTANH:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
  case G_UMULO:
  case G_SMULO:
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_FPTOSI_SAT:
  case G_FPTOUI_SAT:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
  case G_UADDO:
  case G_USUBO:
  case G_UADDE:
  case G_USUBE:
  case G_SADDO:
  case G_SSUBO:
  case G_SADDE:
  case G_SSUBE:
  case G_STRICT_FADD:
  case G_STRICT_FSUB:
  case G_STRICT_FMUL:
  case G_STRICT_FMA:
  case G_STRICT_FLDEXP:
  case G_FFREXP:
  case G_TRUNC_SSAT_S:
  case G_TRUNC_SSAT_U:
  case G_TRUNC_USAT_U:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
  // Operations with non-vector operands that must be broadcast rather than
  // split; the operand index is passed in NonVecOpIndices.
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*cmp predicate*/});
  case G_IS_FPCLASS:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2, 3 /*mask,fpsem*/});
  case G_SELECT:
    // A vector condition is split with the operands; a scalar condition is
    // broadcast to every piece.
    if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector())
      return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*scalar cond*/});
  case G_PHI:
    return fewerElementsVectorPhi(MI&: GMI, NumElts);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowVecTy: NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(LdStMI&: cast<GLoadStore>(Val&: MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*imm*/});
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_VECREDUCE_SEQ_FADD:
  case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
    return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  case G_FPOWI:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*pow*/});
  case G_BITCAST:
    return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
  case G_INTRINSIC_FPTRUNC_ROUND:
    return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2});
  default:
    return UnableToLegalize;
  }
}
5701
/// Narrow a G_BITCAST by splitting the source into pieces whose bit width
/// matches NarrowTy, bitcasting each piece, and merging the results.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
                                      LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
         "Not a bitcast operation");

  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

  // Number of source elements that fit in one NarrowTy-sized piece.
  unsigned NewElemCount =
      NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
  SmallVector<Register> SrcVRegs, BitcastVRegs;
  if (NewElemCount == 1) {
    // Each piece is a single source element: a plain unmerge suffices.
    LLT SrcNarrowTy = SrcTy.getElementType();

    auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcNarrowTy, Op: SrcReg);
    getUnmergeResults(Regs&: SrcVRegs, MI: *Unmerge);
  } else {
    LLT SrcNarrowTy =
        SrcTy.changeVectorElementCount(EC: ElementCount::getFixed(MinVal: NewElemCount));

    // Split the Src and Dst Reg into smaller registers
    if (extractGCDType(Parts&: SrcVRegs, DstTy, NarrowTy: SrcNarrowTy, SrcReg) != SrcNarrowTy)
      return UnableToLegalize;
  }

  // Build new smaller bitcast instructions
  // Not supporting Leftover types for now but will have to
  for (Register Reg : SrcVRegs)
    BitcastVRegs.push_back(Elt: MIRBuilder.buildBitcast(Dst: NarrowTy, Src: Reg).getReg(Idx: 0));

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: BitcastVRegs);
  MI.eraseFromParent();
  return Legalized;
}
5739
// Split a G_SHUFFLE_VECTOR whose result type is too wide into two shuffles
// (or build-vectors) over vectors of half the element count, then merge the
// two halves back into the original destination register.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
      MI.getFirst3RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  // Halving below (and any recursive re-halving) requires a power-of-2
  // element count.
  if (!isPowerOf2_32(Value: DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
  unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  // Split both sources in half; the four half-vectors are the candidate
  // shuffle operands for each output half.
  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Reg: Src1Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc1Regs, MIRBuilder, MRI);
  extractParts(Reg: Src2Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc2Regs, MIRBuilder, MRI);
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs. Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element. This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= std::size(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(Elt: -1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= std::size(InputUsed)) {
        // More than two input vectors used! Give up on trying to create a
        // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Elt: Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element. This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= std::size(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(Elt: MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(Elt: MIRBuilder
                                 .buildExtractVectorElement(
                                     Res: EltTy, Val: Inputs[Input],
                                     Idx: MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: Idx))
                                 .getReg(Idx: 0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(Res: NarrowTy, Ops: SVOps).getReg(Idx: 0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
    } else if (NewElts == 1) {
      // A single-element half degenerates into a copy of the one used input.
      Output = MIRBuilder.buildCopy(Res: NarrowTy, Op: Inputs[InputUsed[0]]).getReg(Idx: 0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(Res: NarrowTy, Src1: Op0, Src2: Op1, Mask: Ops).getReg(Idx: 0);
    }

    // Reset the accumulated mask before processing the other (Hi) half.
    Ops.clear();
  }

  // Reassemble the wide destination from the two computed halves.
  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
5879
5880LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
5881 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5882 auto &RdxMI = cast<GVecReduce>(Val&: MI);
5883
5884 if (TypeIdx != 1)
5885 return UnableToLegalize;
5886
5887 // The semantics of the normal non-sequential reductions allow us to freely
5888 // re-associate the operation.
5889 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
5890
5891 if (NarrowTy.isVector() &&
5892 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
5893 return UnableToLegalize;
5894
5895 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
5896 SmallVector<Register> SplitSrcs;
5897 // If NarrowTy is a scalar then we're being asked to scalarize.
5898 const unsigned NumParts =
5899 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
5900 : SrcTy.getNumElements();
5901
5902 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5903 if (NarrowTy.isScalar()) {
5904 if (DstTy != NarrowTy)
5905 return UnableToLegalize; // FIXME: handle implicit extensions.
5906
5907 if (isPowerOf2_32(Value: NumParts)) {
5908 // Generate a tree of scalar operations to reduce the critical path.
5909 SmallVector<Register> PartialResults;
5910 unsigned NumPartsLeft = NumParts;
5911 while (NumPartsLeft > 1) {
5912 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
5913 PartialResults.emplace_back(
5914 Args: MIRBuilder
5915 .buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy},
5916 SrcOps: {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
5917 .getReg(Idx: 0));
5918 }
5919 SplitSrcs = PartialResults;
5920 PartialResults.clear();
5921 NumPartsLeft = SplitSrcs.size();
5922 }
5923 assert(SplitSrcs.size() == 1);
5924 MIRBuilder.buildCopy(Res: DstReg, Op: SplitSrcs[0]);
5925 MI.eraseFromParent();
5926 return Legalized;
5927 }
5928 // If we can't generate a tree, then just do sequential operations.
5929 Register Acc = SplitSrcs[0];
5930 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
5931 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[Idx]})
5932 .getReg(Idx: 0);
5933 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5934 MI.eraseFromParent();
5935 return Legalized;
5936 }
5937 SmallVector<Register> PartialReductions;
5938 for (unsigned Part = 0; Part < NumParts; ++Part) {
5939 PartialReductions.push_back(
5940 Elt: MIRBuilder.buildInstr(Opc: RdxMI.getOpcode(), DstOps: {DstTy}, SrcOps: {SplitSrcs[Part]})
5941 .getReg(Idx: 0));
5942 }
5943
5944 // If the types involved are powers of 2, we can generate intermediate vector
5945 // ops, before generating a final reduction operation.
5946 if (isPowerOf2_32(Value: SrcTy.getNumElements()) &&
5947 isPowerOf2_32(Value: NarrowTy.getNumElements())) {
5948 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
5949 }
5950
5951 Register Acc = PartialReductions[0];
5952 for (unsigned Part = 1; Part < NumParts; ++Part) {
5953 if (Part == NumParts - 1) {
5954 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {DstReg},
5955 SrcOps: {Acc, PartialReductions[Part]});
5956 } else {
5957 Acc = MIRBuilder
5958 .buildInstr(Opc: ScalarOpc, DstOps: {DstTy}, SrcOps: {Acc, PartialReductions[Part]})
5959 .getReg(Idx: 0);
5960 }
5961 }
5962 MI.eraseFromParent();
5963 return Legalized;
5964}
5965
5966LegalizerHelper::LegalizeResult
5967LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
5968 unsigned int TypeIdx,
5969 LLT NarrowTy) {
5970 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
5971 MI.getFirst3RegLLTs();
5972 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
5973 DstTy != NarrowTy)
5974 return UnableToLegalize;
5975
5976 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
5977 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
5978 "Unexpected vecreduce opcode");
5979 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
5980 ? TargetOpcode::G_FADD
5981 : TargetOpcode::G_FMUL;
5982
5983 SmallVector<Register> SplitSrcs;
5984 unsigned NumParts = SrcTy.getNumElements();
5985 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
5986 Register Acc = ScalarReg;
5987 for (unsigned i = 0; i < NumParts; i++)
5988 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[i]})
5989 .getReg(Idx: 0);
5990
5991 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
5992 MI.eraseFromParent();
5993 return Legalized;
5994}
5995
5996LegalizerHelper::LegalizeResult
5997LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
5998 LLT SrcTy, LLT NarrowTy,
5999 unsigned ScalarOpc) {
6000 SmallVector<Register> SplitSrcs;
6001 // Split the sources into NarrowTy size pieces.
6002 extractParts(Reg: SrcReg, Ty: NarrowTy,
6003 NumParts: SrcTy.getNumElements() / NarrowTy.getNumElements(), VRegs&: SplitSrcs,
6004 MIRBuilder, MRI);
6005 // We're going to do a tree reduction using vector operations until we have
6006 // one NarrowTy size value left.
6007 while (SplitSrcs.size() > 1) {
6008 SmallVector<Register> PartialRdxs;
6009 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
6010 Register LHS = SplitSrcs[Idx];
6011 Register RHS = SplitSrcs[Idx + 1];
6012 // Create the intermediate vector op.
6013 Register Res =
6014 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {LHS, RHS}).getReg(Idx: 0);
6015 PartialRdxs.push_back(Elt: Res);
6016 }
6017 SplitSrcs = std::move(PartialRdxs);
6018 }
6019 // Finally generate the requested NarrowTy based reduction.
6020 Observer.changingInstr(MI);
6021 MI.getOperand(i: 1).setReg(SplitSrcs[0]);
6022 Observer.changedInstr(MI);
6023 return Legalized;
6024}
6025
// Expand a shift of a double-width scalar by a *constant* amount into
// operations on two HalfTy halves. Because the amount is known, each opcode
// picks the exact Lo/Hi formula for the interesting ranges of Amt
// (0, < NVTBits, == NVTBits, > NVTBits, > VTBits) instead of emitting selects.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  // Split the wide source operand into low/high halves.
  Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
  Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
  MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));

  if (Amt.isZero()) {
    // Shift by zero: just reassemble the input halves.
    MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits; // Width of the original, unsplit value.

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(RHS: VTBits)) {
      // Everything is shifted out: both halves become zero.
      Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else if (Amt.ugt(RHS: NVTBits)) {
      // Low half fully shifted out; high half comes entirely from InL.
      Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
      Hi = MIRBuilder.buildShl(Dst: NVT, Src0: InL,
                               Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly one half-width: InL moves wholesale into the high half.
      Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
      Hi = InL;
    } else {
      // Sub-half-width shift: Hi combines shifted InH with the bits that
      // spill over from the top of InL.
      Lo = MIRBuilder.buildShl(Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
      auto OrLHS =
          MIRBuilder.buildShl(Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    // Mirror image of the G_SHL cases: bits move from the high half down.
    if (Amt.ugt(RHS: VTBits)) {
      Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else if (Amt.ugt(RHS: NVTBits)) {
      Lo = MIRBuilder.buildLShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);

      // Lo combines shifted InL with bits spilling down from InH.
      auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
      Hi = MIRBuilder.buildLShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
    }
  } else {
    // G_ASHR: like G_LSHR, but vacated positions are filled with copies of
    // the sign bit, obtained as AShr(InH, NVTBits - 1).
    if (Amt.ugt(RHS: VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
    } else if (Amt.ugt(RHS: NVTBits)) {
      Lo = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
                                Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);

      auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
      Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
    }
  }

  // Glue the two computed halves back together into the destination.
  MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
6113
// Narrow a scalar G_SHL/G_LSHR/G_ASHR. TypeIdx 1 narrows only the shift
// amount operand. TypeIdx 0 narrows the shifted value itself, using either
// the multi-way expansion (for large part counts), the constant-amount
// expansion, or the general compare+select expansion into two halves.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  if (TypeIdx == 1) {
    // Only the shift-amount operand needs narrowing.
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy: RequestedTy, OpIdx: 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(i: 2).getReg();
  LLT ShiftAmtTy = MRI.getType(Reg: Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  // Halving below requires an even bit width.
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Check if we should use multi-way splitting instead of recursive binary
  // splitting.
  //
  // Multi-way splitting directly decomposes wide shifts (e.g., 128-bit ->
  // 4×32-bit) in a single legalization step, avoiding the recursive overhead
  // and dependency chains created by usual binary splitting approach
  // (128->64->32).
  //
  // The >= 8 parts threshold ensures we only use this optimization when binary
  // splitting would require multiple recursive passes, avoiding overhead for
  // simple 2-way splits where binary approach is sufficient.
  if (RequestedTy.isValid() && RequestedTy.isScalar() &&
      DstEltSize % RequestedTy.getSizeInBits() == 0) {
    const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits();
    // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive
    // steps).
    if (NumParts >= 8)
      return narrowScalarShiftMultiway(MI, TargetTy: RequestedTy);
  }

  // Fall back to binary splitting:
  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(SizeInBits: NewBitSize);
  const LLT CondTy = LLT::scalar(SizeInBits: 1);

  // A known-constant amount allows a much simpler expansion.
  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, Amt: VRegAndVal->Value, HalfTy,
                                       AmtTy: ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
  Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
  MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));

  // AmtExcess = Amt - NewBitSize (used when the shift crosses the halves);
  // AmtLack = NewBitSize - Amt (width of the carry-over within a half).
  auto AmtExcess = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: Amt, Src1: NewBits);
  auto AmtLack = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: NewBits, Src1: Amt);

  auto Zero = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
  // IsZero is needed separately: when Amt == 0, AmtLack == NewBitSize and a
  // shift by the full half-width would be out of range, so the original
  // halves are selected directly in that case.
  auto IsShort = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: CondTy, Op0: Amt, Op1: NewBits);
  auto IsZero = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: CondTy, Op0: Amt, Op1: Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: Amt);

    auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: AmtLack);
    auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: Amt);
    auto HiS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: AmtExcess); // Hi from Lo part.

    auto Lo = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL);
    auto Hi = MIRBuilder.buildSelect(
        Res: HalfTy, Tst: IsZero, Op0: InH, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL));

    ResultRegs[0] = Lo.getReg(Idx: 0);
    ResultRegs[1] = Hi.getReg(Idx: 0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy}, SrcOps: {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: Amt);
    auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: AmtLack);
    auto LoS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(Dst: HalfTy, Src0: InH, Src1: ShiftAmt); // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy},
                                     SrcOps: {InH, AmtExcess}); // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        Res: HalfTy, Tst: IsZero, Op0: InL, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL));

    auto Hi = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL);

    ResultRegs[0] = Lo.getReg(Idx: 0);
    ResultRegs[1] = Hi.getReg(Idx: 0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  // Merge the two half results back into the wide destination.
  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
6243
// Compute output part PartIdx (of NumParts) for a multi-word shift whose
// amount is a known constant. Params holds the pre-built constant registers:
// the decomposition into WordShift (whole parts) / BitShift (bits within a
// part), plus Zero and (for G_ASHR) SignBit fill values.
// Note: ShiftAmtTy is currently unused in this helper.
Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode,
                                                 unsigned PartIdx,
                                                 unsigned NumParts,
                                                 ArrayRef<Register> SrcParts,
                                                 const ShiftParams &Params,
                                                 LLT TargetTy, LLT ShiftAmtTy) {
  // Recover the constant WordShift/BitShift values from the registers.
  auto WordShiftConst = getIConstantVRegVal(VReg: Params.WordShift, MRI);
  auto BitShiftConst = getIConstantVRegVal(VReg: Params.BitShift, MRI);
  assert(WordShiftConst && BitShiftConst && "Expected constants");

  const unsigned ShiftWords = WordShiftConst->getZExtValue();
  const unsigned ShiftBits = BitShiftConst->getZExtValue();
  // Bits cross part boundaries only when the in-part shift is non-zero;
  // otherwise the result is a pure re-indexing of the source parts.
  const bool NeedsInterWordShift = ShiftBits != 0;

  switch (Opcode) {
  case TargetOpcode::G_SHL: {
    // Data moves from lower indices to higher indices
    // If this part would come from a source beyond our range, it's zero
    if (PartIdx < ShiftWords)
      return Params.Zero;

    unsigned SrcIdx = PartIdx - ShiftWords;
    if (!NeedsInterWordShift)
      return SrcParts[SrcIdx];

    // Combine shifted main part with carry from previous part
    auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
    if (SrcIdx > 0) {
      auto Lo = MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx - 1],
                                     Src1: Params.InvBitShift);
      return MIRBuilder.buildOr(Dst: TargetTy, Src0: Hi, Src1: Lo).getReg(Idx: 0);
    }
    // Lowest contributing part has no neighbour below it to carry from.
    return Hi.getReg(Idx: 0);
  }

  case TargetOpcode::G_LSHR: {
    // Data moves from higher indices to lower indices.
    unsigned SrcIdx = PartIdx + ShiftWords;
    if (SrcIdx >= NumParts)
      return Params.Zero;
    if (!NeedsInterWordShift)
      return SrcParts[SrcIdx];

    // Combine shifted main part with carry from next part
    auto Lo = MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
    if (SrcIdx + 1 < NumParts) {
      auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: SrcParts[SrcIdx + 1],
                                    Src1: Params.InvBitShift);
      return MIRBuilder.buildOr(Dst: TargetTy, Src0: Lo, Src1: Hi).getReg(Idx: 0);
    }
    // Topmost part has no neighbour above it; vacated bits stay zero.
    return Lo.getReg(Idx: 0);
  }

  case TargetOpcode::G_ASHR: {
    // Like LSHR but preserves sign bit
    unsigned SrcIdx = PartIdx + ShiftWords;
    if (SrcIdx >= NumParts)
      return Params.SignBit;
    if (!NeedsInterWordShift)
      return SrcParts[SrcIdx];

    // Only the original MSB part uses arithmetic shift to preserve sign. All
    // other parts use logical shift since they're just moving data bits.
    auto Lo =
        (SrcIdx == NumParts - 1)
            ? MIRBuilder.buildAShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift)
            : MIRBuilder.buildLShr(Dst: TargetTy, Src0: SrcParts[SrcIdx], Src1: Params.BitShift);
    // Above the top part, the carry-in comes from the sign-fill value.
    Register HiSrc =
        (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit;
    auto Hi = MIRBuilder.buildShl(Dst: TargetTy, Src0: HiSrc, Src1: Params.InvBitShift);
    return MIRBuilder.buildOr(Dst: TargetTy, Src0: Lo, Src1: Hi).getReg(Idx: 0);
  }

  default:
    llvm_unreachable("not a shift");
  }
}
6320
6321Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode,
6322 Register MainOperand,
6323 Register ShiftAmt,
6324 LLT TargetTy,
6325 Register CarryOperand) {
6326 // This helper generates a single output part for variable shifts by combining
6327 // the main operand (shifted by BitShift) with carry bits from an adjacent
6328 // part.
6329
6330 // For G_ASHR, individual parts don't have their own sign bit, only the
6331 // complete value does. So we use LSHR for the main operand shift in ASHR
6332 // context.
6333 unsigned MainOpcode = (Opcode == TargetOpcode::G_ASHR)
6334 ? static_cast<unsigned>(TargetOpcode::G_LSHR)
6335 : Opcode;
6336
6337 // Perform the primary shift on the main operand
6338 Register MainShifted =
6339 MIRBuilder.buildInstr(Opc: MainOpcode, DstOps: {TargetTy}, SrcOps: {MainOperand, ShiftAmt})
6340 .getReg(Idx: 0);
6341
6342 // No carry operand available
6343 if (!CarryOperand.isValid())
6344 return MainShifted;
6345
6346 // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs,
6347 // so carry bits aren't needed.
6348 LLT ShiftAmtTy = MRI.getType(Reg: ShiftAmt);
6349 auto ZeroConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
6350 LLT BoolTy = LLT::scalar(SizeInBits: 1);
6351 auto IsZeroBitShift =
6352 MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy, Op0: ShiftAmt, Op1: ZeroConst);
6353
6354 // Extract bits from the adjacent part that will "carry over" into this part.
6355 // The carry direction is opposite to the main shift direction, so we can
6356 // align the two shifted values before combining them with OR.
6357
6358 // Determine the carry shift opcode (opposite direction)
6359 unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR
6360 : TargetOpcode::G_SHL;
6361
6362 // Calculate inverse shift amount: BitWidth - ShiftAmt
6363 auto TargetBitsConst =
6364 MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetTy.getScalarSizeInBits());
6365 auto InvShiftAmt = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: TargetBitsConst, Src1: ShiftAmt);
6366
6367 // Shift the carry operand
6368 Register CarryBits =
6369 MIRBuilder
6370 .buildInstr(Opc: CarryOpcode, DstOps: {TargetTy}, SrcOps: {CarryOperand, InvShiftAmt})
6371 .getReg(Idx: 0);
6372
6373 // If BitShift is 0, don't include carry bits (InvShiftAmt would equal
6374 // TargetBits which would be poison for the individual carry shift operation).
6375 auto ZeroReg = MIRBuilder.buildConstant(Res: TargetTy, Val: 0);
6376 Register SafeCarryBits =
6377 MIRBuilder.buildSelect(Res: TargetTy, Tst: IsZeroBitShift, Op0: ZeroReg, Op1: CarryBits)
6378 .getReg(Idx: 0);
6379
6380 // Combine the main shifted part with the carry bits
6381 return MIRBuilder.buildOr(Dst: TargetTy, Src0: MainShifted, Src1: SafeCarryBits).getReg(Idx: 0);
6382}
6383
// Multi-way narrowing of a wide shift whose amount is a known constant: the
// amount decomposes into WordShift whole parts plus BitShift bits, so each
// output part can be computed directly from at most two source parts, with
// no runtime selects.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI,
                                                     const APInt &Amt,
                                                     LLT TargetTy,
                                                     LLT ShiftAmtTy) {
  // Any wide shift can be decomposed into WordShift + BitShift components.
  // When shift amount is known constant, directly compute the decomposition
  // values and generate constant registers.
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SrcReg = MI.getOperand(i: 1).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);

  const unsigned DstBits = DstTy.getScalarSizeInBits();
  const unsigned TargetBits = TargetTy.getScalarSizeInBits();
  const unsigned NumParts = DstBits / TargetBits;

  assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");

  // When the shift amount is known at compile time, we just calculate which
  // source parts contribute to each output part.

  SmallVector<Register, 8> SrcParts;
  extractParts(Reg: SrcReg, Ty: TargetTy, NumParts, VRegs&: SrcParts, MIRBuilder, MRI);

  if (Amt.isZero()) {
    // No shift needed, just copy
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcParts);
    MI.eraseFromParent();
    return Legalized;
  }

  ShiftParams Params;
  const unsigned ShiftWords = Amt.getZExtValue() / TargetBits;
  const unsigned ShiftBits = Amt.getZExtValue() % TargetBits;

  // Generate constants and values needed by all shift types
  Params.WordShift = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: ShiftWords).getReg(Idx: 0);
  Params.BitShift = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: ShiftBits).getReg(Idx: 0);
  Params.InvBitShift =
      MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - ShiftBits).getReg(Idx: 0);
  Params.Zero = MIRBuilder.buildConstant(Res: TargetTy, Val: 0).getReg(Idx: 0);

  // For ASHR, we need the sign-extended value to fill shifted-out positions
  if (MI.getOpcode() == TargetOpcode::G_ASHR)
    Params.SignBit =
        MIRBuilder
            .buildAShr(Dst: TargetTy, Src0: SrcParts[SrcParts.size() - 1],
                       Src1: MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1))
            .getReg(Idx: 0);

  // Each output part is computed independently from the relevant source parts.
  SmallVector<Register, 8> DstParts(NumParts);
  for (unsigned I = 0; I < NumParts; ++I)
    DstParts[I] = buildConstantShiftPart(Opcode: MI.getOpcode(), PartIdx: I, NumParts, SrcParts,
                                         Params, TargetTy, ShiftAmtTy);

  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstParts);
  MI.eraseFromParent();
  return Legalized;
}
6443
// Multi-way narrowing of a wide scalar shift into NumParts TargetTy-sized
// parts in a single step. Constant amounts are dispatched to the direct
// expansion; runtime amounts are handled with per-part select chains over all
// possible word-shift values.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) {
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SrcReg = MI.getOperand(i: 1).getReg();
  Register AmtReg = MI.getOperand(i: 2).getReg();
  LLT DstTy = MRI.getType(Reg: DstReg);
  LLT ShiftAmtTy = MRI.getType(Reg: AmtReg);

  const unsigned DstBits = DstTy.getScalarSizeInBits();
  const unsigned TargetBits = TargetTy.getScalarSizeInBits();
  const unsigned NumParts = DstBits / TargetBits;

  assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
  assert(isPowerOf2_32(TargetBits) && "Target bit width must be power of 2");

  // If the shift amount is known at compile time, we can use direct indexing
  // instead of generating select chains in the general case.
  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: AmtReg, MRI))
    return narrowScalarShiftByConstantMultiway(MI, Amt: VRegAndVal->Value, TargetTy,
                                               ShiftAmtTy);

  // For runtime-variable shift amounts, we must generate a more complex
  // sequence that handles all possible shift values using select chains.

  // Split the input into target-sized pieces
  SmallVector<Register, 8> SrcParts;
  extractParts(Reg: SrcReg, Ty: TargetTy, NumParts, VRegs&: SrcParts, MIRBuilder, MRI);

  // Shifting by zero should be a no-op.
  auto ZeroAmtConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
  LLT BoolTy = LLT::scalar(SizeInBits: 1);
  auto IsZeroShift =
      MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy, Op0: AmtReg, Op1: ZeroAmtConst);

  // Any wide shift can be decomposed into two components:
  // 1. WordShift: number of complete target-sized words to shift
  // 2. BitShift: number of bits to shift within each word
  //
  // Example: 128-bit >> 50 with 32-bit target:
  //   WordShift = 50 / 32 = 1 (shift right by 1 complete word)
  //   BitShift = 50 % 32 = 18 (shift each word right by 18 bits)
  //
  // TargetBits is a power of 2, so divide/modulo reduce to shift/mask.
  unsigned TargetBitsLog2 = Log2_32(Value: TargetBits);
  auto TargetBitsLog2Const =
      MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBitsLog2);
  auto TargetBitsMask = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1);

  Register WordShift =
      MIRBuilder.buildLShr(Dst: ShiftAmtTy, Src0: AmtReg, Src1: TargetBitsLog2Const).getReg(Idx: 0);
  Register BitShift =
      MIRBuilder.buildAnd(Dst: ShiftAmtTy, Src0: AmtReg, Src1: TargetBitsMask).getReg(Idx: 0);

  // Fill values:
  // - SHL/LSHR: fill with zeros
  // - ASHR: fill with sign-extended MSB
  Register ZeroReg = MIRBuilder.buildConstant(Res: TargetTy, Val: 0).getReg(Idx: 0);

  Register FillValue;
  if (MI.getOpcode() == TargetOpcode::G_ASHR) {
    auto TargetBitsMinusOneConst =
        MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: TargetBits - 1);
    FillValue = MIRBuilder
                    .buildAShr(Dst: TargetTy, Src0: SrcParts[NumParts - 1],
                               Src1: TargetBitsMinusOneConst)
                    .getReg(Idx: 0);
  } else {
    FillValue = ZeroReg;
  }

  SmallVector<Register, 8> DstParts(NumParts);

  // For each output part, generate a select chain that chooses the correct
  // result based on the runtime WordShift value. This handles all possible
  // word shift amounts by pre-calculating what each would produce.
  for (unsigned I = 0; I < NumParts; ++I) {
    // Initialize with appropriate default value for this shift type
    Register InBoundsResult = FillValue;

    // clang-format off
    // Build a branchless select chain by pre-computing results for all possible
    // WordShift values (0 to NumParts-1). Each iteration nests a new select:
    //
    // K=0: select(WordShift==0, result0, FillValue)
    // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue))
    // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...)))
    // clang-format on
    for (unsigned K = 0; K < NumParts; ++K) {
      auto WordShiftKConst = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: K);
      auto IsWordShiftK = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: BoolTy,
                                               Op0: WordShift, Op1: WordShiftKConst);

      // Calculate source indices for this word shift
      //
      // For 4-part 128-bit value with K=1 word shift:
      // SHL:  [3][2][1][0] << K => [2][1][0][Z]
      //   -> (MainIdx = I-K, CarryIdx = I-K-1)
      // LSHR: [3][2][1][0] >> K => [Z][3][2][1]
      //   -> (MainIdx = I+K, CarryIdx = I+K+1)
      int MainSrcIdx;
      int CarrySrcIdx; // Index for the word that provides the carried-in bits.

      switch (MI.getOpcode()) {
      case TargetOpcode::G_SHL:
        MainSrcIdx = (int)I - (int)K;
        CarrySrcIdx = MainSrcIdx - 1;
        break;
      case TargetOpcode::G_LSHR:
      case TargetOpcode::G_ASHR:
        MainSrcIdx = (int)I + (int)K;
        CarrySrcIdx = MainSrcIdx + 1;
        break;
      default:
        llvm_unreachable("Not a shift");
      }

      // Check bounds and build the result for this word shift
      Register ResultForK;
      if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) {
        Register MainOp = SrcParts[MainSrcIdx];
        // CarryOp stays invalid (default Register) when there is no
        // neighbouring word; buildVariableShiftPart then omits the carry OR.
        Register CarryOp;

        // Determine carry operand with bounds checking
        if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts)
          CarryOp = SrcParts[CarrySrcIdx];
        else if (MI.getOpcode() == TargetOpcode::G_ASHR &&
                 CarrySrcIdx >= (int)NumParts)
          CarryOp = FillValue; // Use sign extension

        ResultForK = buildVariableShiftPart(Opcode: MI.getOpcode(), MainOperand: MainOp, ShiftAmt: BitShift,
                                            TargetTy, CarryOperand: CarryOp);
      } else {
        // Out of bounds - use fill value for this k
        ResultForK = FillValue;
      }

      // Select this result if WordShift equals k
      InBoundsResult =
          MIRBuilder
              .buildSelect(Res: TargetTy, Tst: IsWordShiftK, Op0: ResultForK, Op1: InBoundsResult)
              .getReg(Idx: 0);
    }

    // Handle zero-shift special case: if shift is 0, use original input
    DstParts[I] =
        MIRBuilder
            .buildSelect(Res: TargetTy, Tst: IsZeroShift, Op0: SrcParts[I], Op1: InBoundsResult)
            .getReg(Idx: 0);
  }

  // Reassemble the wide result from the per-part selects.
  MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstParts);
  MI.eraseFromParent();
  return Legalized;
}
6596
// Widen the result type of a G_PHI to MoreTy (more vector elements). Each
// incoming value is widened in its predecessor block, and the result is
// narrowed back in the PHI's own block, so the observable value is unchanged.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  // PHI operands come in (value, predecessor-MBB) pairs starting at index 1.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
    // The widening of each incoming value must be emitted in its predecessor
    // block, before that block's terminator.
    MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, OpIdx: I);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  // Position the insert point at the last PHI of this block so the conversion
  // of the widened result is emitted adjacent to the PHI group rather than
  // somewhere later in the block.
  MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
  Observer.changedInstr(MI);
  return Legalized;
}
6615
6616MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6617 unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6618 assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6619
6620 switch (Opcode) {
6621 default:
6622 llvm_unreachable(
6623 "getNeutralElementForVecReduce called with invalid opcode!");
6624 case TargetOpcode::G_VECREDUCE_ADD:
6625 case TargetOpcode::G_VECREDUCE_OR:
6626 case TargetOpcode::G_VECREDUCE_XOR:
6627 case TargetOpcode::G_VECREDUCE_UMAX:
6628 return MIRBuilder.buildConstant(Res: Ty, Val: 0);
6629 case TargetOpcode::G_VECREDUCE_MUL:
6630 return MIRBuilder.buildConstant(Res: Ty, Val: 1);
6631 case TargetOpcode::G_VECREDUCE_AND:
6632 case TargetOpcode::G_VECREDUCE_UMIN:
6633 return MIRBuilder.buildConstant(
6634 Res: Ty, Val: APInt::getAllOnes(numBits: Ty.getScalarSizeInBits()));
6635 case TargetOpcode::G_VECREDUCE_SMAX:
6636 return MIRBuilder.buildConstant(
6637 Res: Ty, Val: APInt::getSignedMinValue(numBits: Ty.getSizeInBits()));
6638 case TargetOpcode::G_VECREDUCE_SMIN:
6639 return MIRBuilder.buildConstant(
6640 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getSizeInBits()));
6641 case TargetOpcode::G_VECREDUCE_FADD:
6642 return MIRBuilder.buildFConstant(Res: Ty, Val: -0.0);
6643 case TargetOpcode::G_VECREDUCE_FMUL:
6644 return MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
6645 case TargetOpcode::G_VECREDUCE_FMINIMUM:
6646 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6647 assert(false && "getNeutralElementForVecReduce unimplemented for "
6648 "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6649 }
6650 llvm_unreachable("switch expected to return!");
6651}
6652
// Legalize MI by widening the vector type at TypeIdx to MoreTy (same element
// type, more elements). Per opcode, the relevant source operands are widened
// and the result is converted back to the original type, so callers observe
// unchanged values. Returns UnableToLegalize for unsupported opcode/TypeIdx
// combinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    // Only the result needs widening.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    // Only the stored value (operand 0) is widened.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  // Simple binary operations: widen both sources and the destination.
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FCOPYSIGN:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_STRICT_FADD:
  case TargetOpcode::G_STRICT_FSUB:
  case TargetOpcode::G_STRICT_FMUL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    // The shift operand may have a different scalar type from the source and
    // destination operands.
    LLT ShiftMoreTy = MoreTy.changeElementType(
        NewEltTy: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getElementType());
    moreElementsVectorSrc(MI, MoreTy: ShiftMoreTy, OpIdx: 2);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  // Ternary operations: widen all three sources and the destination.
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_STRICT_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_EXTRACT:
    // Only the (vector) source at TypeIdx 1 is widened; extracting from a
    // padded vector yields the same element.
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    Observer.changedInstr(MI);
    return Legalized;
  // Unary operations: widen source and destination together.
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_BITREVERSE:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
  case TargetOpcode::G_ABS:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT: {
    auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
    if (TypeIdx == 1) {
      // Widening the condition: only supported when it is a scalar and the
      // value types already match MoreTy's element count.
      if (!CondTy.isScalar() ||
          DstTy.getElementCount() != MoreTy.getElementCount())
        return UnableToLegalize;

      // This is turning a scalar select of vectors into a vector
      // select. Broadcast the select condition.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: MoreTy, Src: CondReg);
      Observer.changingInstr(MI);
      MI.getOperand(i: 1).setReg(ShufSplat.getReg(Idx: 0));
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Vector conditions would need per-lane handling; not supported here.
    if (CondTy.isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    // Pad the element list with undefs up to MoreTy's element count, build
    // the wider vector, then drop the trailing elements for the original
    // destination.
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Elt: Op.getReg());
    }

    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(Elt: MIRBuilder.buildUndef(Res: MoreTy.getScalarType()));
    }

    MIRBuilder.buildDeleteTrailingVectorElements(
        Res: MI.getOperand(i: 0).getReg(), Op0: MIRBuilder.buildInstr(Opc, DstOps: {MoreTy}, SrcOps: Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  // Conversions: source and destination have different element types, so
  // derive the "other" side's widened type from its existing element type.
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_FPTRUNC:
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI_SAT:
  case TargetOpcode::G_FPTOUI_SAT:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    Observer.changingInstr(MI);
    LLT SrcExtTy;
    LLT DstExtTy;
    if (TypeIdx == 0) {
      DstExtTy = MoreTy;
      SrcExtTy = MoreTy.changeElementType(
          NewEltTy: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getElementType());
    } else {
      DstExtTy = MoreTy.changeElementType(
          NewEltTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
      SrcExtTy = MoreTy;
    }
    moreElementsVectorSrc(MI, MoreTy: SrcExtTy, OpIdx: 1);
    moreElementsVectorDst(MI, WideTy: DstExtTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
    moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
    // The comparison result keeps its own element type; only the element
    // count is widened.
    LLT CondTy = MoreTy.changeVectorElementType(
        NewEltTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
    moreElementsVectorDst(MI, WideTy: CondTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITCAST: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT SrcTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());

    // Scale the source element count so the widened source and widened
    // destination still have matching total sizes:
    //   NewSrcElts = SrcElts * MoreElts / DstElts (must divide evenly).
    unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
    if (coefficient % DstTy.getNumElements() != 0)
      return UnableToLegalize;

    coefficient = coefficient / DstTy.getNumElements();

    LLT NewTy = SrcTy.changeElementCount(
        EC: ElementCount::get(MinVal: coefficient, Scalable: MoreTy.isScalable()));
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy: NewTy, OpIdx: 1);
    moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VECREDUCE_FADD:
  case TargetOpcode::G_VECREDUCE_FMUL:
  case TargetOpcode::G_VECREDUCE_ADD:
  case TargetOpcode::G_VECREDUCE_MUL:
  case TargetOpcode::G_VECREDUCE_AND:
  case TargetOpcode::G_VECREDUCE_OR:
  case TargetOpcode::G_VECREDUCE_XOR:
  case TargetOpcode::G_VECREDUCE_SMAX:
  case TargetOpcode::G_VECREDUCE_SMIN:
  case TargetOpcode::G_VECREDUCE_UMAX:
  case TargetOpcode::G_VECREDUCE_UMIN: {
    // Pad the input with the reduction's neutral element so the extra lanes
    // do not change the reduced result.
    LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
    MachineOperand &MO = MI.getOperand(i: 1);
    auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO);
    auto NeutralElement = getNeutralElementForVecReduce(
        Opcode: MI.getOpcode(), MIRBuilder, Ty: MoreTy.getElementType());

    // Overwrite the undef padding lanes with the neutral element.
    LLT IdxTy(TLI.getVectorIdxLLT(DL: MIRBuilder.getDataLayout()));
    for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
         i != e; i++) {
      auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: i);
      NewVec = MIRBuilder.buildInsertVectorElement(Res: MoreTy, Val: NewVec,
                                                   Elt: NeutralElement, Idx);
    }

    Observer.changingInstr(MI);
    MO.setReg(NewVec.getReg(Idx: 0));
    Observer.changedInstr(MI);
    return Legalized;
  }

  default:
    return UnableToLegalize;
  }
}
6918
// Rewrite a G_SHUFFLE_VECTOR whose mask length differs from the source vector
// length into a shuffle with matching lengths:
//  - mask shorter than sources: widen the destination and pad the mask with
//    undef (-1) lanes;
//  - mask longer than sources: pad the sources with undef vectors up to a
//    multiple of the source length, remap the mask, and extract the original
//    elements if padding overshot.
LegalizerHelper::LegalizeResult
LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  unsigned MaskNumElts = Mask.size();
  unsigned SrcNumElts = SrcTy.getNumElements();
  LLT DestEltTy = DstTy.getElementType();

  // Already consistent; nothing to do.
  if (MaskNumElts == SrcNumElts)
    return Legalized;

  if (MaskNumElts < SrcNumElts) {
    // Extend mask to match new destination vector size with
    // undef values.
    SmallVector<int, 16> NewMask(SrcNumElts, -1);
    llvm::copy(Range&: Mask, Out: NewMask.begin());

    moreElementsVectorDst(MI, WideTy: SrcTy, OpIdx: 0);
    MIRBuilder.setInstrAndDebugLoc(MI);
    MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
                                  Src1: MI.getOperand(i: 1).getReg(),
                                  Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
    MI.eraseFromParent();

    return Legalized;
  }

  // Round the mask length up to a multiple of the source length so the
  // sources can be grown by whole-vector concatenation.
  unsigned PaddedMaskNumElts = alignTo(Value: MaskNumElts, Align: SrcNumElts);
  unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
  LLT PaddedTy =
      DstTy.changeVectorElementCount(EC: ElementCount::getFixed(MinVal: PaddedMaskNumElts));

  // Create new source vectors by concatenating the initial
  // source vectors with undefined vectors of the same size.
  auto Undef = MIRBuilder.buildUndef(Res: SrcTy);
  SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(Idx: 0));
  SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(Idx: 0));
  MOps1[0] = MI.getOperand(i: 1).getReg();
  MOps2[0] = MI.getOperand(i: 2).getReg();

  auto Src1 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps1);
  auto Src2 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps2);

  // Readjust mask for new input vector length: indices that referred to the
  // second source must be shifted past the first source's new padding.
  SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
  for (unsigned I = 0; I != MaskNumElts; ++I) {
    int Idx = Mask[I];
    if (Idx >= static_cast<int>(SrcNumElts))
      Idx += PaddedMaskNumElts - SrcNumElts;
    MappedOps[I] = Idx;
  }

  // If we got more elements than required, extract subvector.
  if (MaskNumElts != PaddedMaskNumElts) {
    auto Shuffle =
        MIRBuilder.buildShuffleVector(Res: PaddedTy, Src1, Src2, Mask: MappedOps);

    SmallVector<Register, 16> Elts(MaskNumElts);
    for (unsigned I = 0; I < MaskNumElts; ++I) {
      Elts[I] =
          MIRBuilder.buildExtractVectorElementConstant(Res: DestEltTy, Val: Shuffle, Idx: I)
              .getReg(Idx: 0);
    }
    MIRBuilder.buildBuildVector(Res: DstReg, Ops: Elts);
  } else {
    MIRBuilder.buildShuffleVector(Res: DstReg, Src1, Src2, Mask: MappedOps);
  }

  MI.eraseFromParent();
  return LegalizerHelper::LegalizeResult::Legalized;
}
6990
// Widen a G_SHUFFLE_VECTOR's sources and result to MoreTy, remapping mask
// indices so lanes that referred to the second source account for the first
// source's new (wider) length. Shuffles whose destination and source lengths
// already differ are delegated to equalizeVectorShuffleLengths.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  if (DstTy.isVector() && Src1Ty.isVector() &&
      DstTy.getNumElements() != Src1Ty.getNumElements()) {
    return equalizeVectorShuffleLengths(MI);
  }

  if (TypeIdx != 0)
    return UnableToLegalize;

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
  moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);

  // Adjust mask based on new input vector length. Indices < NumElts still
  // point into the first source unchanged; indices >= NumElts pointed into
  // the second source, which now starts at WidenNumElts instead of NumElts.
  SmallVector<int, 16> NewMask(WidenNumElts, -1);
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask[I] = Idx;
    else
      NewMask[I] = Idx - NumElts + WidenNumElts;
  }
  moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
                                Src1: MI.getOperand(i: 1).getReg(),
                                Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
  MI.eraseFromParent();
  return Legalized;
}
7031
// Compute the product of two multi-part integers using grade-school long
// multiplication on NarrowTy-sized "digits". Src1Regs/Src2Regs hold the
// operands split into parts (least significant first); the product's parts
// are written into DstRegs. DstRegs may have more parts than the sources
// (e.g. to hold a full double-width product); the carry out of the final
// part is dropped.
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx], Src1: Src2Regs[DstIdx]).getReg(Idx: 0);
  DstRegs[DstIdx] = FactorSum;

  // Carry accumulated while summing the factors of the previous part; it is
  // folded into the next part's factor list.
  Register CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx. The index ranges clamp i so that
    // both DstIdx - i and i stay within [0, SrcParts).
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(a: DstIdx, b: SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx - i], Src1: Src2Regs[i]);
      Factors.push_back(Elt: Mul.getReg(Idx: 0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(a: DstIdx - 1, b: SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(Dst: NarrowTy, Src0: Src1Regs[DstIdx - 1 - i], Src1: Src2Regs[i]);
      Factors.push_back(Elt: Umulh.getReg(Idx: 0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(Elt: CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      // Use overflow-reporting adds so each carry bit can be zero-extended
      // and summed for the next part.
      MachineInstrBuilder Uaddo =
          B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: Factors[0], Op1: Factors[1]);
      FactorSum = Uaddo.getReg(Idx: 0);
      CarrySum = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1)).getReg(Idx: 0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: FactorSum, Op1: Factors[i]);
        FactorSum = Uaddo.getReg(Idx: 0);
        MachineInstrBuilder Carry = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1));
        CarrySum = B.buildAdd(Dst: NarrowTy, Src0: CarrySum, Src1: Carry).getReg(Idx: 0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(Dst: NarrowTy, Src0: Factors[0], Src1: Factors[1]).getReg(Idx: 0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(Dst: NarrowTy, Src0: FactorSum, Src1: Factors[i]).getReg(Idx: 0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
7094
// Narrow a wide scalar add/sub (with or without carry-in/carry-out) into a
// chain of NarrowTy-sized operations connected through their carry bits.
// The first part uses the overflow opcode (OpO), middle parts the
// carry-propagating opcode (OpE), and the last part OpF (signed variant when
// the original op reports signed overflow).
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT DstType = MRI.getType(Reg: DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    // Only the top part decides signed overflow, so only OpF is signed.
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(i: NumDefs).getReg();
  Register Src2 = MI.getOperand(i: NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(i: 1).getReg();
  // A trailing operand beyond the two sources is the carry-in.
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(i: NumDefs + 2).getReg();

  // Split both sources into NarrowTy-sized parts plus a possible smaller
  // leftover part of LeftoverTy.
  LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Reg: Src1, RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: Src1Regs, LeftoverVRegs&: Src1Left,
               MIRBuilder, MRI);
  extractParts(Reg: Src2, RegTy, MainTy: NarrowTy, LeftoverTy&: DummyTy, VRegs&: Src2Regs, LeftoverVRegs&: Src2Left, MIRBuilder,
               MRI);

  int NarrowParts = Src1Regs.size();
  Src1Regs.append(RHS: Src1Left);
  Src2Regs.append(RHS: Src2Left);
  DstRegs.reserve(N: Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(Ty: MRI.getType(Reg: Src1Regs[i]));
    Register CarryOut;
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;
    else
      CarryOut = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 1));

    if (!CarryIn) {
      // First part (no carry-in yet): plain overflow op.
      MIRBuilder.buildInstr(Opc: OpO, DstOps: {DstReg, CarryOut},
                            SrcOps: {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      // Final part: possibly signed-overflow variant.
      MIRBuilder.buildInstr(Opc: OpF, DstOps: {DstReg, CarryOut},
                            SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(Opc: OpE, DstOps: {DstReg, CarryOut},
                            SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(Elt: DstReg);
    // Chain this part's carry-out into the next part.
    CarryIn = CarryOut;
  }
  insertParts(DstReg: MI.getOperand(i: 0).getReg(), ResultTy: RegTy, PartTy: NarrowTy,
              PartRegs: ArrayRef(DstRegs).take_front(N: NarrowParts), LeftoverTy,
              LeftoverRegs: ArrayRef(DstRegs).drop_front(N: NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
7190
7191LegalizerHelper::LegalizeResult
7192LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
7193 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
7194
7195 LLT Ty = MRI.getType(Reg: DstReg);
7196 if (Ty.isVector())
7197 return UnableToLegalize;
7198
7199 unsigned Size = Ty.getSizeInBits();
7200 unsigned NarrowSize = NarrowTy.getSizeInBits();
7201 if (Size % NarrowSize != 0)
7202 return UnableToLegalize;
7203
7204 unsigned NumParts = Size / NarrowSize;
7205 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
7206 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
7207
7208 SmallVector<Register, 2> Src1Parts, Src2Parts;
7209 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
7210 extractParts(Reg: Src1, Ty: NarrowTy, NumParts, VRegs&: Src1Parts, MIRBuilder, MRI);
7211 extractParts(Reg: Src2, Ty: NarrowTy, NumParts, VRegs&: Src2Parts, MIRBuilder, MRI);
7212 multiplyRegisters(DstRegs&: DstTmpRegs, Src1Regs: Src1Parts, Src2Regs: Src2Parts, NarrowTy);
7213
7214 // Take only high half of registers if this is high mul.
7215 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
7216 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
7217 MI.eraseFromParent();
7218 return Legalized;
7219}
7220
7221LegalizerHelper::LegalizeResult
7222LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
7223 LLT NarrowTy) {
7224 if (TypeIdx != 0)
7225 return UnableToLegalize;
7226
7227 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
7228
7229 Register Src = MI.getOperand(i: 1).getReg();
7230 LLT SrcTy = MRI.getType(Reg: Src);
7231
7232 // If all finite floats fit into the narrowed integer type, we can just swap
7233 // out the result type. This is practically only useful for conversions from
7234 // half to at least 16-bits, so just handle the one case.
7235 if (SrcTy.getScalarType() != LLT::scalar(SizeInBits: 16) ||
7236 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
7237 return UnableToLegalize;
7238
7239 Observer.changingInstr(MI);
7240 narrowScalarDst(MI, NarrowTy, OpIdx: 0,
7241 ExtOpcode: IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
7242 Observer.changedInstr(MI);
7243 return Legalized;
7244}
7245
// Narrow the source (TypeIdx 1) of a G_EXTRACT: split the wide source into
// NarrowTy parts, take from each part the bit-range that overlaps the
// extracted window, and reassemble the destination from those segments.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
               MIRBuilder, MRI);

  Register OpReg = MI.getOperand(i: 0).getReg();
  uint64_t OpStart = MI.getOperand(i: 2).getImm();
  uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(Elt: SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      // The extract window begins before this part: take from the part's
      // start, but no further than the window's end.
      ExtractOffset = 0;
      SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - SrcStart);
    } else {
      // The window begins inside this part.
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(a: SrcStart + NarrowSize - OpStart, b: OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
      MIRBuilder.buildExtract(Res: SegReg, Src: SrcRegs[i], Index: ExtractOffset);
    }

    DstRegs.push_back(Elt: SegReg);
  }

  // Recombine the collected segments into the destination.
  Register DstReg = MI.getOperand(i: 0).getReg();
  if (MRI.getType(Reg: DstReg).isVector())
    MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
  else
    MIRBuilder.buildCopy(Res: DstReg, Op: DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
7312
// Narrow a G_INSERT (TypeIdx 0): split the wide base value into NarrowTy
// parts, and for each part that overlaps the inserted window, extract the
// overlapping slice of the inserted value and re-insert it into that part.
// Unaffected parts are forwarded unchanged.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
  LLT LeftoverTy;
  extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: SrcRegs,
               LeftoverVRegs&: LeftoverRegs, MIRBuilder, MRI);

  SrcRegs.append(RHS: LeftoverRegs);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(i: 2).getReg();
  uint64_t OpStart = MI.getOperand(i: 3).getImm();
  uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(Elt: OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(Reg: SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
      MIRBuilder.buildAnyExt(Res: SrcReg, Op: SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(Elt: SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      // The inserted window starts before this part: the slice begins at
      // DstStart - OpStart within OpReg and lands at the part's start.
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - DstStart);
    } else {
      // The inserted window starts inside this part.
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
          std::min(a: NarrowSize - InsertOffset, b: OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
      MIRBuilder.buildExtract(Res: SegReg, Src: OpReg, Index: ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
    MIRBuilder.buildInsert(Res: DstReg, Src: SrcReg, Op: SegReg, Index: InsertOffset);
    DstRegs.push_back(Elt: DstReg);
  }

  // If an any-extended leftover part widened the total, merge into a wider
  // scalar and truncate back to the destination width.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(i: 0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: WideSize));
    MIRBuilder.buildMergeLikeInstr(Res: MergeReg, Ops: DstRegs);
    MIRBuilder.buildTrunc(Res: DstReg, Op: MergeReg);
  } else
    MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
7394
7395LegalizerHelper::LegalizeResult
7396LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
7397 LLT NarrowTy) {
7398 Register DstReg = MI.getOperand(i: 0).getReg();
7399 LLT DstTy = MRI.getType(Reg: DstReg);
7400
7401 assert(MI.getNumOperands() == 3 && TypeIdx == 0);
7402
7403 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
7404 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
7405 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
7406 LLT LeftoverTy;
7407 if (!extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
7408 VRegs&: Src0Regs, LeftoverVRegs&: Src0LeftoverRegs, MIRBuilder, MRI))
7409 return UnableToLegalize;
7410
7411 LLT Unused;
7412 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
7413 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
7414 llvm_unreachable("inconsistent extractParts result");
7415
7416 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
7417 auto Inst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
7418 SrcOps: {Src0Regs[I], Src1Regs[I]});
7419 DstRegs.push_back(Elt: Inst.getReg(Idx: 0));
7420 }
7421
7422 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
7423 auto Inst = MIRBuilder.buildInstr(
7424 Opc: MI.getOpcode(),
7425 DstOps: {LeftoverTy}, SrcOps: {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
7426 DstLeftoverRegs.push_back(Elt: Inst.getReg(Idx: 0));
7427 }
7428
7429 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
7430 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
7431
7432 MI.eraseFromParent();
7433 return Legalized;
7434}
7435
7436LegalizerHelper::LegalizeResult
7437LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
7438 LLT NarrowTy) {
7439 if (TypeIdx != 0)
7440 return UnableToLegalize;
7441
7442 auto [DstReg, SrcReg] = MI.getFirst2Regs();
7443
7444 LLT DstTy = MRI.getType(Reg: DstReg);
7445 if (DstTy.isVector())
7446 return UnableToLegalize;
7447
7448 SmallVector<Register, 8> Parts;
7449 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
7450 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, VRegs&: Parts, PadStrategy: MI.getOpcode());
7451 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: Parts);
7452
7453 MI.eraseFromParent();
7454 return Legalized;
7455}
7456
7457LegalizerHelper::LegalizeResult
7458LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
7459 LLT NarrowTy) {
7460 if (TypeIdx != 0)
7461 return UnableToLegalize;
7462
7463 Register CondReg = MI.getOperand(i: 1).getReg();
7464 LLT CondTy = MRI.getType(Reg: CondReg);
7465 if (CondTy.isVector()) // TODO: Handle vselect
7466 return UnableToLegalize;
7467
7468 Register DstReg = MI.getOperand(i: 0).getReg();
7469 LLT DstTy = MRI.getType(Reg: DstReg);
7470
7471 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
7472 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
7473 SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
7474 LLT LeftoverTy;
7475 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
7476 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
7477 return UnableToLegalize;
7478
7479 LLT Unused;
7480 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
7481 VRegs&: Src2Regs, LeftoverVRegs&: Src2LeftoverRegs, MIRBuilder, MRI))
7482 llvm_unreachable("inconsistent extractParts result");
7483
7484 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
7485 auto Select = MIRBuilder.buildSelect(Res: NarrowTy,
7486 Tst: CondReg, Op0: Src1Regs[I], Op1: Src2Regs[I]);
7487 DstRegs.push_back(Elt: Select.getReg(Idx: 0));
7488 }
7489
7490 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
7491 auto Select = MIRBuilder.buildSelect(
7492 Res: LeftoverTy, Tst: CondReg, Op0: Src1LeftoverRegs[I], Op1: Src2LeftoverRegs[I]);
7493 DstLeftoverRegs.push_back(Elt: Select.getReg(Idx: 0));
7494 }
7495
7496 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
7497 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
7498
7499 MI.eraseFromParent();
7500 return Legalized;
7501}
7502
7503LegalizerHelper::LegalizeResult
7504LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
7505 LLT NarrowTy) {
7506 if (TypeIdx != 1)
7507 return UnableToLegalize;
7508
7509 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7510 unsigned NarrowSize = NarrowTy.getSizeInBits();
7511
7512 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7513 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
7514
7515 MachineIRBuilder &B = MIRBuilder;
7516 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7517 // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
7518 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
7519 auto HiIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
7520 Op0: UnmergeSrc.getReg(Idx: 1), Op1: C_0);
7521 auto LoCTLZ = IsUndef ?
7522 B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0)) :
7523 B.buildCTLZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7524 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
7525 auto HiIsZeroCTLZ = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSize);
7526 auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7527 B.buildSelect(Res: DstReg, Tst: HiIsZero, Op0: HiIsZeroCTLZ, Op1: HiCTLZ);
7528
7529 MI.eraseFromParent();
7530 return Legalized;
7531 }
7532
7533 return UnableToLegalize;
7534}
7535
7536LegalizerHelper::LegalizeResult
7537LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
7538 LLT NarrowTy) {
7539 if (TypeIdx != 1)
7540 return UnableToLegalize;
7541
7542 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7543 unsigned NarrowSize = NarrowTy.getSizeInBits();
7544
7545 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7546 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
7547
7548 MachineIRBuilder &B = MIRBuilder;
7549 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7550 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
7551 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
7552 auto LoIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
7553 Op0: UnmergeSrc.getReg(Idx: 0), Op1: C_0);
7554 auto HiCTTZ = IsUndef ?
7555 B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1)) :
7556 B.buildCTTZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7557 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
7558 auto LoIsZeroCTTZ = B.buildAdd(Dst: DstTy, Src0: HiCTTZ, Src1: C_NarrowSize);
7559 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7560 B.buildSelect(Res: DstReg, Tst: LoIsZero, Op0: LoIsZeroCTTZ, Op1: LoCTTZ);
7561
7562 MI.eraseFromParent();
7563 return Legalized;
7564 }
7565
7566 return UnableToLegalize;
7567}
7568
7569LegalizerHelper::LegalizeResult
7570LegalizerHelper::narrowScalarCTLS(MachineInstr &MI, unsigned TypeIdx,
7571 LLT NarrowTy) {
7572 if (TypeIdx != 1)
7573 return UnableToLegalize;
7574
7575 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7576 unsigned NarrowSize = NarrowTy.getSizeInBits();
7577
7578 if (!SrcTy.isScalar() || SrcTy.getSizeInBits() != 2 * NarrowSize)
7579 return UnableToLegalize;
7580
7581 MachineIRBuilder &B = MIRBuilder;
7582
7583 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
7584 Register Lo = UnmergeSrc.getReg(Idx: 0);
7585 Register Hi = UnmergeSrc.getReg(Idx: 1);
7586
7587 auto ShAmt = B.buildConstant(Res: NarrowTy, Val: NarrowSize - 1);
7588 auto Sign = B.buildAShr(Dst: NarrowTy, Src0: Hi, Src1: ShAmt);
7589
7590 auto HiIsSign = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1), Op0: Hi, Op1: Sign);
7591
7592 // Invert Lo if Hi is negative. Then count the leading zeros. If there are no
7593 // leading zeros, then the MSB of Lo is different than the MSB of Hi.
7594 // Otherwise the leading zeros represent additional sign bits of the original
7595 // value.
7596 auto LoInv = B.buildXor(Dst: DstTy, Src0: Lo, Src1: Sign);
7597 auto LoCTLZ = B.buildCTLZ(Dst: DstTy, Src0: LoInv);
7598
7599 // Add NarrowSize-1 to LoCTLZ. This is the full CTLS if Hi is all sign bits.
7600 auto C_NarrowSizeM1 = B.buildConstant(Res: DstTy, Val: NarrowSize - 1);
7601 auto HiIsSignCTLS = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSizeM1);
7602
7603 auto HiCTLS = B.buildCTLS(Dst: DstTy, Src0: Hi);
7604
7605 B.buildSelect(Res: DstReg, Tst: HiIsSign, Op0: HiIsSignCTLS, Op1: HiCTLS);
7606
7607 MI.eraseFromParent();
7608 return Legalized;
7609}
7610
7611LegalizerHelper::LegalizeResult
7612LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
7613 LLT NarrowTy) {
7614 if (TypeIdx != 1)
7615 return UnableToLegalize;
7616
7617 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7618 unsigned NarrowSize = NarrowTy.getSizeInBits();
7619
7620 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7621 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
7622
7623 auto LoCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
7624 auto HiCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
7625 MIRBuilder.buildAdd(Dst: DstReg, Src0: HiCTPOP, Src1: LoCTPOP);
7626
7627 MI.eraseFromParent();
7628 return Legalized;
7629 }
7630
7631 return UnableToLegalize;
7632}
7633
7634LegalizerHelper::LegalizeResult
7635LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
7636 LLT NarrowTy) {
7637 if (TypeIdx != 1)
7638 return UnableToLegalize;
7639
7640 MachineIRBuilder &B = MIRBuilder;
7641 Register ExpReg = MI.getOperand(i: 2).getReg();
7642 LLT ExpTy = MRI.getType(Reg: ExpReg);
7643
7644 unsigned ClampSize = NarrowTy.getScalarSizeInBits();
7645
7646 // Clamp the exponent to the range of the target type.
7647 auto MinExp = B.buildConstant(Res: ExpTy, Val: minIntN(N: ClampSize));
7648 auto ClampMin = B.buildSMax(Dst: ExpTy, Src0: ExpReg, Src1: MinExp);
7649 auto MaxExp = B.buildConstant(Res: ExpTy, Val: maxIntN(N: ClampSize));
7650 auto Clamp = B.buildSMin(Dst: ExpTy, Src0: ClampMin, Src1: MaxExp);
7651
7652 auto Trunc = B.buildTrunc(Res: NarrowTy, Op: Clamp);
7653 Observer.changingInstr(MI);
7654 MI.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
7655 Observer.changedInstr(MI);
7656 return Legalized;
7657}
7658
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // A generic op may be used in an expansion when the target handles it
  // directly (Legal/Custom) or via a library call.
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
    unsigned Len = SrcTy.getScalarSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      // A zero source yields the full bit width.
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    //
    // The or-shift ladder smears the highest set bit into every lower
    // position, so the leading-zero count becomes Len minus the popcount.
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    unsigned Len = SrcTy.getScalarSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      // NOTE(review): the G_CTLZ case above derives the compare result type
      // from SrcTy, this one from DstTy — confirm the two always have
      // matching element counts on this path.
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    // ~x & (x - 1) isolates a mask of the trailing zeros of x.
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Reuse this instruction as a G_CTPOP of the trailing-zero mask.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getScalarSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Bail out on irregular type lengths.
    if (Size > 128 || Size % 8 != 0)
      return UnableToLegalize;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");

    // Avoid the multiply when shift-add is cheaper.
    if (Size == 16 && !Ty.isVector()) {
      // v = (v + (v >> 8)) & 0xFF;
      auto C_8 = B.buildConstant(Ty, 8);
      auto HighSum = B.buildLShr(Ty, B8Count, C_8);
      auto Res = B.buildAdd(Ty, B8Count, HighSum);
      B.buildAnd(MI.getOperand(0).getReg(), Res, B.buildConstant(Ty, 0xFF));
      MI.eraseFromParent();
      return Legalized;
    }

    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);

    auto IsMulSupported = [this](const LLT Ty) {
      auto Action = LI.getAction({TargetOpcode::G_MUL, {Ty}}).Action;
      return Action == Legal || Action == WidenScalar || Action == Custom;
    };
    if (IsMulSupported(Ty)) {
      auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
    } else {
      // Without a usable multiply, accumulate the per-byte counts with a
      // shift-add ladder that doubles the summed width each iteration.
      auto ResTmp = B8Count;
      for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
        auto ShiftC = B.buildConstant(Ty, Shift);
        auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
        ResTmp = B.buildAdd(Ty, ResTmp, Shl);
      }
      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTLS: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    // ctls(x) -> ctlz(x ^ (x >> (N - 1))) - 1
    auto SignIdxC =
        MIRBuilder.buildConstant(SrcTy, SrcTy.getScalarSizeInBits() - 1);
    auto OneC = MIRBuilder.buildConstant(DstTy, 1);

    // x >> (N-1) (arithmetic) broadcasts the sign bit.
    auto Shr = MIRBuilder.buildAShr(SrcTy, SrcReg, SignIdxC);

    // The xor clears all redundant leading sign bits, so ctlz counts them
    // plus the sign bit itself — hence the final subtraction of one.
    auto Xor = MIRBuilder.buildXor(SrcTy, SrcReg, Shr);
    auto Ctlz = MIRBuilder.buildCTLZ(DstTy, Xor);

    MIRBuilder.buildSub(DstReg, Ctlz, OneC);
    MI.eraseFromParent();
    return Legalized;
  }
  }
}
7864
7865// Check that (every element of) Reg is undef or not an exact multiple of BW.
7866static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7867 Register Reg, unsigned BW) {
7868 return matchUnaryPredicate(
7869 MRI, Reg,
7870 Match: [=](const Constant *C) {
7871 // Null constant here means an undef.
7872 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Val: C);
7873 return !CI || CI->getValue().urem(RHS: BW) != 0;
7874 },
7875 /*AllowUndefs*/ true);
7876}
7877
7878LegalizerHelper::LegalizeResult
7879LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7880 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7881 LLT Ty = MRI.getType(Reg: Dst);
7882 LLT ShTy = MRI.getType(Reg: Z);
7883
7884 unsigned BW = Ty.getScalarSizeInBits();
7885
7886 if (!isPowerOf2_32(Value: BW))
7887 return UnableToLegalize;
7888
7889 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7890 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7891
7892 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7893 // fshl X, Y, Z -> fshr X, Y, -Z
7894 // fshr X, Y, Z -> fshl X, Y, -Z
7895 auto Zero = MIRBuilder.buildConstant(Res: ShTy, Val: 0);
7896 Z = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: Z).getReg(Idx: 0);
7897 } else {
7898 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7899 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
7900 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7901 if (IsFSHL) {
7902 Y = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7903 X = MIRBuilder.buildLShr(Dst: Ty, Src0: X, Src1: One).getReg(Idx: 0);
7904 } else {
7905 X = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
7906 Y = MIRBuilder.buildShl(Dst: Ty, Src0: Y, Src1: One).getReg(Idx: 0);
7907 }
7908
7909 Z = MIRBuilder.buildNot(Dst: ShTy, Src0: Z).getReg(Idx: 0);
7910 }
7911
7912 MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Dst}, SrcOps: {X, Y, Z});
7913 MI.eraseFromParent();
7914 return Legalized;
7915}
7916
7917LegalizerHelper::LegalizeResult
7918LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
7919 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7920 LLT Ty = MRI.getType(Reg: Dst);
7921 LLT ShTy = MRI.getType(Reg: Z);
7922
7923 const unsigned BW = Ty.getScalarSizeInBits();
7924 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7925
7926 Register ShX, ShY;
7927 Register ShAmt, InvShAmt;
7928
7929 // FIXME: Emit optimized urem by constant instead of letting it expand later.
7930 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
7931 // fshl: X << C | Y >> (BW - C)
7932 // fshr: X << (BW - C) | Y >> C
7933 // where C = Z % BW is not zero
7934 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
7935 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
7936 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: BitWidthC, Src1: ShAmt).getReg(Idx: 0);
7937 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: IsFSHL ? ShAmt : InvShAmt).getReg(Idx: 0);
7938 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: IsFSHL ? InvShAmt : ShAmt).getReg(Idx: 0);
7939 } else {
7940 // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
7941 // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
7942 auto Mask = MIRBuilder.buildConstant(Res: ShTy, Val: BW - 1);
7943 if (isPowerOf2_32(Value: BW)) {
7944 // Z % BW -> Z & (BW - 1)
7945 ShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: Z, Src1: Mask).getReg(Idx: 0);
7946 // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
7947 auto NotZ = MIRBuilder.buildNot(Dst: ShTy, Src0: Z);
7948 InvShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: NotZ, Src1: Mask).getReg(Idx: 0);
7949 } else {
7950 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
7951 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
7952 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: Mask, Src1: ShAmt).getReg(Idx: 0);
7953 }
7954
7955 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
7956 if (IsFSHL) {
7957 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: ShAmt).getReg(Idx: 0);
7958 auto ShY1 = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: One);
7959 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: ShY1, Src1: InvShAmt).getReg(Idx: 0);
7960 } else {
7961 auto ShX1 = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: One);
7962 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: ShX1, Src1: InvShAmt).getReg(Idx: 0);
7963 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: ShAmt).getReg(Idx: 0);
7964 }
7965 }
7966
7967 MIRBuilder.buildOr(Dst, Src0: ShX, Src1: ShY, Flags: MachineInstr::Disjoint);
7968 MI.eraseFromParent();
7969 return Legalized;
7970}
7971
7972LegalizerHelper::LegalizeResult
7973LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
7974 // These operations approximately do the following (while avoiding undefined
7975 // shifts by BW):
7976 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
7977 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
7978 Register Dst = MI.getOperand(i: 0).getReg();
7979 LLT Ty = MRI.getType(Reg: Dst);
7980 LLT ShTy = MRI.getType(Reg: MI.getOperand(i: 3).getReg());
7981
7982 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7983 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7984
7985 // TODO: Use smarter heuristic that accounts for vector legalization.
7986 if (LI.getAction(Query: {RevOpcode, {Ty, ShTy}}).Action == Lower)
7987 return lowerFunnelShiftAsShifts(MI);
7988
7989 // This only works for powers of 2, fallback to shifts if it fails.
7990 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
7991 if (Result == UnableToLegalize)
7992 return lowerFunnelShiftAsShifts(MI);
7993 return Result;
7994}
7995
LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  uint32_t DstTySize = DstTy.getSizeInBits();
  uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
  uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();

  // All widths must be powers of 2 so the halving below divides evenly.
  if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
      !isPowerOf2_32(SrcTyScalarSize))
    return UnableToLegalize;

  // The step between extend is too large, split it by creating an intermediate
  // extend instruction
  if (SrcTyScalarSize * 2 < DstTyScalarSize) {
    // NOTE(review): the changeElementCount/divideCoefficientBy calls below
    // assume vector types — confirm callers never reach this path with
    // scalar Src/Dst.
    //
    // First extend only one step (doubling the element size) ...
    LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
    // If the destination type is illegal, split it into multiple statements
    // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
    auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
    // ... then unmerge the intermediate into two half-length vectors ...
    LLT EltTy = MidTy.changeElementCount(
        MidTy.getElementCount().divideCoefficientBy(2));
    auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);

    // ... recursively extend each half to the full destination element size
    // (using this instruction's own extension opcode) ...
    LLT ZExtResTy = DstTy.changeElementCount(
        DstTy.getElementCount().divideCoefficientBy(2));
    auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(0)});
    auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(1)});

    // ... and merge the two halves back into the destination.
    MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
8037
LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);

  // Only power-of-2 vector shapes are handled so every halving below is exact.
  if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
      isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
      isPowerOf2_32(SrcTy.getNumElements()) &&
      isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
    // Split input type.
    LLT SplitSrcTy = SrcTy.changeElementCount(
        SrcTy.getElementCount().divideCoefficientBy(2));

    // First, split the source into two smaller vectors.
    SmallVector<Register, 2> SplitSrcs;
    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);

    // Truncate the splits into intermediate narrower elements. Each step at
    // most halves the element size; if one halving does not reach the final
    // size, a second G_TRUNC below finishes the job (recursing through
    // legalization as needed).
    LLT InterTy;
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
    else
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
    for (Register &Src : SplitSrcs)
      Src = MIRBuilder.buildTrunc(InterTy, Src).getReg(0);

    // Combine the new truncates into one vector
    auto Merge = MIRBuilder.buildMergeLikeInstr(
        DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);

    // Truncate the new vector to the final result type; when the intermediate
    // already has the destination element size, a plain copy suffices.
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
    else
      MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));

    MI.eraseFromParent();

    return Legalized;
  }
  return UnableToLegalize;
}
8093
8094LegalizerHelper::LegalizeResult
8095LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
8096 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
8097 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
8098 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
8099 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
8100 auto Neg = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
8101 MIRBuilder.buildInstr(Opc: RevRot, DstOps: {Dst}, SrcOps: {Src, Neg});
8102 MI.eraseFromParent();
8103 return Legalized;
8104}
8105
// Lower G_ROTL/G_ROTR, preferring (in order): the reverse-direction rotate,
// a funnel shift, and finally an expansion into two shifts and a disjoint OR.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it. Negating the
  // amount only yields an equivalent rotate for power-of-2 element widths.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom(Query: {RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(Value: EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it: a rotate is a funnel shift with
  // both inputs equal to Src.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom(Query: {FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom(Query: {RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, DstOps: {R1}, SrcOps: {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // Prefer the same-direction funnel shift. Otherwise fall back to the
    // reverse-direction one with a negated amount, which is only valid for
    // power-of-2 element widths.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(Value: EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(Dst: DstTy, Src0: Amt).getReg(Idx: 0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  // Generic expansion: OR together a shift in the rotate direction and a
  // shift of the remaining bits in the opposite direction.
  auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(Value: EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
    auto ShAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: Amt, Src1: BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
    auto RevAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: NegAmt, Src1: BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, RevAmt}).getReg(Idx: 0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    // The extra shift by 1 keeps the reverse shift amount strictly below the
    // bit width even when (c % w) == 0.
    auto BitWidthC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(Dst: AmtTy, Src0: Amt, Src1: BitWidthC);
    ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
    auto RevAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: BitWidthMinusOneC, Src1: ShAmt);
    auto One = MIRBuilder.buildConstant(Res: AmtTy, Val: 1);
    auto Inner = MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Inner, RevAmt}).getReg(Idx: 0);
  }
  // The two shift results occupy disjoint bit ranges, so the OR is disjoint.
  MIRBuilder.buildOr(Dst, Src0: ShVal, Src1: RevShiftVal, Flags: MachineInstr::Disjoint);
  MI.eraseFromParent();
  return Legalized;
}
8172
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S1 = LLT::scalar(SizeInBits: 1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(Res: S32, Val: 0);
  auto Zero64 = MIRBuilder.buildConstant(Res: S64, Val: 0);

  // Count leading zeros to locate the most significant set bit.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: S32, Src0: Src);

  // Biased exponent: IEEE f32 bias (127) plus the position (63 - lz) of the
  // leading one.
  auto K = MIRBuilder.buildConstant(Res: S32, Val: 127U + 63U);
  auto Sub = MIRBuilder.buildSub(Dst: S32, Src0: K, Src1: LZ);

  // CTLZ_ZERO_UNDEF is undefined for a zero input, so force the exponent to
  // zero explicitly in that case (giving the float +0.0).
  auto NotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: Src, Op1: Zero64);
  auto E = MIRBuilder.buildSelect(Res: S32, Tst: NotZero, Op0: Sub, Op1: Zero32);

  // Normalize: shift the leading one up to bit 63, then mask it off (it is
  // the implicit bit of the float encoding).
  auto Mask0 = MIRBuilder.buildConstant(Res: S64, Val: (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(Dst: S64, Src0: Src, Src1: LZ);

  auto U = MIRBuilder.buildAnd(Dst: S64, Src0: ShlLZ, Src1: Mask0);

  // T = the low 40 bits that will be discarded from the mantissa; they drive
  // the rounding decision below.
  auto Mask1 = MIRBuilder.buildConstant(Res: S64, Val: 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(Dst: S64, Src0: U, Src1: Mask1);

  // V = (E << 23) | (top 23 mantissa bits): the unrounded f32 bit pattern.
  auto UShl = MIRBuilder.buildLShr(Dst: S64, Src0: U, Src1: MIRBuilder.buildConstant(Res: S64, Val: 40));
  auto ShlE = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 23));
  auto V = MIRBuilder.buildOr(Dst: S32, Src0: ShlE, Src1: MIRBuilder.buildTrunc(Res: S32, Op: UShl));

  // Round to nearest, ties to even: increment when the discarded bits exceed
  // the halfway point (0x80_0000_0000), or equal it while the kept mantissa
  // is odd.
  auto C = MIRBuilder.buildConstant(Res: S64, Val: 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: S1, Op0: T, Op1: C);
  auto TCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: T, Op1: C);
  auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);

  auto VTrunc1 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: One);
  auto Select0 = MIRBuilder.buildSelect(Res: S32, Tst: TCmp, Op0: VTrunc1, Op1: Zero32);
  auto R = MIRBuilder.buildSelect(Res: S32, Tst: RCmp, Op0: One, Op1: Select0);
  MIRBuilder.buildAdd(Dst, Src0: V, Src1: R);

  MI.eraseFromParent();
  return Legalized;
}
8230
// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
// operations and G_SITOFP
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  const LLT S1 = LLT::scalar(SizeInBits: 1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // If the value fits in a signed i64 (i.e. Src <= INT64_MAX) we simply
  // reuse SITOFP.
  // Otherwise, divide i64 by 2, round the result by ORing with the lowest
  // bit saved before division (so the final f32 rounding still sees whether
  // any precision was lost), convert to float by SITOFP, and multiply the
  // result by 2.
  auto One = MIRBuilder.buildConstant(Res: S64, Val: 1);
  auto Zero = MIRBuilder.buildConstant(Res: S64, Val: 0);
  // Result if Src <= INT64_MAX
  auto SmallResult = MIRBuilder.buildSITOFP(Dst: S32, Src0: Src);
  // Result if Src > INT64_MAX
  auto Halved = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: One);
  auto LowerBit = MIRBuilder.buildAnd(Dst: S64, Src0: Src, Src1: One);
  auto RoundedHalved = MIRBuilder.buildOr(Dst: S64, Src0: Halved, Src1: LowerBit);
  auto HalvedFP = MIRBuilder.buildSITOFP(Dst: S32, Src0: RoundedHalved);
  auto LargeResult = MIRBuilder.buildFAdd(Dst: S32, Src0: HalvedFP, Src1: HalvedFP);
  // Src > INT64_MAX exactly when its sign bit is set, which a signed
  // compare against zero detects; pick the matching conversion.
  auto IsLarge =
      MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_SLT, Res: S1, Op0: Src, Op1: Zero);
  MIRBuilder.buildSelect(Res: Dst, Tst: IsLarge, Op0: LargeResult, Op1: SmallResult);

  MI.eraseFromParent();
  return Legalized;
}
8265
// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
// IEEE double representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  // We create double value from 32 bit parts with 32 exponent difference.
  // Note that + and - are float operations that adjust the implicit leading
  // one, the bases 2^52 and 2^84 are for illustrative purposes.
  //
  // X = 2^52 * 1.0...LowBits
  // Y = 2^84 * 1.0...HighBits
  // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
  //         = - 2^52 * 1.0...HighBits
  // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
  // 0x4330... is the f64 bit pattern of 2^52, 0x4530... of 2^84, and
  // 0x4530000000100000 is 2^84 + 2^52 (both constants folded into one FSub).
  auto TwoP52 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4330000000000000));
  auto TwoP84 = MIRBuilder.buildConstant(Res: S64, UINT64_C(0x4530000000000000));
  auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
  auto TwoP52P84FP = MIRBuilder.buildFConstant(Res: S64, Val: TwoP52P84);
  auto HalfWidth = MIRBuilder.buildConstant(Res: S64, Val: 32);

  // OR the low/high source halves straight into the (all-zero) mantissas of
  // 2^52 and 2^84; the integer OR constructs the float bit patterns exactly.
  auto LowBits = MIRBuilder.buildTrunc(Res: S32, Op: Src);
  LowBits = MIRBuilder.buildZExt(Res: S64, Op: LowBits);
  auto LowBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP52, Src1: LowBits);
  auto HighBits = MIRBuilder.buildLShr(Dst: S64, Src0: Src, Src1: HalfWidth);
  auto HighBitsFP = MIRBuilder.buildOr(Dst: S64, Src0: TwoP84, Src1: HighBits);
  auto Scratch = MIRBuilder.buildFSub(Dst: S64, Src0: HighBitsFP, Src1: TwoP52P84FP);
  MIRBuilder.buildFAdd(Dst, Src0: Scratch, Src1: LowBitsFP);

  MI.eraseFromParent();
  return Legalized;
}
8302
8303/// i64->fp16 itofp can be lowered to i64->f64,f64->f32,f32->f16. We cannot
8304/// convert fpround f64->f16 without double-rounding, so we manually perform the
8305/// lowering here where we know it is valid.
8306static LegalizerHelper::LegalizeResult
8307loweri64tof16ITOFP(MachineInstr &MI, Register Dst, LLT DstTy, Register Src,
8308 LLT SrcTy, MachineIRBuilder &MIRBuilder) {
8309 auto M1 = MI.getOpcode() == TargetOpcode::G_UITOFP
8310 ? MIRBuilder.buildUITOFP(Dst: SrcTy, Src0: Src)
8311 : MIRBuilder.buildSITOFP(Dst: SrcTy, Src0: Src);
8312 LLT S32Ty = SrcTy.changeElementSize(NewEltSize: 32);
8313 auto M2 = MIRBuilder.buildFPTrunc(Res: S32Ty, Op: M1);
8314 MIRBuilder.buildFPTrunc(Res: Dst, Op: M2);
8315 MI.eraseFromParent();
8316 return LegalizerHelper::Legalized;
8317}
8318
8319LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
8320 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8321
8322 if (SrcTy == LLT::scalar(SizeInBits: 1)) {
8323 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: 1.0);
8324 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
8325 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
8326 MI.eraseFromParent();
8327 return Legalized;
8328 }
8329
8330 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
8331 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
8332
8333 if (SrcTy != LLT::scalar(SizeInBits: 64))
8334 return UnableToLegalize;
8335
8336 if (DstTy == LLT::scalar(SizeInBits: 32))
8337 // TODO: SelectionDAG has several alternative expansions to port which may
8338 // be more reasonable depending on the available instructions. We also need
8339 // a more advanced mechanism to choose an optimal version depending on
8340 // target features such as sitofp or CTLZ availability.
8341 return lowerU64ToF32WithSITOFP(MI);
8342
8343 if (DstTy == LLT::scalar(SizeInBits: 64))
8344 return lowerU64ToF64BitFloatOps(MI);
8345
8346 return UnableToLegalize;
8347}
8348
8349LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
8350 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8351
8352 const LLT S64 = LLT::scalar(SizeInBits: 64);
8353 const LLT S32 = LLT::scalar(SizeInBits: 32);
8354 const LLT S1 = LLT::scalar(SizeInBits: 1);
8355
8356 if (SrcTy == S1) {
8357 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: -1.0);
8358 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
8359 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
8360 MI.eraseFromParent();
8361 return Legalized;
8362 }
8363
8364 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
8365 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
8366
8367 if (SrcTy != S64)
8368 return UnableToLegalize;
8369
8370 if (DstTy == S32) {
8371 // signed cl2f(long l) {
8372 // long s = l >> 63;
8373 // float r = cul2f((l + s) ^ s);
8374 // return s ? -r : r;
8375 // }
8376 Register L = Src;
8377 auto SignBit = MIRBuilder.buildConstant(Res: S64, Val: 63);
8378 auto S = MIRBuilder.buildAShr(Dst: S64, Src0: L, Src1: SignBit);
8379
8380 auto LPlusS = MIRBuilder.buildAdd(Dst: S64, Src0: L, Src1: S);
8381 auto Xor = MIRBuilder.buildXor(Dst: S64, Src0: LPlusS, Src1: S);
8382 auto R = MIRBuilder.buildUITOFP(Dst: S32, Src0: Xor);
8383
8384 auto RNeg = MIRBuilder.buildFNeg(Dst: S32, Src0: R);
8385 auto SignNotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: S,
8386 Op1: MIRBuilder.buildConstant(Res: S64, Val: 0));
8387 MIRBuilder.buildSelect(Res: Dst, Tst: SignNotZero, Op0: RNeg, Op1: R);
8388 MI.eraseFromParent();
8389 return Legalized;
8390 }
8391
8392 return UnableToLegalize;
8393}
8394
8395LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
8396 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
8397 const LLT S64 = LLT::scalar(SizeInBits: 64);
8398 const LLT S32 = LLT::scalar(SizeInBits: 32);
8399
8400 if (SrcTy != S64 && SrcTy != S32)
8401 return UnableToLegalize;
8402 if (DstTy != S32 && DstTy != S64)
8403 return UnableToLegalize;
8404
8405 // FPTOSI gives same result as FPTOUI for positive signed integers.
8406 // FPTOUI needs to deal with fp values that convert to unsigned integers
8407 // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
8408
8409 APInt TwoPExpInt = APInt::getSignMask(BitWidth: DstTy.getSizeInBits());
8410 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
8411 : APFloat::IEEEdouble(),
8412 APInt::getZero(numBits: SrcTy.getSizeInBits()));
8413 TwoPExpFP.convertFromAPInt(Input: TwoPExpInt, IsSigned: false, RM: APFloat::rmNearestTiesToEven);
8414
8415 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src);
8416
8417 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(Res: SrcTy, Val: TwoPExpFP);
8418 // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
8419 // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
8420 MachineInstrBuilder FSub = MIRBuilder.buildFSub(Dst: SrcTy, Src0: Src, Src1: Threshold);
8421 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: FSub);
8422 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(Res: DstTy, Val: TwoPExpInt);
8423 MachineInstrBuilder Res = MIRBuilder.buildXor(Dst: DstTy, Src0: ResLowBits, Src1: ResHighBit);
8424
8425 const LLT S1 = LLT::scalar(SizeInBits: 1);
8426
8427 MachineInstrBuilder FCMP =
8428 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: S1, Op0: Src, Op1: Threshold);
8429 MIRBuilder.buildSelect(Res: Dst, Tst: FCMP, Op0: FPTOSI, Op1: Res);
8430
8431 MI.eraseFromParent();
8432 return Legalized;
8433}
8434
// Expand G_FPTOSI by decoding the IEEE float fields with integer operations.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(SizeInBits: 64);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Extract the biased exponent (bits 23..30 of the f32 bit pattern).
  auto ExponentMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(Res: SrcTy, Val: 23);

  auto AndExpMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(Dst: SrcTy, Src0: AndExpMask, Src1: ExponentLoBit);

  // Broadcast the sign bit across the destination: 0 for positive inputs,
  // all-ones for negative (via arithmetic shift then sign extension).
  auto SignMask = MIRBuilder.buildConstant(Res: SrcTy,
                                           Val: APInt::getSignMask(BitWidth: SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(Res: SrcTy, Val: SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(Dst: SrcTy, Src0: AndSignMask, Src1: SignLowBit);
  Sign = MIRBuilder.buildSExt(Res: DstTy, Op: Sign);

  // Recover the full significand: the 23 stored mantissa bits plus the
  // implicit leading one (0x00800000).
  auto MantissaMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: MantissaMask);
  auto K = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x00800000);

  auto R = MIRBuilder.buildOr(Dst: SrcTy, Src0: AndMantissaMask, Src1: K);
  R = MIRBuilder.buildZExt(Res: DstTy, Op: R);

  // Unbias the exponent (f32 bias is 127) and compute the shift distances of
  // the significand relative to its 23 fractional bits, in both directions.
  auto Bias = MIRBuilder.buildConstant(Res: SrcTy, Val: 127);
  auto Exponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentBits, Src1: Bias);
  auto SubExponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: Exponent, Src1: ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentLoBit, Src1: Exponent);

  auto Shl = MIRBuilder.buildShl(Dst: DstTy, Src0: R, Src1: SubExponent);
  auto Srl = MIRBuilder.buildLShr(Dst: DstTy, Src0: R, Src1: ExponentSub);

  // Shift left when the exponent exceeds the fractional width, else right.
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  auto CmpGt = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT,
                                    Res: S1, Op0: Exponent, Op1: ExponentLoBit);

  R = MIRBuilder.buildSelect(Res: DstTy, Tst: CmpGt, Op0: Shl, Op1: Srl);

  // Apply the sign: (R ^ Sign) - Sign negates R when Sign is all-ones and is
  // a no-op when Sign is zero.
  auto XorSign = MIRBuilder.buildXor(Dst: DstTy, Src0: R, Src1: Sign);
  auto Ret = MIRBuilder.buildSub(Dst: DstTy, Src0: XorSign, Src1: Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);

  // A negative unbiased exponent means |Src| < 1, which truncates to 0.
  auto ExponentLt0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT,
                                          Res: S1, Op0: Exponent, Op1: ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
  MIRBuilder.buildSelect(Res: Dst, Tst: ExponentLt0, Op0: ZeroDstTy, Op1: Ret);

  MI.eraseFromParent();
  return Legalized;
}
8498
// Lower G_FPTOSI_SAT/G_FPTOUI_SAT: a float-to-int conversion that saturates
// to the destination's min/max instead of being undefined out of range, and
// maps NaN to zero.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
  unsigned SatWidth = DstTy.getScalarSizeInBits();

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(numBits: SatWidth);
    MaxInt = APInt::getSignedMaxValue(numBits: SatWidth);
  } else {
    MinInt = APInt::getMinValue(numBits: SatWidth);
    MaxInt = APInt::getMaxValue(numBits: SatWidth);
  }

  const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
  APFloat MinFloat(Semantics);
  APFloat MaxFloat(Semantics);

  // If either bound is inexact in the source format we must not clamp with
  // it directly; track that via the conversion status.
  APFloat::opStatus MinStatus =
      MinFloat.convertFromAPInt(Input: MinInt, IsSigned, RM: APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus =
      MaxFloat.convertFromAPInt(Input: MaxInt, IsSigned, RM: APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
                             !(MaxStatus & APFloat::opStatus::opInexact);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
  // and selects.
  if (AreExactFloatBounds) {
    // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
    auto MaxC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat);
    auto MaxP = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT,
                                     Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: MaxC);
    auto Max = MIRBuilder.buildSelect(Res: SrcTy, Tst: MaxP, Op0: Src, Op1: MaxC);
    // Clamp by MaxFloat from above. NaN cannot occur.
    auto MinC = MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat);
    auto MinP =
        MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Max,
                             Op1: MinC, Flags: MachineInstr::FmNoNans);
    auto Min =
        MIRBuilder.buildSelect(Res: SrcTy, Tst: MinP, Op0: Max, Op1: MinC, Flags: MachineInstr::FmNoNans);
    // Convert clamped value to integer. In the unsigned case we're done,
    // because we mapped NaN to MinFloat, which will cast to zero.
    if (!IsSigned) {
      MIRBuilder.buildFPTOUI(Dst, Src0: Min);
      MI.eraseFromParent();
      return Legalized;
    }

    // Otherwise, select 0 if Src is NaN.
    auto FpToInt = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Min);
    auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO,
                                       Res: DstTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: Src);
    MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0),
                           Op1: FpToInt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Result of direct conversion. The assumption here is that the operation is
  // non-trapping and it's fine to apply it to an out-of-range value if we
  // select it away later.
  auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src)
                          : MIRBuilder.buildFPTOUI(Dst: DstTy, Src0: Src);

  // If Src ULT MinFloat, select MinInt. In particular, this also selects
  // MinInt if Src is NaN.
  auto ULT =
      MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src,
                           Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MinFloat));
  auto Max = MIRBuilder.buildSelect(
      Res: DstTy, Tst: ULT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MinInt), Op1: FpToInt);
  // If Src OGT MaxFloat, select MaxInt.
  auto OGT =
      MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: Src,
                           Op1: MIRBuilder.buildFConstant(Res: SrcTy, Val: MaxFloat));

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero.
  if (!IsSigned) {
    MIRBuilder.buildSelect(Res: Dst, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt),
                           Op1: Max);
    MI.eraseFromParent();
    return Legalized;
  }

  // Otherwise, select 0 if Src is NaN.
  auto Min = MIRBuilder.buildSelect(
      Res: DstTy, Tst: OGT, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: MaxInt), Op1: Max);
  auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_UNO,
                                     Res: DstTy.changeElementSize(NewEltSize: 1), Op0: Src, Op1: Src);
  MIRBuilder.buildSelect(Res: Dst, Tst: IsZero, Op0: MIRBuilder.buildConstant(Res: DstTy, Val: 0), Op1: Min);
  MI.eraseFromParent();
  return Legalized;
}
8598
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  const LLT S1 = LLT::scalar(SizeInBits: 1);
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  auto [Dst, Src] = MI.getFirst2Regs();
  assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
         MRI.getType(Src).getScalarType() == LLT::scalar(64));

  if (MRI.getType(Reg: Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  // With the afn (approximate functions) flag, double rounding through f32
  // is acceptable, so just use two fptruncs.
  if (MI.getFlag(Flag: MachineInstr::FmAfn)) {
    unsigned Flags = MI.getFlags();
    auto Src32 = MIRBuilder.buildFPTrunc(Res: S32, Op: Src, Flags);
    MIRBuilder.buildFPTrunc(Res: Dst, Op: Src32, Flags);
    MI.eraseFromParent();
    return Legalized;
  }

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  // Split the f64 bits into low (U) and high (UH) 32-bit halves; the high
  // half carries the sign, exponent, and top mantissa bits.
  auto Unmerge = MIRBuilder.buildUnmerge(Res: S32, Op: Src);
  Register U = Unmerge.getReg(Idx: 0);
  Register UH = Unmerge.getReg(Idx: 1);

  // E = biased f64 exponent (bits 20..30 of the high word).
  auto E = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 20));
  E = MIRBuilder.buildAnd(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
      Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: -ExpBiasf64 + ExpBiasf16));

  // M = kept mantissa bits, positioned with room below for round/sticky bits.
  auto M = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 8));
  M = MIRBuilder.buildAnd(Dst: S32, Src0: M, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0xffe));

  // Collapse every discarded low mantissa bit into a single sticky bit.
  auto MaskedSig = MIRBuilder.buildAnd(Dst: S32, Src0: UH,
                                       Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1ff));
  MaskedSig = MIRBuilder.buildOr(Dst: S32, Src0: MaskedSig, Src1: U);

  auto Zero = MIRBuilder.buildConstant(Res: S32, Val: 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: MaskedSig, Op1: Zero);
  auto Lo40Set = MIRBuilder.buildZExt(Res: S32, Op: SigCmpNE0);
  M = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: Lo40Set);

  // I = inf/NaN result: (M != 0 ? 0x0200 : 0) | 0x7c00;
  // a nonzero mantissa yields a quiet NaN, zero yields infinity.
  auto Bits0x200 = MIRBuilder.buildConstant(Res: S32, Val: 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: M, Op1: Zero);
  auto SelectCC = MIRBuilder.buildSelect(Res: S32, Tst: CmpM_NE0, Op0: Bits0x200, Op1: Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(Res: S32, Val: 0x7c00);
  auto I = MIRBuilder.buildOr(Dst: S32, Src0: SelectCC, Src1: Bits0x7c00);

  // N = M | (E << 12); the pre-rounding encoding for normal results.
  auto EShl12 = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 12));
  auto N = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: EShl12);

  // B = clamp(1-E, 0, 13); how far a denormal result must be shifted down.
  auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
  auto OneSubExp = MIRBuilder.buildSub(Dst: S32, Src0: One, Src1: E);
  auto B = MIRBuilder.buildSMax(Dst: S32, Src0: OneSubExp, Src1: Zero);
  B = MIRBuilder.buildSMin(Dst: S32, Src0: B, Src1: MIRBuilder.buildConstant(Res: S32, Val: 13));

  // Make the implicit leading one explicit before the denormal shift.
  auto SigSetHigh = MIRBuilder.buildOr(Dst: S32, Src0: M,
                                       Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1000));

  auto D = MIRBuilder.buildLShr(Dst: S32, Src0: SigSetHigh, Src1: B);
  auto D0 = MIRBuilder.buildShl(Dst: S32, Src0: D, Src1: B);

  // Shift back up and compare to detect whether any set bits were dropped;
  // if so, OR in a sticky bit for correct rounding.
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1,
                                               Op0: D0, Op1: SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(Res: S32, Op: D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(Dst: S32, Src0: D, Src1: D1);

  // Pick the denormal encoding when E < 1, otherwise the normal one.
  auto CmpELtOne = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: E, Op1: One);
  auto V = MIRBuilder.buildSelect(Res: S32, Tst: CmpELtOne, Op0: D, Op1: N);

  // Round to nearest even using the low 3 bits (guard/round/sticky):
  // increment when low3 > 5 (above halfway) or low3 == 3 (halfway, odd LSB).
  auto VLow3 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 7));
  V = MIRBuilder.buildLShr(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: VLow3,
                                       Op1: MIRBuilder.buildConstant(Res: S32, Val: 3));
  auto V0 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: VLow3,
                                       Op1: MIRBuilder.buildConstant(Res: S32, Val: 5));
  auto V1 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Gt5);

  V1 = MIRBuilder.buildOr(Dst: S32, Src0: V0, Src1: V1);
  V = MIRBuilder.buildAdd(Dst: S32, Src0: V, Src1: V1);

  // Exponents above 30 overflow f16; clamp the result to infinity (0x7c00).
  auto CmpEGt30 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1,
                                       Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 30));
  V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt30,
                             Op0: MIRBuilder.buildConstant(Res: S32, Val: 0x7c00), Op1: V);

  // A rebiased exponent of 1039 (= 0x7ff - 1023 + 15) means the source was
  // infinity or NaN; substitute the I encoding built above.
  auto CmpEGt1039 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1,
                                         Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 1039));
  V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt1039, Op0: I, Op1: V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 16));
  Sign = MIRBuilder.buildAnd(Dst: S32, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(Dst: S32, Src0: Sign, Src1: V);

  MIRBuilder.buildTrunc(Res: Dst, Op: V);
  MI.eraseFromParent();
  return Legalized;
}
8714
8715LegalizerHelper::LegalizeResult
8716LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
8717 auto [DstTy, SrcTy] = MI.getFirst2LLTs();
8718 const LLT S64 = LLT::scalar(SizeInBits: 64);
8719 const LLT S16 = LLT::scalar(SizeInBits: 16);
8720
8721 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
8722 return lowerFPTRUNC_F64_TO_F16(MI);
8723
8724 return UnableToLegalize;
8725}
8726
8727LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
8728 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8729 LLT Ty = MRI.getType(Reg: Dst);
8730
8731 auto CvtSrc1 = MIRBuilder.buildSITOFP(Dst: Ty, Src0: Src1);
8732 MIRBuilder.buildFPow(Dst, Src0, Src1: CvtSrc1, Flags: MI.getFlags());
8733 MI.eraseFromParent();
8734 return Legalized;
8735}
8736
8737LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMODF(MachineInstr &MI) {
8738 auto [DstFrac, DstInt, Src] = MI.getFirst3Regs();
8739 LLT Ty = MRI.getType(Reg: Src);
8740 auto Flags = MI.getFlags();
8741
8742 auto IntPart = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: Src, Flags);
8743 auto FracPart = MIRBuilder.buildFSub(Dst: Ty, Src0: Src, Src1: IntPart, Flags);
8744
8745 Register FracToUse;
8746 if (MI.getFlag(Flag: MachineInstr::FmNoInfs)) {
8747 FracToUse = FracPart.getReg(Idx: 0);
8748 } else {
8749 auto Abs = MIRBuilder.buildFAbs(Dst: Ty, Src0: Src, Flags);
8750 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: Ty.getScalarType());
8751 auto Inf = MIRBuilder.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: Semantics));
8752 auto IsInf = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OEQ,
8753 Res: Ty.changeElementSize(NewEltSize: 1), Op0: Abs, Op1: Inf);
8754 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
8755 auto Select = MIRBuilder.buildSelect(Res: Ty, Tst: IsInf, Op0: Zero, Op1: FracPart);
8756 FracToUse = Select.getReg(Idx: 0);
8757 }
8758
8759 MIRBuilder.buildFCopysign(Dst: DstFrac, Src0: FracToUse, Src1: Src, Flags);
8760 MIRBuilder.buildCopy(Res: DstInt, Op: IntPart.getReg(Idx: 0));
8761
8762 MI.eraseFromParent();
8763 return Legalized;
8764}
8765
8766static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
8767 switch (Opc) {
8768 case TargetOpcode::G_SMIN:
8769 return CmpInst::ICMP_SLT;
8770 case TargetOpcode::G_SMAX:
8771 return CmpInst::ICMP_SGT;
8772 case TargetOpcode::G_UMIN:
8773 return CmpInst::ICMP_ULT;
8774 case TargetOpcode::G_UMAX:
8775 return CmpInst::ICMP_UGT;
8776 default:
8777 llvm_unreachable("not in integer min/max");
8778 }
8779}
8780
8781LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
8782 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8783
8784 const CmpInst::Predicate Pred = minMaxToCompare(Opc: MI.getOpcode());
8785 LLT CmpType = MRI.getType(Reg: Dst).changeElementType(NewEltTy: LLT::scalar(SizeInBits: 1));
8786
8787 auto Cmp = MIRBuilder.buildICmp(Pred, Res: CmpType, Op0: Src0, Op1: Src1);
8788 MIRBuilder.buildSelect(Res: Dst, Tst: Cmp, Op0: Src0, Op1: Src1);
8789
8790 MI.eraseFromParent();
8791 return Legalized;
8792}
8793
// Lower a three-way compare (G_SCMP/G_UCMP) producing -1/0/1, either with
// nested selects or with boolean arithmetic, depending on how the target
// represents boolean values.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
  GSUCmp *Cmp = cast<GSUCmp>(Val: &MI);

  Register Dst = Cmp->getReg(Idx: 0);
  LLT DstTy = MRI.getType(Reg: Dst);
  LLT SrcTy = MRI.getType(Reg: Cmp->getReg(Idx: 1));
  LLT CmpTy = DstTy.changeElementSize(NewEltSize: 1);

  // Signed vs unsigned ordering predicates.
  CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SLT
                                       : CmpInst::Predicate::ICMP_ULT;
  CmpInst::Predicate GTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SGT
                                       : CmpInst::Predicate::ICMP_UGT;

  auto Zero = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
  auto IsGT = MIRBuilder.buildICmp(Pred: GTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
                                   Op1: Cmp->getRHSReg());
  auto IsLT = MIRBuilder.buildICmp(Pred: LTPredicate, Res: CmpTy, Op0: Cmp->getLHSReg(),
                                   Op1: Cmp->getRHSReg());

  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
  auto BC = TLI.getBooleanContents(isVec: DstTy.isVector(), /*isFP=*/isFloat: false);
  if (TLI.preferSelectsOverBooleanArithmetic(
          VT: getApproximateEVTForLLT(Ty: SrcTy, Ctx)) ||
      BC == TargetLowering::UndefinedBooleanContent) {
    // Nested selects: IsLT ? -1 : (IsGT ? 1 : 0).
    auto One = MIRBuilder.buildConstant(Res: DstTy, Val: 1);
    auto SelectZeroOrOne = MIRBuilder.buildSelect(Res: DstTy, Tst: IsGT, Op0: One, Op1: Zero);

    auto MinusOne = MIRBuilder.buildConstant(Res: DstTy, Val: -1);
    MIRBuilder.buildSelect(Res: Dst, Tst: IsLT, Op0: MinusOne, Op1: SelectZeroOrOne);
  } else {
    // Boolean arithmetic: ext(IsGT) - ext(IsLT). With zero-or-negative-one
    // booleans the extended results are 0/-1 instead of 0/1, so swapping the
    // operands keeps the sign of the difference correct.
    if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
      std::swap(a&: IsGT, b&: IsLT);
    // Extend boolean results to DstTy, which is at least i2, before subtracting
    // them.
    unsigned BoolExtOp =
        MIRBuilder.getBoolExtOp(IsVec: DstTy.isVector(), /*isFP=*/IsFP: false);
    IsGT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsGT});
    IsLT = MIRBuilder.buildInstr(Opc: BoolExtOp, DstOps: {DstTy}, SrcOps: {IsLT});
    MIRBuilder.buildSub(Dst, Src0: IsGT, Src1: IsLT);
  }

  MI.eraseFromParent();
  return Legalized;
}
8841
8842LegalizerHelper::LegalizeResult
8843LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
8844 auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
8845 const int Src0Size = Src0Ty.getScalarSizeInBits();
8846 const int Src1Size = Src1Ty.getScalarSizeInBits();
8847
8848 auto SignBitMask = MIRBuilder.buildConstant(
8849 Res: Src0Ty, Val: APInt::getSignMask(BitWidth: Src0Size));
8850
8851 auto NotSignBitMask = MIRBuilder.buildConstant(
8852 Res: Src0Ty, Val: APInt::getLowBitsSet(numBits: Src0Size, loBitsSet: Src0Size - 1));
8853
8854 Register And0 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0, Src1: NotSignBitMask).getReg(Idx: 0);
8855 Register And1;
8856 if (Src0Ty == Src1Ty) {
8857 And1 = MIRBuilder.buildAnd(Dst: Src1Ty, Src0: Src1, Src1: SignBitMask).getReg(Idx: 0);
8858 } else if (Src0Size > Src1Size) {
8859 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src0Ty, Val: Src0Size - Src1Size);
8860 auto Zext = MIRBuilder.buildZExt(Res: Src0Ty, Op: Src1);
8861 auto Shift = MIRBuilder.buildShl(Dst: Src0Ty, Src0: Zext, Src1: ShiftAmt);
8862 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Shift, Src1: SignBitMask).getReg(Idx: 0);
8863 } else {
8864 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src1Ty, Val: Src1Size - Src0Size);
8865 auto Shift = MIRBuilder.buildLShr(Dst: Src1Ty, Src0: Src1, Src1: ShiftAmt);
8866 auto Trunc = MIRBuilder.buildTrunc(Res: Src0Ty, Op: Shift);
8867 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Trunc, Src1: SignBitMask).getReg(Idx: 0);
8868 }
8869
8870 // Be careful about setting nsz/nnan/ninf on every instruction, since the
8871 // constants are a nan and -0.0, but the final result should preserve
8872 // everything.
8873 unsigned Flags = MI.getFlags();
8874
8875 // We masked the sign bit and the not-sign bit, so these are disjoint.
8876 Flags |= MachineInstr::Disjoint;
8877
8878 MIRBuilder.buildOr(Dst, Src0: And0, Src1: And1, Flags);
8879
8880 MI.eraseFromParent();
8881 return Legalized;
8882}
8883
8884LegalizerHelper::LegalizeResult
8885LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
8886 // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
8887 // identical handling. fminimumnum/fmaximumnum also need a path that do not
8888 // depend on fminnum/fmaxnum.
8889
8890 unsigned NewOp;
8891 switch (MI.getOpcode()) {
8892 case TargetOpcode::G_FMINNUM:
8893 NewOp = TargetOpcode::G_FMINNUM_IEEE;
8894 break;
8895 case TargetOpcode::G_FMINIMUMNUM:
8896 NewOp = TargetOpcode::G_FMINNUM;
8897 break;
8898 case TargetOpcode::G_FMAXNUM:
8899 NewOp = TargetOpcode::G_FMAXNUM_IEEE;
8900 break;
8901 case TargetOpcode::G_FMAXIMUMNUM:
8902 NewOp = TargetOpcode::G_FMAXNUM;
8903 break;
8904 default:
8905 llvm_unreachable("unexpected min/max opcode");
8906 }
8907
8908 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8909 LLT Ty = MRI.getType(Reg: Dst);
8910
8911 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
8912 // Insert canonicalizes if it's possible we need to quiet to get correct
8913 // sNaN behavior.
8914
8915 // Note this must be done here, and not as an optimization combine in the
8916 // absence of a dedicate quiet-snan instruction as we're using an
8917 // omni-purpose G_FCANONICALIZE.
8918 if (!isKnownNeverSNaN(Val: Src0, MRI))
8919 Src0 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0, Flags: MI.getFlags()).getReg(Idx: 0);
8920
8921 if (!isKnownNeverSNaN(Val: Src1, MRI))
8922 Src1 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0: Src1, Flags: MI.getFlags()).getReg(Idx: 0);
8923 }
8924
8925 // If there are no nans, it's safe to simply replace this with the non-IEEE
8926 // version.
8927 MIRBuilder.buildInstr(Opc: NewOp, DstOps: {Dst}, SrcOps: {Src0, Src1}, Flags: MI.getFlags());
8928 MI.eraseFromParent();
8929 return Legalized;
8930}
8931
8932LegalizerHelper::LegalizeResult
8933LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
8934 unsigned Opc = MI.getOpcode();
8935 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8936 LLT Ty = MRI.getType(Reg: Dst);
8937 LLT CmpTy = Ty.changeElementSize(NewEltSize: 1);
8938
8939 bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
8940 unsigned OpcIeee =
8941 IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
8942 unsigned OpcNonIeee =
8943 IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
8944 bool MinMaxMustRespectOrderedZero = false;
8945 Register Res;
8946
8947 // IEEE variants don't need canonicalization
8948 if (LI.isLegalOrCustom(Query: {OpcIeee, Ty})) {
8949 Res = MIRBuilder.buildInstr(Opc: OpcIeee, DstOps: {Ty}, SrcOps: {Src0, Src1}).getReg(Idx: 0);
8950 MinMaxMustRespectOrderedZero = true;
8951 } else if (LI.isLegalOrCustom(Query: {OpcNonIeee, Ty})) {
8952 Res = MIRBuilder.buildInstr(Opc: OpcNonIeee, DstOps: {Ty}, SrcOps: {Src0, Src1}).getReg(Idx: 0);
8953 } else {
8954 auto Compare = MIRBuilder.buildFCmp(
8955 Pred: IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, Res: CmpTy, Op0: Src0, Op1: Src1);
8956 Res = MIRBuilder.buildSelect(Res: Ty, Tst: Compare, Op0: Src0, Op1: Src1).getReg(Idx: 0);
8957 }
8958
8959 // Propagate any NaN of both operands
8960 if (!MI.getFlag(Flag: MachineInstr::FmNoNans) &&
8961 (!isKnownNeverNaN(Val: Src0, MRI) || isKnownNeverNaN(Val: Src1, MRI))) {
8962 auto IsOrdered = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: CmpTy, Op0: Src0, Op1: Src1);
8963
8964 LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
8965 APFloat NaNValue = APFloat::getNaN(Sem: getFltSemanticForLLT(Ty: ElementTy));
8966 Register NaN = MIRBuilder.buildFConstant(Res: ElementTy, Val: NaNValue).getReg(Idx: 0);
8967 if (Ty.isVector())
8968 NaN = MIRBuilder.buildSplatBuildVector(Res: Ty, Src: NaN).getReg(Idx: 0);
8969
8970 Res = MIRBuilder.buildSelect(Res: Ty, Tst: IsOrdered, Op0: Res, Op1: NaN).getReg(Idx: 0);
8971 }
8972
8973 // fminimum/fmaximum requires -0.0 less than +0.0
8974 if (!MinMaxMustRespectOrderedZero && !MI.getFlag(Flag: MachineInstr::FmNsz)) {
8975 GISelValueTracking VT(MIRBuilder.getMF());
8976 KnownFPClass Src0Info = VT.computeKnownFPClass(R: Src0, InterestedClasses: fcZero);
8977 KnownFPClass Src1Info = VT.computeKnownFPClass(R: Src1, InterestedClasses: fcZero);
8978
8979 if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
8980 const unsigned Flags = MI.getFlags();
8981 Register Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0).getReg(Idx: 0);
8982 auto IsZero = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OEQ, Res: CmpTy, Op0: Res, Op1: Zero);
8983
8984 unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
8985
8986 auto LHSTestZero = MIRBuilder.buildIsFPClass(Res: CmpTy, Src: Src0, Mask: TestClass);
8987 auto LHSSelect =
8988 MIRBuilder.buildSelect(Res: Ty, Tst: LHSTestZero, Op0: Src0, Op1: Res, Flags);
8989
8990 auto RHSTestZero = MIRBuilder.buildIsFPClass(Res: CmpTy, Src: Src1, Mask: TestClass);
8991 auto RHSSelect =
8992 MIRBuilder.buildSelect(Res: Ty, Tst: RHSTestZero, Op0: Src1, Op1: LHSSelect, Flags);
8993
8994 Res = MIRBuilder.buildSelect(Res: Ty, Tst: IsZero, Op0: RHSSelect, Op1: Res, Flags).getReg(Idx: 0);
8995 }
8996 }
8997
8998 MIRBuilder.buildCopy(Res: Dst, Op: Res);
8999 MI.eraseFromParent();
9000 return Legalized;
9001}
9002
9003LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
9004 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
9005 Register DstReg = MI.getOperand(i: 0).getReg();
9006 LLT Ty = MRI.getType(Reg: DstReg);
9007 unsigned Flags = MI.getFlags();
9008
9009 auto Mul = MIRBuilder.buildFMul(Dst: Ty, Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2),
9010 Flags);
9011 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Mul, Src1: MI.getOperand(i: 3), Flags);
9012 MI.eraseFromParent();
9013 return Legalized;
9014}
9015
9016LegalizerHelper::LegalizeResult
9017LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
9018 auto [DstReg, X] = MI.getFirst2Regs();
9019 const unsigned Flags = MI.getFlags();
9020 const LLT Ty = MRI.getType(Reg: DstReg);
9021 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
9022
9023 // round(x) =>
9024 // t = trunc(x);
9025 // d = fabs(x - t);
9026 // o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
9027 // return t + o;
9028
9029 auto T = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: X, Flags);
9030
9031 auto Diff = MIRBuilder.buildFSub(Dst: Ty, Src0: X, Src1: T, Flags);
9032 auto AbsDiff = MIRBuilder.buildFAbs(Dst: Ty, Src0: Diff, Flags);
9033
9034 auto Half = MIRBuilder.buildFConstant(Res: Ty, Val: 0.5);
9035 auto Cmp =
9036 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGE, Res: CondTy, Op0: AbsDiff, Op1: Half, Flags);
9037
9038 // Could emit G_UITOFP instead
9039 auto One = MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
9040 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
9041 auto BoolFP = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: One, Op1: Zero);
9042 auto SignedOffset = MIRBuilder.buildFCopysign(Dst: Ty, Src0: BoolFP, Src1: X);
9043
9044 MIRBuilder.buildFAdd(Dst: DstReg, Src0: T, Src1: SignedOffset, Flags);
9045
9046 MI.eraseFromParent();
9047 return Legalized;
9048}
9049
9050LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
9051 auto [DstReg, SrcReg] = MI.getFirst2Regs();
9052 unsigned Flags = MI.getFlags();
9053 LLT Ty = MRI.getType(Reg: DstReg);
9054 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
9055
9056 // result = trunc(src);
9057 // if (src < 0.0 && src != result)
9058 // result += -1.0.
9059
9060 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: SrcReg, Flags);
9061 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
9062
9063 auto Lt0 = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: CondTy,
9064 Op0: SrcReg, Op1: Zero, Flags);
9065 auto NeTrunc = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: CondTy,
9066 Op0: SrcReg, Op1: Trunc, Flags);
9067 auto And = MIRBuilder.buildAnd(Dst: CondTy, Src0: Lt0, Src1: NeTrunc);
9068 auto AddVal = MIRBuilder.buildSITOFP(Dst: Ty, Src0: And);
9069
9070 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Trunc, Src1: AddVal, Flags);
9071 MI.eraseFromParent();
9072 return Legalized;
9073}
9074
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  // Lower G_MERGE_VALUES by zero-extending each source part into a wide
  // scalar, shifting it to its bit offset, and OR-ing the parts together.
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  unsigned PartSize = Src0Ty.getSizeInBits();

  LLT WideTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
  // Part 0 sits at offset 0; it seeds the accumulator with no shift.
  Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src0Reg).getReg(Idx: 0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(i: I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);

    // On the last part, write the final OR directly into DstReg, unless a
    // pointer cast is still needed afterwards.
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(Ty: WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
    auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
    MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // Rebuilding a pointer from an integer is only meaningful in integral
    // address spaces.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
            AddrSpace: DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
9112
9113LegalizerHelper::LegalizeResult
9114LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
9115 const unsigned NumDst = MI.getNumOperands() - 1;
9116 Register SrcReg = MI.getOperand(i: NumDst).getReg();
9117 Register Dst0Reg = MI.getOperand(i: 0).getReg();
9118 LLT DstTy = MRI.getType(Reg: Dst0Reg);
9119 if (DstTy.isPointer())
9120 return UnableToLegalize; // TODO
9121
9122 SrcReg = coerceToScalar(Val: SrcReg);
9123 if (!SrcReg)
9124 return UnableToLegalize;
9125
9126 // Expand scalarizing unmerge as bitcast to integer and shift.
9127 LLT IntTy = MRI.getType(Reg: SrcReg);
9128
9129 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
9130
9131 const unsigned DstSize = DstTy.getSizeInBits();
9132 unsigned Offset = DstSize;
9133 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
9134 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntTy, Val: Offset);
9135 auto Shift = MIRBuilder.buildLShr(Dst: IntTy, Src0: SrcReg, Src1: ShiftAmt);
9136 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shift);
9137 }
9138
9139 MI.eraseFromParent();
9140 return Legalized;
9141}
9142
/// Lower a vector extract or insert by writing the vector to a stack temporary
/// and reloading the element or vector.
///
/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
/// =>
/// %stack_temp = G_FRAME_INDEX
/// G_STORE %vec, %stack_temp
/// %idx = clamp(%idx, %vec.getNumElements())
/// %element_ptr = G_PTR_ADD %stack_temp, %idx
/// %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SrcVec = MI.getOperand(i: 1).getReg();
  Register InsertVal;
  // InsertVal stays invalid for the extract case; it doubles as the
  // extract-vs-insert discriminator below.
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(i: 2).getReg();

  // The index is the last operand for both opcodes.
  Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(Reg: SrcVec);
  LLT EltTy = VecTy.getElementType();
  unsigned NumElts = VecTy.getNumElements();

  // Constant index: avoid the stack round-trip by unmerging into scalar
  // parts and picking (extract) or replacing (insert) one part.
  int64_t IdxVal;
  if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal)) && IdxVal <= NumElts) {
    // NOTE(review): the guard admits IdxVal == NumElts while SrcRegs below
    // holds exactly NumElts entries — presumably out-of-bounds constant
    // indices cannot reach this point; confirm.
    SmallVector<Register, 8> SrcRegs;
    extractParts(Reg: SrcVec, Ty: EltTy, NumParts: NumElts, VRegs&: SrcRegs, MIRBuilder, MRI);

    if (InsertVal) {
      SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
      MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcRegs);
    } else {
      MIRBuilder.buildCopy(Res: DstReg, Op: SrcRegs[IdxVal]);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  if (!EltTy.isByteSized()) { // Not implemented.
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
  Align EltAlign;

  // Spill the whole vector to a stack slot.
  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(
      Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign, PtrInfo);
  MIRBuilder.buildStore(Val: SrcVec, Addr: StackTemp, PtrInfo, Alignment: VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(VecPtr: StackTemp.getReg(Idx: 0), VecTy, Index: Idx);

  if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal))) {
    // Known offset: keep precise pointer info and derived alignment.
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(O: Offset);
    EltAlign = commonAlignment(A: VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(Ty: EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(Reg: EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element
    MIRBuilder.buildStore(Val: InsertVal, Addr: EltPtr, PtrInfo, Alignment: EltAlign);

    // Reload the whole vector.
    MIRBuilder.buildLoad(Res: DstReg, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
  } else {
    MIRBuilder.buildLoad(Res: DstReg, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}
9224
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  // Lower G_SHUFFLE_VECTOR to a G_BUILD_VECTOR of per-lane extracts from the
  // two source vectors.
  auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
      MI.getFirst3RegLLTs();
  LLT IdxTy = LLT::scalar(SizeInBits: 32);

  ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getScalarType();

  // Emit only one extract per distinct mask index; repeated lanes reuse it.
  DenseMap<unsigned, Register> CachedExtract;

  for (int Idx : Mask) {
    if (Idx < 0) {
      // Negative mask entries are "don't care" lanes; share a single undef.
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0);
      BuildVec.push_back(Elt: Undef);
      continue;
    }

    assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR");

    // Mask indices >= NumElts address the second source vector.
    int NumElts = Src0Ty.getNumElements();
    Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
    int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
    auto [It, Inserted] = CachedExtract.try_emplace(Key: Idx);
    if (Inserted) {
      auto IdxK = MIRBuilder.buildConstant(Res: IdxTy, Val: ExtractIdx);
      It->second =
          MIRBuilder.buildExtractVectorElement(Res: EltTy, Val: SrcVec, Idx: IdxK).getReg(Idx: 0);
    }
    BuildVec.push_back(Elt: It->second);
  }

  assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR");
  MIRBuilder.buildBuildVector(Res: DstReg, Ops: BuildVec);
  MI.eraseFromParent();
  return Legalized;
}
9265
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
  // Lower G_VECTOR_COMPRESS via a stack temporary: optionally pre-fill with
  // the passthru vector, then walk the lanes storing each element at an
  // output cursor that only advances on enabled mask lanes, and finally
  // reload the compressed vector.
  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
      MI.getFirst4RegLLTs();

  if (VecTy.isScalableVector())
    report_fatal_error(reason: "Cannot expand masked_compress for scalable vectors.");

  Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
  MachinePointerInfo PtrInfo;
  Register StackPtr =
      createStackTemporary(Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign,
                           PtrInfo)
          .getReg(Idx: 0);
  MachinePointerInfo ValPtrInfo =
      MachinePointerInfo::getUnknownStack(MF&: *MI.getMF());

  LLT IdxTy = LLT::scalar(SizeInBits: 32);
  LLT ValTy = VecTy.getElementType();
  Align ValAlign = getStackTemporaryAlignment(Ty: ValTy);

  // Output cursor: index of the next compressed element to write.
  auto OutPos = MIRBuilder.buildConstant(Res: IdxTy, Val: 0);

  // An implicit-def passthru means "don't care" for the tail lanes.
  bool HasPassthru =
      MRI.getVRegDef(Reg: Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;

  // Pre-fill the slot so lanes past the compressed run keep passthru values.
  if (HasPassthru)
    MIRBuilder.buildStore(Val: Passthru, Addr: StackPtr, PtrInfo, Alignment: VecAlign);

  Register LastWriteVal;
  std::optional<APInt> PassthruSplatVal =
      isConstantOrConstantSplatVector(MI&: *MRI.getVRegDef(Reg: Passthru), MRI);

  if (PassthruSplatVal.has_value()) {
    // Splat passthru: the value following the compressed run is a constant.
    LastWriteVal =
        MIRBuilder.buildConstant(Res: ValTy, Val: PassthruSplatVal.value()).getReg(Idx: 0);
  } else if (HasPassthru) {
    // Non-splat passthru: the element that should follow the compressed run
    // is the passthru element at position popcount(Mask); load it from the
    // pre-filled slot.
    auto Popcount = MIRBuilder.buildZExt(Res: MaskTy.changeElementSize(NewEltSize: 32), Op: Mask);
    Popcount = MIRBuilder.buildInstr(Opc: TargetOpcode::G_VECREDUCE_ADD,
                                     DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {Popcount});

    Register LastElmtPtr =
        getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: Popcount.getReg(Idx: 0));
    LastWriteVal =
        MIRBuilder.buildLoad(Res: ValTy, Addr: LastElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign)
            .getReg(Idx: 0);
  }

  unsigned NumElmts = VecTy.getNumElements();
  for (unsigned I = 0; I < NumElmts; ++I) {
    auto Idx = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
    auto Val = MIRBuilder.buildExtractVectorElement(Res: ValTy, Val: Vec, Idx);
    // Store the lane unconditionally; a disabled lane's store is overwritten
    // by the next enabled lane because the cursor does not advance for it.
    Register ElmtPtr =
        getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));
    MIRBuilder.buildStore(Val, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);

    // Advance the cursor by the zero-extended mask bit (0 or 1).
    LLT MaskITy = MaskTy.getElementType();
    auto MaskI = MIRBuilder.buildExtractVectorElement(Res: MaskITy, Val: Mask, Idx);
    if (MaskITy.getSizeInBits() > 1)
      MaskI = MIRBuilder.buildTrunc(Res: LLT::scalar(SizeInBits: 1), Op: MaskI);

    MaskI = MIRBuilder.buildZExt(Res: IdxTy, Op: MaskI);
    OutPos = MIRBuilder.buildAdd(Dst: IdxTy, Src0: OutPos, Src1: MaskI);

    // After the last lane, store LastWriteVal just past the compressed run
    // (with the cursor clamped in-bounds in case every lane was selected).
    if (HasPassthru && I == NumElmts - 1) {
      auto EndOfVector =
          MIRBuilder.buildConstant(Res: IdxTy, Val: VecTy.getNumElements() - 1);
      auto AllLanesSelected = MIRBuilder.buildICmp(
          Pred: CmpInst::ICMP_UGT, Res: LLT::scalar(SizeInBits: 1), Op0: OutPos, Op1: EndOfVector);
      OutPos = MIRBuilder.buildInstr(Opc: TargetOpcode::G_UMIN, DstOps: {IdxTy},
                                     SrcOps: {OutPos, EndOfVector});
      ElmtPtr = getVectorElementPointer(VecPtr: StackPtr, VecTy, Index: OutPos.getReg(Idx: 0));

      LastWriteVal =
          MIRBuilder.buildSelect(Res: ValTy, Tst: AllLanesSelected, Op0: Val, Op1: LastWriteVal)
              .getReg(Idx: 0);
      MIRBuilder.buildStore(Val: LastWriteVal, Addr: ElmtPtr, PtrInfo: ValPtrInfo, Alignment: ValAlign);
    }
  }

  // TODO: Use StackPtr's FrameIndex alignment.
  MIRBuilder.buildLoad(Res: Dst, Addr: StackPtr, PtrInfo, Alignment: VecAlign);

  MI.eraseFromParent();
  return Legalized;
}
9352
Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
                                                    Register AllocSize,
                                                    Align Alignment,
                                                    LLT PtrTy) {
  // Compute the new stack pointer for a downward-growing stack:
  // align_down(SP - AllocSize, Alignment), built as integer arithmetic.
  LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());

  auto SPTmp = MIRBuilder.buildCopy(Res: PtrTy, Op: SPReg);
  SPTmp = MIRBuilder.buildCast(Dst: IntPtrTy, Src: SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(Dst: IntPtrTy, Src0: SPTmp, Src1: AllocSize);
  if (Alignment > Align(1)) {
    // AND with -Alignment clears the low bits, rounding the pointer value
    // down to the alignment boundary.
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(Res: IntPtrTy, Val: AlignMask);
    Alloc = MIRBuilder.buildAnd(Dst: IntPtrTy, Src0: Alloc, Src1: AlignCst);
  }

  return MIRBuilder.buildCast(Dst: PtrTy, Src: Alloc).getReg(Idx: 0);
}
9375
9376LegalizerHelper::LegalizeResult
9377LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
9378 const auto &MF = *MI.getMF();
9379 const auto &TFI = *MF.getSubtarget().getFrameLowering();
9380 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
9381 return UnableToLegalize;
9382
9383 Register Dst = MI.getOperand(i: 0).getReg();
9384 Register AllocSize = MI.getOperand(i: 1).getReg();
9385 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
9386
9387 LLT PtrTy = MRI.getType(Reg: Dst);
9388 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
9389 Register SPTmp =
9390 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
9391
9392 MIRBuilder.buildCopy(Res: SPReg, Op: SPTmp);
9393 MIRBuilder.buildCopy(Res: Dst, Op: SPTmp);
9394
9395 MI.eraseFromParent();
9396 return Legalized;
9397}
9398
9399LegalizerHelper::LegalizeResult
9400LegalizerHelper::lowerStackSave(MachineInstr &MI) {
9401 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
9402 if (!StackPtr)
9403 return UnableToLegalize;
9404
9405 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: StackPtr);
9406 MI.eraseFromParent();
9407 return Legalized;
9408}
9409
9410LegalizerHelper::LegalizeResult
9411LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
9412 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
9413 if (!StackPtr)
9414 return UnableToLegalize;
9415
9416 MIRBuilder.buildCopy(Res: StackPtr, Op: MI.getOperand(i: 0));
9417 MI.eraseFromParent();
9418 return Legalized;
9419}
9420
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  // Lower G_EXTRACT either by unmerging whole vector elements or by shifting
  // and truncating an integer view of the source.
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(i: 2).getImm();

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    // Only when the extract is element-aligned, element-sized, and in range.
    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcTy.getElementType(), Op: SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Elt: Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(Res: DstReg, Op: SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  // Scalar destination: view the source as one wide integer, shift the
  // wanted field down to bit 0, and truncate.
  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildBitcast(Dst: SrcIntTy, Src: SrcReg).getReg(Idx: 0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(Res: DstReg, Op: SrcReg);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcIntTy, Val: Offset);
      auto Shr = MIRBuilder.buildLShr(Dst: SrcIntTy, Src0: SrcReg, Src1: ShiftAmt);
      MIRBuilder.buildTrunc(Res: DstReg, Op: Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
9475
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  // Lower G_INSERT either by rebuilding the vector from unmerged elements or
  // by integer mask/shift/or bit manipulation.
  auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
  uint64_t Offset = MI.getOperand(i: 3).getImm();

  LLT DstTy = MRI.getType(Reg: Src);
  LLT InsertTy = MRI.getType(Reg: InsertSrc);

  // Insert sub-vector or one element
  if (DstTy.isVector() && !InsertTy.isPointer()) {
    LLT EltTy = DstTy.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    // Only when the insert is element-aligned, element-sized, and in range.
    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before insert start Offset
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc
      if (InsertTy.getSizeInBits() > EltSize) {
        // Multi-element insert: unmerge InsertSrc too and splice its parts.
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(Elt: UnmergeInsertSrc.getReg(Idx: i));
        }
      } else {
        // Single-element insert: InsertSrc is itself the element.
        DstElts.push_back(Elt: InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after insert
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  // The bit-manipulation path below only handles scalar-like inserts.
  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  // Pointer <-> integer casts are invalid in non-integral address spaces.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(AddrSpace: DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(AddrSpace: InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  // View both the destination and the inserted value as plain integers.
  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(Dst: IntDstTy, Src).getReg(Idx: 0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(SizeInBits: InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(Dst: IntInsertTy, Src: InsertSrc).getReg(Idx: 0);
  }

  // Shift the inserted bits up to their field position.
  Register ExtInsSrc = MIRBuilder.buildZExt(Res: IntDstTy, Op: InsertSrc).getReg(Idx: 0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(Res: IntDstTy, Val: Offset);
    ExtInsSrc = MIRBuilder.buildShl(Dst: IntDstTy, Src0: ExtInsSrc, Src1: ShiftAmt).getReg(Idx: 0);
  }

  // Mask keeps every destination bit outside the inserted field.
  APInt MaskVal = APInt::getBitsSetWithWrap(
      numBits: DstTy.getSizeInBits(), loBit: Offset + InsertTy.getSizeInBits(), hiBit: Offset);

  auto Mask = MIRBuilder.buildConstant(Res: IntDstTy, Val: MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(Dst: IntDstTy, Src0: Src, Src1: Mask);
  auto Or = MIRBuilder.buildOr(Dst: IntDstTy, Src0: MaskedSrc, Src1: ExtInsSrc);

  MIRBuilder.buildCast(Dst, Src: Or);
  MI.eraseFromParent();
  return Legalized;
}
9564
9565LegalizerHelper::LegalizeResult
9566LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
9567 auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
9568 MI.getFirst4RegLLTs();
9569 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
9570
9571 LLT Ty = Dst0Ty;
9572 LLT BoolTy = Dst1Ty;
9573
9574 Register NewDst0 = MRI.cloneVirtualRegister(VReg: Dst0);
9575
9576 if (IsAdd)
9577 MIRBuilder.buildAdd(Dst: NewDst0, Src0: LHS, Src1: RHS);
9578 else
9579 MIRBuilder.buildSub(Dst: NewDst0, Src0: LHS, Src1: RHS);
9580
9581 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
9582
9583 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
9584
9585 // For an addition, the result should be less than one of the operands (LHS)
9586 // if and only if the other operand (RHS) is negative, otherwise there will
9587 // be overflow.
9588 // For a subtraction, the result should be less than one of the operands
9589 // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
9590 // otherwise there will be overflow.
9591 auto ResultLowerThanLHS =
9592 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: NewDst0, Op1: LHS);
9593 auto ConditionRHS = MIRBuilder.buildICmp(
9594 Pred: IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, Res: BoolTy, Op0: RHS, Op1: Zero);
9595
9596 MIRBuilder.buildXor(Dst: Dst1, Src0: ConditionRHS, Src1: ResultLowerThanLHS);
9597
9598 MIRBuilder.buildCopy(Res: Dst0, Op: NewDst0);
9599 MI.eraseFromParent();
9600
9601 return Legalized;
9602}
9603
// Lower G_SADDE: signed addition with carry-in and a signed-overflow output.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSADDE(MachineInstr &MI) {
  auto [Res, OvOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
  const LLT Ty = MRI.getType(Reg: Res);

  // sum = LHS + RHS + zext(CarryIn)
  auto Tmp = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);
  auto CarryZ = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
  auto Sum = MIRBuilder.buildAdd(Dst: Ty, Src0: Tmp, Src1: CarryZ);
  MIRBuilder.buildCopy(Res, Op: Sum);

  // OvOut = icmp slt ((sum ^ lhs) & (sum ^ rhs)), 0
  // Signed overflow occurred iff the sign of the final sum differs from the
  // sign of both operands; the 0/1 carry-in does not invalidate this test.
  auto AX = MIRBuilder.buildXor(Dst: Ty, Src0: Sum, Src1: LHS);
  auto BX = MIRBuilder.buildXor(Dst: Ty, Src0: Sum, Src1: RHS);
  auto T = MIRBuilder.buildAnd(Dst: Ty, Src0: AX, Src1: BX);

  auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
  MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: OvOut, Op0: T, Op1: Zero);

  MI.eraseFromParent();
  return Legalized;
}
9625
// Lower G_SSUBE: signed subtraction with borrow-in and a signed-overflow
// output.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSSUBE(MachineInstr &MI) {
  auto [Res, OvOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
  const LLT Ty = MRI.getType(Reg: Res);

  // Diff = LHS - (RHS + zext(CarryIn))
  auto CarryZ = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
  auto RHSPlusCI = MIRBuilder.buildAdd(Dst: Ty, Src0: RHS, Src1: CarryZ);
  auto Diff = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHSPlusCI);
  MIRBuilder.buildCopy(Res, Op: Diff);

  // ov = msb((LHS ^ RHS) & (LHS ^ Diff))
  // Signed overflow occurred iff the operands had different signs and the
  // result's sign differs from LHS.
  auto X1 = MIRBuilder.buildXor(Dst: Ty, Src0: LHS, Src1: RHS);
  auto X2 = MIRBuilder.buildXor(Dst: Ty, Src0: LHS, Src1: Diff);
  auto T = MIRBuilder.buildAnd(Dst: Ty, Src0: X1, Src1: X2);
  auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
  MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: OvOut, Op0: T, Op1: Zero);

  MI.eraseFromParent();
  return Legalized;
}
9646
// Lower G_[SU]ADDSAT / G_[SU]SUBSAT using min/max: clamp the second operand
// into the range that cannot overflow for the given first operand, then emit
// a plain G_ADD / G_SUB.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Reg: Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
      Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MaxVal, Src1: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: Zero));
      Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MinVal, Src1: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Res: Ty, Val: -1);
      Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: NegOne),
                               Src1: MaxVal);
      Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: NegOne),
                               Src1: MinVal);
    }
    auto RHSClamped =
        MIRBuilder.buildSMin(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: Lo, Src1: RHS), Src1: Hi);
    MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Dst: Ty, Src0: LHS).getReg(Idx: 0) : LHS;
    auto Min = MIRBuilder.buildUMin(Dst: Ty, Src0: Not, Src1: RHS);
    MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
9721
// Lower G_[SU]ADDSAT / G_[SU]SUBSAT via the matching overflow opcode: perform
// the raw operation, then select the saturation constant whenever the
// overflow flag fires.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Reg: Res);
  LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(Opc: OverflowOp, DstOps: {Ty, BoolTy}, SrcOps: {LHS, RHS});
  Register Tmp = OverflowRes.getReg(Idx: 0);
  Register Ov = OverflowRes.getReg(Idx: 1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // On overflow, the sign of the (wrapped) raw result picks which signed
    // bound to saturate to: sign >> (w-1) is 0 or -1, and adding INT_MIN
    // yields INT_MIN or INT_MAX respectively.
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Res: Ty, Val: NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Dst: Ty, Src0: Tmp, Src1: ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
    Clamp = MIRBuilder.buildAdd(Dst: Ty, Src0: Sign, Src1: MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Res: Ty, Val: IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Tst: Ov, Op0: Clamp, Op1: Tmp);

  MI.eraseFromParent();
  return Legalized;
}
9787
// Lower G_SSHLSAT / G_USHLSAT. Overflow is detected by shifting the result
// back down and comparing against the original operand; on mismatch, the
// result is replaced with the saturation constant (sign-selected min/max for
// the signed form, all-ones for the unsigned form).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Reg: Res);
  LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Dst: Ty, Src0: LHS, Src1: RHS);
  // Round-trip shift: equals LHS iff no bits were shifted out.
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Dst: Ty, Src0: Result, Src1: RHS)
                       : MIRBuilder.buildLShr(Dst: Ty, Src0: Result, Src1: RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    // Saturate toward the sign of the original operand.
    auto SatMin = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: BW));
    auto SatMax = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: BW));
    auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: LHS,
                                    Op1: MIRBuilder.buildConstant(Res: Ty, Val: 0));
    SatVal = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: SatMin, Op1: SatMax);
  } else {
    SatVal = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getMaxValue(numBits: BW));
  }
  auto Ov = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: BoolTy, Op0: LHS, Op1: Orig);
  MIRBuilder.buildSelect(Res, Tst: Ov, Op0: SatVal, Op1: Result);

  MI.eraseFromParent();
  return Legalized;
}
9819
9820LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
9821 auto [Dst, Src] = MI.getFirst2Regs();
9822 const LLT Ty = MRI.getType(Reg: Src);
9823 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
9824 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
9825
9826 // Swap most and least significant byte, set remaining bytes in Res to zero.
9827 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt);
9828 auto LSByteShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9829 auto MSByteShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9830 auto Res = MIRBuilder.buildOr(Dst: Ty, Src0: MSByteShiftedRight, Src1: LSByteShiftedLeft);
9831
9832 // Set i-th high/low byte in Res to i-th low/high byte from Src.
9833 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
9834 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
9835 APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
9836 auto Mask = MIRBuilder.buildConstant(Res: Ty, Val: APMask);
9837 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt - 16 * i);
9838 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
9839 auto LoByte = MIRBuilder.buildAnd(Dst: Ty, Src0: Src, Src1: Mask);
9840 auto LoShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: LoByte, Src1: ShiftAmt);
9841 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: LoShiftedLeft);
9842 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
9843 auto SrcShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
9844 auto HiShiftedRight = MIRBuilder.buildAnd(Dst: Ty, Src0: SrcShiftedRight, Src1: Mask);
9845 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: HiShiftedRight);
9846 }
9847 Res.getInstr()->getOperand(i: 0).setReg(Dst);
9848
9849 MI.eraseFromParent();
9850 return Legalized;
9851}
9852
9853//{ (Src & Mask) >> N } | { (Src << N) & Mask }
9854static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
9855 MachineInstrBuilder Src, const APInt &Mask) {
9856 const LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
9857 MachineInstrBuilder C_N = B.buildConstant(Res: Ty, Val: N);
9858 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Res: Ty, Val: Mask);
9859 auto LHS = B.buildLShr(Dst: Ty, Src0: B.buildAnd(Dst: Ty, Src0: Src, Src1: MaskLoNTo0), Src1: C_N);
9860 auto RHS = B.buildAnd(Dst: Ty, Src0: B.buildShl(Dst: Ty, Src0: Src, Src1: C_N), Src1: MaskLoNTo0);
9861 return B.buildOr(Dst, Src0: LHS, Src1: RHS);
9862}
9863
// Lower G_BITREVERSE. For types of at least one byte, reverse the bytes with
// G_BSWAP and then reverse the bits within each byte using three masked
// shift/or rounds (nibbles, bit pairs, single bits); for sub-byte types, move
// each bit to its mirrored position individually.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT SrcTy = MRI.getType(Reg: Src);
  unsigned Size = SrcTy.getScalarSizeInBits();
  unsigned VSize = SrcTy.getSizeInBits();

  if (Size >= 8) {
    if (SrcTy.isVector() && (VSize % 8 == 0) &&
        (LI.isLegal(Query: {TargetOpcode::G_BITREVERSE,
                     {LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8),
                      LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8)}}))) {
      // If bitreverse is legal for i8 vector of the same size, then cast
      // to i8 vector type.
      // e.g. v4s32 -> v16s8
      LLT VTy = LLT::fixed_vector(NumElements: VSize / 8, ScalarSizeInBits: 8);
      auto BSWAP = MIRBuilder.buildBSwap(Dst: SrcTy, Src0: Src);
      auto Cast = MIRBuilder.buildBitcast(Dst: VTy, Src: BSWAP);
      auto RBIT = MIRBuilder.buildBitReverse(Dst: VTy, Src: Cast);
      MIRBuilder.buildBitcast(Dst, Src: RBIT);
    } else {
      // Byte-reverse first; the swaps below then only need to reverse bits
      // inside each byte, so a byte-splat mask works for every round.
      MachineInstrBuilder BSWAP =
          MIRBuilder.buildInstr(Opc: TargetOpcode::G_BSWAP, DstOps: {SrcTy}, SrcOps: {Src});

      // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
      //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
      // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
      MachineInstrBuilder Swap4 = SwapN(N: 4, Dst: SrcTy, B&: MIRBuilder, Src: BSWAP,
                                        Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xF0)));

      // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
      //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
      // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
      MachineInstrBuilder Swap2 = SwapN(N: 2, Dst: SrcTy, B&: MIRBuilder, Src: Swap4,
                                        Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xCC)));

      // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
      // 6|7
      //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
      // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
      SwapN(N: 1, Dst, B&: MIRBuilder, Src: Swap2, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xAA)));
    }
  } else {
    // Expand bitreverse for types smaller than 8 bits.
    MachineInstrBuilder Tmp;
    // Bit I of the source lands at bit J = Size-1-I of the result; shift each
    // bit into place, isolate it, and OR the pieces together.
    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
      MachineInstrBuilder Tmp2;
      if (I < J) {
        auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: J - I);
        Tmp2 = MIRBuilder.buildShl(Dst: SrcTy, Src0: Src, Src1: ShAmt);
      } else {
        auto ShAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: I - J);
        Tmp2 = MIRBuilder.buildLShr(Dst: SrcTy, Src0: Src, Src1: ShAmt);
      }

      auto Mask = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << J);
      Tmp2 = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Tmp2, Src1: Mask);
      if (I == 0)
        Tmp = Tmp2;
      else
        Tmp = MIRBuilder.buildOr(Dst: SrcTy, Src0: Tmp, Src1: Tmp2);
    }
    MIRBuilder.buildCopy(Res: Dst, Op: Tmp);
  }

  MI.eraseFromParent();
  return Legalized;
}
9932
// Lower G_READ_REGISTER / G_WRITE_REGISTER into a COPY from/to the physical
// register named by the metadata operand. If the target does not recognize
// the name, emit a diagnostic, define the read result as undef, and drop the
// instruction.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  // Operand layout differs: read is (dst, name-md), write is (name-md, src).
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(i: ValRegIndex).getReg();
  const LLT Ty = MRI.getType(Reg: ValReg);
  const MDString *RegStr = cast<MDString>(
    Val: cast<MDNode>(Val: MI.getOperand(i: NameOpIdx).getMetadata())->getOperand(I: 0));

  Register PhysReg = TLI.getRegisterByName(RegName: RegStr->getString().data(), Ty, MF);
  if (!PhysReg) {
    const Function &Fn = MF.getFunction();
    Fn.getContext().diagnose(DI: DiagnosticInfoGenericWithLoc(
        "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
            (IsRead ? "llvm.read_register" : "llvm.write_register"),
        Fn, MI.getDebugLoc()));
    // Keep the IR valid: a read must still define its result.
    if (IsRead)
      MIRBuilder.buildUndef(Res: ValReg);

    MI.eraseFromParent();
    return Legalized;
  }

  if (IsRead)
    MIRBuilder.buildCopy(Res: ValReg, Op: PhysReg);
  else
    MIRBuilder.buildCopy(Res: PhysReg, Op: ValReg);

  MI.eraseFromParent();
  return Legalized;
}
9968
9969LegalizerHelper::LegalizeResult
9970LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
9971 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
9972 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
9973 Register Result = MI.getOperand(i: 0).getReg();
9974 LLT OrigTy = MRI.getType(Reg: Result);
9975 auto SizeInBits = OrigTy.getScalarSizeInBits();
9976 LLT WideTy = OrigTy.changeElementSize(NewEltSize: SizeInBits * 2);
9977
9978 auto LHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 1)});
9979 auto RHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
9980 auto Mul = MIRBuilder.buildMul(Dst: WideTy, Src0: LHS, Src1: RHS);
9981 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
9982
9983 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: SizeInBits);
9984 auto Shifted = MIRBuilder.buildInstr(Opc: ShiftOp, DstOps: {WideTy}, SrcOps: {Mul, ShiftAmt});
9985 MIRBuilder.buildTrunc(Res: Result, Op: Shifted);
9986
9987 MI.eraseFromParent();
9988 return Legalized;
9989}
9990
// Lower G_IS_FPCLASS into integer bit tests on the value's raw
// representation (same approach as the SelectionDAG expansion): copy the FP
// register into an equally-sized integer type and OR together one comparison
// per requested class group.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(i: 2).getImm());

  // Trivial masks: no class requested / every class requested.
  if (Mask == fcNone) {
    MIRBuilder.buildConstant(Res: DstReg, Val: 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(Res: DstReg, Val: 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());

  // View the FP value as raw integer bits; a plain COPY suffices since the
  // integer type has the same size.
  LLT IntTy = SrcTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: BitSize));
  auto AsInt = MIRBuilder.buildCopy(Res: IntTy, Op: SrcReg);

  // Various masks.
  APInt SignBit = APInt::getSignMask(BitWidth: BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(numBits: BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Sem: Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Sem: Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(numBits: BitSize, BitNo: AllOneMantissa.getActiveBits() - 1);
  APInt InversionMask = APInt::getAllOnes(numBits: DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(Res: IntTy, Val: SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ValueMask);
  auto InfC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(Res: IntTy, Val: 0);

  // Abs = value with the sign bit cleared; Sign = true iff the sign bit was
  // set (the clear changed the bits).
  auto Abs = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ValueMaskC);
  auto Sign =
      MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: DstTy, Op0: AsInt, Op1: Abs);

  // Accumulate partial class checks by OR-ing into Res.
  auto Res = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(Dst: DstTyCopy, Src0: Res, Src1: ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
                                     Op1: ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: AsInt,
                                     Op1: ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
                                    Op1: ExpMaskC);
    auto And = MIRBuilder.buildAnd(Dst: DstTy, Src0: Cmp, Src1: Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: ExpBits, Op1: ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    // The subtraction turns zero into all-ones, which fails the u< test.
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(Res: IntTy, Val: 1);
    auto VMinusOne = MIRBuilder.buildSub(Dst: IntTy, Src0: V, Src1: OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: VMinusOne,
                             Op1: MIRBuilder.buildConstant(Res: IntTy, Val: AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: SubnormalRes, Src1: Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Sem: Semantics, Negative: true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(Res: IntTy, Val: NegInf);
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
                                       Op0: AsInt, Op1: NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGE, Res: DstTy, Op0: Abs,
                                       Op1: InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //                    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy,
                                            Op0: Abs, Op1: InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(Dst: DstTy, Src0: IsNan, Src1: IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(shiftAmt: 1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        Dst: IntTy, Src0: Abs, Src1: MIRBuilder.buildConstant(Res: IntTy, Val: ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: ExpMinusOne,
                             Op1: MIRBuilder.buildConstant(Res: IntTy, Val: MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: Sign);
    else if (PartialCheck == fcPosNormal) {
      // PosSign = !Sign, computed by XOR-ing with an all-ones mask.
      auto PosSign = MIRBuilder.buildXor(
          Dst: DstTy, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: DstTy, Val: InversionMask));
      NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: PosSign);
    }
    appendToRes(NormalRes);
  }

  MIRBuilder.buildCopy(Res: DstReg, Op: Res);
  MI.eraseFromParent();
  return Legalized;
}
10162
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement G_SELECT in terms of XOR, AND, OR:
  //   Res = (Op1 & Mask) | (Op2 & ~Mask)
  // which requires the mask to be all-ones/all-zeros per element.
  auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
      MI.getFirst4RegLLTs();

  // Bitwise ops are not available on pointers: do the select on equally sized
  // integers and cast the result back afterwards.
  bool IsEltPtr = DstTy.isPointerOrPointerVector();
  if (IsEltPtr) {
    LLT ScalarPtrTy = LLT::scalar(SizeInBits: DstTy.getScalarSizeInBits());
    LLT NewTy = DstTy.changeElementType(NewEltTy: ScalarPtrTy);
    Op1Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op1Reg).getReg(Idx: 0);
    Op2Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op2Reg).getReg(Idx: 0);
    DstTy = NewTy;
  }

  if (MaskTy.isScalar()) {
    // Turn the scalar condition into a vector condition mask if needed.

    Register MaskElt = MaskReg;

    // The condition was potentially zero extended before, but we want a sign
    // extended boolean.
    if (MaskTy != LLT::scalar(SizeInBits: 1))
      MaskElt = MIRBuilder.buildSExtInReg(Res: MaskTy, Op: MaskElt, ImmOp: 1).getReg(Idx: 0);

    // Continue the sign extension (or truncate) to match the data type.
    MaskElt =
        MIRBuilder.buildSExtOrTrunc(Res: DstTy.getScalarType(), Op: MaskElt).getReg(Idx: 0);

    if (DstTy.isVector()) {
      // Generate a vector splat idiom.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: DstTy, Src: MaskElt);
      MaskReg = ShufSplat.getReg(Idx: 0);
    } else {
      MaskReg = MaskElt;
    }
    MaskTy = DstTy;
  } else if (!DstTy.isVector()) {
    // Cannot handle the case that mask is a vector and dst is a scalar.
    return UnableToLegalize;
  }

  // The AND/OR expansion only works when mask and data occupy the same bits.
  if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
    return UnableToLegalize;
  }

  auto NotMask = MIRBuilder.buildNot(Dst: MaskTy, Src0: MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op1Reg, Src1: MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op2Reg, Src1: NotMask);
  if (IsEltPtr) {
    auto Or = MIRBuilder.buildOr(Dst: DstTy, Src0: NewOp1, Src1: NewOp2);
    MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
  } else {
    MIRBuilder.buildOr(Dst: DstReg, Src0: NewOp1, Src1: NewOp2);
  }
  MI.eraseFromParent();
  return Legalized;
}
10220
10221LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
10222 // Split DIVREM into individual instructions.
10223 unsigned Opcode = MI.getOpcode();
10224
10225 MIRBuilder.buildInstr(
10226 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
10227 : TargetOpcode::G_UDIV,
10228 DstOps: {MI.getOperand(i: 0).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
10229 MIRBuilder.buildInstr(
10230 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
10231 : TargetOpcode::G_UREM,
10232 DstOps: {MI.getOperand(i: 1).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
10233 MI.eraseFromParent();
10234 return Legalized;
10235}
10236
10237LegalizerHelper::LegalizeResult
10238LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
10239 // Expand %res = G_ABS %a into:
10240 // %v1 = G_ASHR %a, scalar_size-1
10241 // %v2 = G_ADD %a, %v1
10242 // %res = G_XOR %v2, %v1
10243 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
10244 Register OpReg = MI.getOperand(i: 1).getReg();
10245 auto ShiftAmt =
10246 MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - 1);
10247 auto Shift = MIRBuilder.buildAShr(Dst: DstTy, Src0: OpReg, Src1: ShiftAmt);
10248 auto Add = MIRBuilder.buildAdd(Dst: DstTy, Src0: OpReg, Src1: Shift);
10249 MIRBuilder.buildXor(Dst: MI.getOperand(i: 0).getReg(), Src0: Add, Src1: Shift);
10250 MI.eraseFromParent();
10251 return Legalized;
10252}
10253
10254LegalizerHelper::LegalizeResult
10255LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
10256 // Expand %res = G_ABS %a into:
10257 // %v1 = G_CONSTANT 0
10258 // %v2 = G_SUB %v1, %a
10259 // %res = G_SMAX %a, %v2
10260 Register SrcReg = MI.getOperand(i: 1).getReg();
10261 LLT Ty = MRI.getType(Reg: SrcReg);
10262 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
10263 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg);
10264 MIRBuilder.buildSMax(Dst: MI.getOperand(i: 0), Src0: SrcReg, Src1: Sub);
10265 MI.eraseFromParent();
10266 return Legalized;
10267}
10268
10269LegalizerHelper::LegalizeResult
10270LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
10271 Register SrcReg = MI.getOperand(i: 1).getReg();
10272 Register DestReg = MI.getOperand(i: 0).getReg();
10273 LLT Ty = MRI.getType(Reg: SrcReg), IType = LLT::scalar(SizeInBits: 1);
10274 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
10275 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg).getReg(Idx: 0);
10276 auto ICmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: IType, Op0: SrcReg, Op1: Zero);
10277 MIRBuilder.buildSelect(Res: DestReg, Tst: ICmp, Op0: SrcReg, Op1: Sub);
10278 MI.eraseFromParent();
10279 return Legalized;
10280}
10281
10282LegalizerHelper::LegalizeResult
10283LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) {
10284 assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
10285 MI.getOpcode() == TargetOpcode::G_ABDU) &&
10286 "Expected G_ABDS or G_ABDU instruction");
10287
10288 auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
10289 LLT Ty = MRI.getType(Reg: LHS);
10290
10291 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
10292 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
10293 Register LHSSub = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10294 Register RHSSub = MIRBuilder.buildSub(Dst: Ty, Src0: RHS, Src1: LHS).getReg(Idx: 0);
10295 CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS)
10296 ? CmpInst::ICMP_SGT
10297 : CmpInst::ICMP_UGT;
10298 auto ICmp = MIRBuilder.buildICmp(Pred, Res: LLT::scalar(SizeInBits: 1), Op0: LHS, Op1: RHS);
10299 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LHSSub, Op1: RHSSub);
10300
10301 MI.eraseFromParent();
10302 return Legalized;
10303}
10304
10305LegalizerHelper::LegalizeResult
10306LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) {
10307 assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
10308 MI.getOpcode() == TargetOpcode::G_ABDU) &&
10309 "Expected G_ABDS or G_ABDU instruction");
10310
10311 auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
10312 LLT Ty = MRI.getType(Reg: LHS);
10313
10314 // abds(lhs, rhs) -→ sub(smax(lhs, rhs), smin(lhs, rhs))
10315 // abdu(lhs, rhs) -→ sub(umax(lhs, rhs), umin(lhs, rhs))
10316 Register MaxReg, MinReg;
10317 if (MI.getOpcode() == TargetOpcode::G_ABDS) {
10318 MaxReg = MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10319 MinReg = MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10320 } else {
10321 MaxReg = MIRBuilder.buildUMax(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10322 MinReg = MIRBuilder.buildUMin(Dst: Ty, Src0: LHS, Src1: RHS).getReg(Idx: 0);
10323 }
10324 MIRBuilder.buildSub(Dst: DstReg, Src0: MaxReg, Src1: MinReg);
10325
10326 MI.eraseFromParent();
10327 return Legalized;
10328}
10329
10330LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
10331 Register SrcReg = MI.getOperand(i: 1).getReg();
10332 Register DstReg = MI.getOperand(i: 0).getReg();
10333
10334 LLT Ty = MRI.getType(Reg: DstReg);
10335
10336 // Reset sign bit
10337 MIRBuilder.buildAnd(
10338 Dst: DstReg, Src0: SrcReg,
10339 Src1: MIRBuilder.buildConstant(
10340 Res: Ty, Val: APInt::getSignedMaxValue(numBits: Ty.getScalarSizeInBits())));
10341
10342 MI.eraseFromParent();
10343 return Legalized;
10344}
10345
10346LegalizerHelper::LegalizeResult
10347LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
10348 Register SrcReg = MI.getOperand(i: 1).getReg();
10349 LLT SrcTy = MRI.getType(Reg: SrcReg);
10350 LLT DstTy = MRI.getType(Reg: SrcReg);
10351
10352 // The source could be a scalar if the IR type was <1 x sN>.
10353 if (SrcTy.isScalar()) {
10354 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
10355 return UnableToLegalize; // FIXME: handle extension.
10356 // This can be just a plain copy.
10357 Observer.changingInstr(MI);
10358 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::COPY));
10359 Observer.changedInstr(MI);
10360 return Legalized;
10361 }
10362 return UnableToLegalize;
10363}
10364
// Lower G_VAARG into explicit pointer arithmetic: load the current argument
// pointer from the va_list, realign it if the argument requires more than the
// minimum stack-argument alignment, store the bumped pointer back, and finally
// load the argument value itself.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  // Operand 1 is the va_list pointer; operand 2 (used below) is the
  // ABI alignment immediate attached to the G_VAARG.
  Register ListPtr = MI.getOperand(i: 1).getReg();
  LLT PtrTy = MRI.getType(Reg: ListPtr);

  // ListPtr is a pointer to the head of the list. Get the address
  // of the head of the list.
  Align PtrAlignment = DL.getABITypeAlign(Ty: getTypeForLLT(Ty: PtrTy, C&: Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: PtrTy, base_alignment: PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(Res: PtrTy, Addr: ListPtr, MMO&: *PtrLoadMMO).getReg(Idx: 0);

  const Align A(MI.getOperand(i: 2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
  // If the argument needs more alignment than the stack guarantees, round the
  // list pointer up: VAList = (VAList + A - 1) & ~(A - 1).
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: A.value() - 1).getReg(Idx: 0);
    auto AddDst = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(Res: PtrTy, Op0: AddDst, NumBits: Log2(A));
    VAList = AndDst.getReg(Idx: 0);
  }

  // Increment the pointer, VAList, to the next vaarg
  // The list should be bumped by the size of element in the current head of
  // list.
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT LLTTy = MRI.getType(Reg: Dst);
  Type *Ty = getTypeForLLT(Ty: LLTTy, C&: Ctx);
  auto IncAmt =
      MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: DL.getTypeAllocSize(Ty));
  auto Succ = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: IncAmt);

  // Store the increment VAList to the legalized pointer
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOStore, MemTy: PtrTy, base_alignment: PtrAlignment);
  MIRBuilder.buildStore(Val: Succ, Addr: ListPtr, MMO&: *StoreMMO);
  // Load the actual argument out of the pointer VAList
  Align EltAlignment = DL.getABITypeAlign(Ty);
  MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: LLTTy, base_alignment: EltAlignment);
  MIRBuilder.buildLoad(Res: Dst, Addr: VAList, MMO&: *EltLoadMMO);

  MI.eraseFromParent();
  return Legalized;
}
10412
10413static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
10414 // On Darwin, -Os means optimize for size without hurting performance, so
10415 // only really optimize for size when -Oz (MinSize) is used.
10416 if (MF.getTarget().getTargetTriple().isOSDarwin())
10417 return MF.getFunction().hasMinSize();
10418 return MF.getFunction().hasOptSize();
10419}
10420
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering. Returns false if no
// legal decomposition exists within Limit operations (or the alignment
// constraints cannot be met).
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  // A fixed destination alignment stricter than the source alignment cannot
  // be honored by this decomposition.
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  // Let the target pick its preferred access type first.
  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(SizeInBits: 64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, AddrSpace: DstAS, Alignment: Op.getDstAlign()))
        Ty = LLT::scalar(SizeInBits: Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  // Greedily cover the remaining size, shrinking the access type whenever it
  // no longer fits in the bytes that are left.
  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
      // bit_floor(bits - 1) yields the next smaller power of two, halving a
      // power-of-two type each iteration.
      NewTy = LLT::scalar(SizeInBits: llvm::bit_floor(Value: NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      unsigned Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, AddrSpace: DstAS, Alignment: Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              Flags: MachineMemOperand::MONone, &Fast) &&
          Fast)
        // Keep the wide type: the final access overlaps the previous one.
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    // Bail out once the decomposition exceeds the target's per-op budget.
    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(x: Ty);
    Size -= TySize;
  }

  return true;
}
10487
10488// Get a vectorized representation of the memset value operand, GISel edition.
10489static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
10490 MachineRegisterInfo &MRI = *MIB.getMRI();
10491 unsigned NumBits = Ty.getScalarSizeInBits();
10492 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
10493 if (!Ty.isVector() && ValVRegAndVal) {
10494 APInt Scalar = ValVRegAndVal->Value.trunc(width: 8);
10495 APInt SplatVal = APInt::getSplat(NewLen: NumBits, V: Scalar);
10496 return MIB.buildConstant(Res: Ty, Val: SplatVal).getReg(Idx: 0);
10497 }
10498
10499 // Extend the byte value to the larger type, and then multiply by a magic
10500 // value 0x010101... in order to replicate it across every byte.
10501 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
10502 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
10503 return MIB.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
10504 }
10505
10506 LLT ExtType = Ty.getScalarType();
10507 auto ZExt = MIB.buildZExtOrTrunc(Res: ExtType, Op: Val);
10508 if (NumBits > 8) {
10509 APInt Magic = APInt::getSplat(NewLen: NumBits, V: APInt(8, 0x01));
10510 auto MagicMI = MIB.buildConstant(Res: ExtType, Val: Magic);
10511 Val = MIB.buildMul(Dst: ExtType, Src0: ZExt, Src1: MagicMI).getReg(Idx: 0);
10512 }
10513
10514 // For vector types create a G_BUILD_VECTOR.
10515 if (Ty.isVector())
10516 Val = MIB.buildSplatBuildVector(Res: Ty, Src: Val).getReg(Idx: 0);
10517
10518 return Val;
10519}
10520
// Lower a G_MEMSET with a known constant length into a sequence of stores of
// a replicated byte pattern, using the store types chosen by
// findGISelOptimalMemOpLowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  // A destination rooted at a non-fixed frame index can have its stack object
  // realigned, which lets us pick wider store types below.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  // Ask the shared helper for the list of store types; give up if it cannot
  // cover the length within the target's store budget.
  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     Op: MemOp::Set(Size: KnownLen, DstAlignCanChange,
                                                DstAlign: Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstAS: DstPtrInfo.getAddrSpace(), SrcAS: ~0u,
                                     FuncAttributes: MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(i: 1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
        MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, Ty: LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Reg: Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(Ty: LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(FromVT: LargestVT, ToVT: VT))
        Value = MIB.buildTrunc(Res: Ty, Op: MemSetValue).getReg(Idx: 0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    // Derive an MMO at the running offset from the original destination MMO.
    auto *StoreMMO = MF.getMachineMemOperand(MMO: &DstMMO, Offset: DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: DstOff);
      Ptr = MIB.buildObjectPtrOffset(Res: PtrTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
    }

    MIB.buildStore(Val: Value, Addr: Ptr, MMO&: *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
10633
10634LegalizerHelper::LegalizeResult
10635LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
10636 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
10637
10638 auto [Dst, Src, Len] = MI.getFirst3Regs();
10639
10640 const auto *MMOIt = MI.memoperands_begin();
10641 const MachineMemOperand *MemOp = *MMOIt;
10642 bool IsVolatile = MemOp->isVolatile();
10643
10644 // See if this is a constant length copy
10645 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
10646 // FIXME: support dynamically sized G_MEMCPY_INLINE
10647 assert(LenVRegAndVal &&
10648 "inline memcpy with dynamic size is not yet supported");
10649 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
10650 if (KnownLen == 0) {
10651 MI.eraseFromParent();
10652 return Legalized;
10653 }
10654
10655 const auto &DstMMO = **MI.memoperands_begin();
10656 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
10657 Align DstAlign = DstMMO.getBaseAlign();
10658 Align SrcAlign = SrcMMO.getBaseAlign();
10659
10660 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
10661 IsVolatile);
10662}
10663
10664LegalizerHelper::LegalizeResult
10665LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
10666 uint64_t KnownLen, Align DstAlign,
10667 Align SrcAlign, bool IsVolatile) {
10668 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
10669 return lowerMemcpy(MI, Dst, Src, KnownLen,
10670 Limit: std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
10671 IsVolatile);
10672}
10673
// Lower a constant-length G_MEMCPY (or inline variant) into a sequence of
// load/store pairs, with types chosen by findGISelOptimalMemOpLowering.
// Limit caps the number of memory operations; returns UnableToLegalize if
// the copy cannot be decomposed within that budget.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  // A destination rooted at a non-fixed frame index can have its stack object
  // realigned, which lets us pick wider access types below.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(a: DstAlign, b: SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  // First MMO is the destination access, second is the source access.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
                      IsVolatile),
          DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
          FuncAttributes: MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(Ty: IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(a: NewAlign, b: *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(i: 1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
        MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores
  // mixed types, depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Reg: Src);
      Offset = MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset)
                   .getReg(Idx: 0);
      LoadPtr = MIB.buildObjectPtrOffset(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
    }
    auto LdVal = MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO);

    // Create the store. The offset constant is reused from the load above.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Reg: Dst);
      StorePtr = MIB.buildObjectPtrOffset(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
    }
    MIB.buildStore(Val: LdVal, Addr: StorePtr, MMO&: *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
10780
// Lower a constant-length G_MEMMOVE into loads followed by stores. Unlike
// memcpy, all loads are emitted before any store so the expansion is correct
// even when the source and destination ranges overlap.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  // A destination rooted at a non-fixed frame index can have its stack object
  // realigned, which lets us pick wider access types below.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = std::min(a: DstAlign, b: SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  // First MMO is the destination access, second is the source access.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in it's findOptimalMemOpLowering implementation. For now do the
  // same thing here. (Passing IsVolatile=true disables overlapping accesses.)
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
          FuncAttributes: MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(Ty: IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(a: NewAlign, b: *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(i: 1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
        MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the stores.
  // Apart from that, this loop is pretty much doing the same thing as the
  // memcpy codegen function.
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Reg: Src);
      auto Offset =
          MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset);
      LoadPtr = MIB.buildObjectPtrOffset(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
    }
    LoadVals.push_back(Elt: MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO).getReg(Idx: 0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  // Second pass: store all loaded values at the same running offsets.
  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Reg: Dst);
      auto Offset =
          MIB.buildConstant(Res: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), Val: CurrOffset);
      StorePtr = MIB.buildObjectPtrOffset(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
    }
    MIB.buildStore(Val: LoadVals[I], Addr: StorePtr, MMO&: *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}
10886
10887LegalizerHelper::LegalizeResult
10888LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
10889 const unsigned Opc = MI.getOpcode();
10890 // This combine is fairly complex so it's not written with a separate
10891 // matcher function.
10892 assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
10893 Opc == TargetOpcode::G_MEMSET) &&
10894 "Expected memcpy like instruction");
10895
10896 auto MMOIt = MI.memoperands_begin();
10897 const MachineMemOperand *MemOp = *MMOIt;
10898
10899 Align DstAlign = MemOp->getBaseAlign();
10900 Align SrcAlign;
10901 auto [Dst, Src, Len] = MI.getFirst3Regs();
10902
10903 if (Opc != TargetOpcode::G_MEMSET) {
10904 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
10905 MemOp = *(++MMOIt);
10906 SrcAlign = MemOp->getBaseAlign();
10907 }
10908
10909 // See if this is a constant length copy
10910 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
10911 if (!LenVRegAndVal)
10912 return UnableToLegalize;
10913 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
10914
10915 if (KnownLen == 0) {
10916 MI.eraseFromParent();
10917 return Legalized;
10918 }
10919
10920 if (MaxLen && KnownLen > MaxLen)
10921 return UnableToLegalize;
10922
10923 bool IsVolatile = MemOp->isVolatile();
10924 if (Opc == TargetOpcode::G_MEMCPY) {
10925 auto &MF = *MI.getParent()->getParent();
10926 const auto &TLI = *MF.getSubtarget().getTargetLowering();
10927 bool OptSize = shouldLowerMemFuncForSize(MF);
10928 uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
10929 return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
10930 IsVolatile);
10931 }
10932 if (Opc == TargetOpcode::G_MEMMOVE)
10933 return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
10934 if (Opc == TargetOpcode::G_MEMSET)
10935 return lowerMemset(MI, Dst, Val: Src, KnownLen, Alignment: DstAlign, IsVolatile);
10936 return UnableToLegalize;
10937}
10938