1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
16#include "X86MachineFunctionInfo.h"
17#include "X86RegisterInfo.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/Statistic.h"
21#include "llvm/CodeGen/MachineModuleInfo.h"
22#include "llvm/CodeGen/SelectionDAGISel.h"
23#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/ConstantRange.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/Instructions.h"
27#include "llvm/IR/Intrinsics.h"
28#include "llvm/IR/IntrinsicsX86.h"
29#include "llvm/IR/Module.h"
30#include "llvm/IR/Type.h"
31#include "llvm/Support/Debug.h"
32#include "llvm/Support/ErrorHandling.h"
33#include "llvm/Support/KnownBits.h"
34#include "llvm/Support/MathExtras.h"
35#include <cstdint>
36
37using namespace llvm;
38
39#define DEBUG_TYPE "x86-isel"
40#define PASS_NAME "X86 DAG->DAG Instruction Selection"
41
// Statistic counter NumLoadMoved; its description string reads "Number of
// loads moved below TokenFactor".
42STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43
// Hidden boolean command-line option "x86-and-imm-shrink", initialised to
// true. Per its description it enables setting constant bits to reduce the
// size of mask immediates.
44static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(Val: true),
45 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
46 cl::Hidden);
47
// Hidden boolean command-line option "x86-promote-anyext-load", initialised to
// true. Per its description it enables promoting an aligned anyext load to a
// wider load.
48static cl::opt<bool> EnablePromoteAnyextLoad(
49 "x86-promote-anyext-load", cl::init(Val: true),
50 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
51
// Only declared here; the definition is not part of this file.
52extern cl::opt<bool> IndirectBranchTracking;
53
54//===----------------------------------------------------------------------===//
55// Pattern Matcher Implementation
56//===----------------------------------------------------------------------===//
57
58namespace {
59 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
60 /// numbers for the leaves of the matched tree.
61 struct X86ISelAddressMode {
62 enum {
63 RegBase,
64 FrameIndexBase
65 } BaseType = RegBase;
66
67 // This is really a union, discriminated by BaseType!
68 SDValue Base_Reg;
69 int Base_FrameIndex = 0;
70
71 unsigned Scale = 1;
72 SDValue IndexReg;
73 int32_t Disp = 0;
74 SDValue Segment;
75 const GlobalValue *GV = nullptr;
76 const Constant *CP = nullptr;
77 const BlockAddress *BlockAddr = nullptr;
78 const char *ES = nullptr;
79 MCSymbol *MCSym = nullptr;
80 int JT = -1;
81 Align Alignment; // CP alignment.
82 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
83 bool NegateIndex = false;
84
85 X86ISelAddressMode() = default;
86
87 bool hasSymbolicDisplacement() const {
88 return GV != nullptr || CP != nullptr || ES != nullptr ||
89 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
90 }
91
92 bool hasBaseOrIndexReg() const {
93 return BaseType == FrameIndexBase ||
94 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
95 }
96
97 /// Return true if this addressing mode is already RIP-relative.
98 bool isRIPRelative() const {
99 if (BaseType != RegBase) return false;
100 if (RegisterSDNode *RegNode =
101 dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
102 return RegNode->getReg() == X86::RIP;
103 return false;
104 }
105
106 void setBaseReg(SDValue Reg) {
107 BaseType = RegBase;
108 Base_Reg = Reg;
109 }
110
111#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
112 void dump(SelectionDAG *DAG = nullptr) {
113 dbgs() << "X86ISelAddressMode " << this << '\n';
114 dbgs() << "Base_Reg ";
115 if (Base_Reg.getNode())
116 Base_Reg.getNode()->dump(DAG);
117 else
118 dbgs() << "nul\n";
119 if (BaseType == FrameIndexBase)
120 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
121 dbgs() << " Scale " << Scale << '\n'
122 << "IndexReg ";
123 if (NegateIndex)
124 dbgs() << "negate ";
125 if (IndexReg.getNode())
126 IndexReg.getNode()->dump(DAG);
127 else
128 dbgs() << "nul\n";
129 dbgs() << " Disp " << Disp << '\n'
130 << "GV ";
131 if (GV)
132 GV->dump();
133 else
134 dbgs() << "nul";
135 dbgs() << " CP ";
136 if (CP)
137 CP->dump();
138 else
139 dbgs() << "nul";
140 dbgs() << '\n'
141 << "ES ";
142 if (ES)
143 dbgs() << ES;
144 else
145 dbgs() << "nul";
146 dbgs() << " MCSym ";
147 if (MCSym)
148 dbgs() << MCSym;
149 else
150 dbgs() << "nul";
151 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
152 }
153#endif
154 };
155}
156
157namespace {
158 //===--------------------------------------------------------------------===//
159 /// ISel - X86-specific code to select X86 machine instructions for
160 /// SelectionDAG operations.
161 ///
162 class X86DAGToDAGISel final : public SelectionDAGISel {
163 /// Keep a pointer to the X86Subtarget around so that we can
164 /// make the right decision when generating code for different targets.
165 const X86Subtarget *Subtarget;
166
167 /// If true, selector should try to optimize for minimum code size.
168 bool OptForMinSize;
169
170 /// Disable direct TLS access through segment registers.
171 bool IndirectTlsSegRefs;
172
173 public:
174 X86DAGToDAGISel() = delete;
175
176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180 bool runOnMachineFunction(MachineFunction &MF) override {
181 // Reset the subtarget each time through.
182 Subtarget = &MF.getSubtarget<X86Subtarget>();
183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184 Kind: "indirect-tls-seg-refs");
185
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize = MF.getFunction().hasMinSize();
188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
190 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
191 }
192
193 void emitFunctionEntryCode() override;
194
195 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
196
197 void PreprocessISelDAG() override;
198 void PostprocessISelDAG() override;
199
200// Include the pieces autogenerated from the target description.
201#include "X86GenDAGISel.inc"
202
203 private:
204 void Select(SDNode *N) override;
205
206 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
207 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
208 bool AllowSegmentRegForX32 = false);
209 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
210 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
211 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
212 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
213 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
214 unsigned Depth);
215 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
216 unsigned Depth);
217 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221 SDValue &Scale, SDValue &Index, SDValue &Disp,
222 SDValue &Segment);
223 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
224 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
225 SDValue &Index, SDValue &Disp, SDValue &Segment);
226 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227 bool selectLEAAddr(SDValue N, SDValue &Base,
228 SDValue &Scale, SDValue &Index, SDValue &Disp,
229 SDValue &Segment);
230 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231 SDValue &Scale, SDValue &Index, SDValue &Disp,
232 SDValue &Segment);
233 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234 SDValue &Scale, SDValue &Index, SDValue &Disp,
235 SDValue &Segment);
236 bool selectRelocImm(SDValue N, SDValue &Op);
237
238 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
239 SDValue &Base, SDValue &Scale,
240 SDValue &Index, SDValue &Disp,
241 SDValue &Segment);
242
243 // Convenience method where P is also root.
244 bool tryFoldLoad(SDNode *P, SDValue N,
245 SDValue &Base, SDValue &Scale,
246 SDValue &Index, SDValue &Disp,
247 SDValue &Segment) {
248 return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment);
249 }
250
251 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
252 SDValue &Base, SDValue &Scale,
253 SDValue &Index, SDValue &Disp,
254 SDValue &Segment);
255
256 bool isProfitableToFormMaskedOp(SDNode *N) const;
257
258 /// Implement addressing mode selection for inline asm expressions.
259 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
260 InlineAsm::ConstraintCode ConstraintID,
261 std::vector<SDValue> &OutOps) override;
262
263 void emitSpecialCodeForMain();
264
265 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
266 MVT VT, SDValue &Base, SDValue &Scale,
267 SDValue &Index, SDValue &Disp,
268 SDValue &Segment) {
269 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
270 Base = CurDAG->getTargetFrameIndex(
271 FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout()));
272 else if (AM.Base_Reg.getNode())
273 Base = AM.Base_Reg;
274 else
275 Base = CurDAG->getRegister(Reg: 0, VT);
276
277 Scale = getI8Imm(Imm: AM.Scale, DL);
278
279#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
280 // Negate the index if needed.
281 if (AM.NegateIndex) {
282 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
283 : GET_ND_IF_ENABLED(X86::NEG32r);
284 SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32,
285 Ops: AM.IndexReg), 0);
286 AM.IndexReg = Neg;
287 }
288
289 if (AM.IndexReg.getNode())
290 Index = AM.IndexReg;
291 else
292 Index = CurDAG->getRegister(Reg: 0, VT);
293
294 // These are 32-bit even in 64-bit mode since RIP-relative offset
295 // is 32-bit.
296 if (AM.GV)
297 Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(),
298 VT: MVT::i32, offset: AM.Disp,
299 TargetFlags: AM.SymbolFlags);
300 else if (AM.CP)
301 Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment,
302 Offset: AM.Disp, TargetFlags: AM.SymbolFlags);
303 else if (AM.ES) {
304 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
305 Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
306 } else if (AM.MCSym) {
307 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
308 assert(AM.SymbolFlags == 0 && "oo");
309 Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32);
310 } else if (AM.JT != -1) {
311 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
312 Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
313 } else if (AM.BlockAddr)
314 Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp,
315 TargetFlags: AM.SymbolFlags);
316 else
317 Disp = CurDAG->getTargetConstant(Val: AM.Disp, DL, VT: MVT::i32);
318
319 if (AM.Segment.getNode())
320 Segment = AM.Segment;
321 else
322 Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
323 }
324
325 // Utility function to determine whether we should avoid selecting
326 // immediate forms of instructions for better code size or not.
327 // At a high level, we'd like to avoid such instructions when
328 // we have similar constants used within the same basic block
329 // that can be kept in a register.
330 //
331 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
332 uint32_t UseCount = 0;
333
334 // Do not want to hoist if we're not optimizing for size.
335 // TODO: We'd like to remove this restriction.
336 // See the comment in X86InstrInfo.td for more info.
337 if (!CurDAG->shouldOptForSize())
338 return false;
339
340 // Walk all the users of the immediate.
341 for (const SDNode *User : N->uses()) {
342 if (UseCount >= 2)
343 break;
344
345 // This user is already selected. Count it as a legitimate use and
346 // move on.
347 if (User->isMachineOpcode()) {
348 UseCount++;
349 continue;
350 }
351
352 // We want to count stores of immediates as real uses.
353 if (User->getOpcode() == ISD::STORE &&
354 User->getOperand(Num: 1).getNode() == N) {
355 UseCount++;
356 continue;
357 }
358
359 // We don't currently match users that have > 2 operands (except
360 // for stores, which are handled above)
361 // Those instruction won't match in ISEL, for now, and would
362 // be counted incorrectly.
363 // This may change in the future as we add additional instruction
364 // types.
365 if (User->getNumOperands() != 2)
366 continue;
367
368 // If this is a sign-extended 8-bit integer immediate used in an ALU
369 // instruction, there is probably an opcode encoding to save space.
370 auto *C = dyn_cast<ConstantSDNode>(Val: N);
371 if (C && isInt<8>(x: C->getSExtValue()))
372 continue;
373
374 // Immediates that are used for offsets as part of stack
375 // manipulation should be left alone. These are typically
376 // used to indicate SP offsets for argument passing and
377 // will get pulled into stores/pushes (implicitly).
378 if (User->getOpcode() == X86ISD::ADD ||
379 User->getOpcode() == ISD::ADD ||
380 User->getOpcode() == X86ISD::SUB ||
381 User->getOpcode() == ISD::SUB) {
382
383 // Find the other operand of the add/sub.
384 SDValue OtherOp = User->getOperand(Num: 0);
385 if (OtherOp.getNode() == N)
386 OtherOp = User->getOperand(Num: 1);
387
388 // Don't count if the other operand is SP.
389 RegisterSDNode *RegNode;
390 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
391 (RegNode = dyn_cast_or_null<RegisterSDNode>(
392 Val: OtherOp->getOperand(Num: 1).getNode())))
393 if ((RegNode->getReg() == X86::ESP) ||
394 (RegNode->getReg() == X86::RSP))
395 continue;
396 }
397
398 // ... otherwise, count this and move on.
399 UseCount++;
400 }
401
402 // If we have more than 1 use, then recommend for hoisting.
403 return (UseCount > 1);
404 }
405
406 /// Return a target constant with the specified value of type i8.
407 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
408 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
409 }
410
411 /// Return a target constant with the specified value, of type i32.
412 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
413 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32);
414 }
415
416 /// Return a target constant with the specified value, of type i64.
417 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
418 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64);
419 }
420
421 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
422 const SDLoc &DL) {
423 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
424 uint64_t Index = N->getConstantOperandVal(Num: 1);
425 MVT VecVT = N->getOperand(Num: 0).getSimpleValueType();
426 return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
427 }
428
429 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
430 const SDLoc &DL) {
431 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
432 uint64_t Index = N->getConstantOperandVal(Num: 2);
433 MVT VecVT = N->getSimpleValueType(ResNo: 0);
434 return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
435 }
436
437 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
438 const SDLoc &DL) {
439 assert(VecWidth == 128 && "Unexpected vector width");
440 uint64_t Index = N->getConstantOperandVal(Num: 2);
441 MVT VecVT = N->getSimpleValueType(ResNo: 0);
442 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
443 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
444 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
445 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
446 return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL);
447 }
448
449 SDValue getSBBZero(SDNode *N) {
450 SDLoc dl(N);
451 MVT VT = N->getSimpleValueType(ResNo: 0);
452
453 // Create zero.
454 SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
455 SDValue Zero = SDValue(
456 CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: std::nullopt), 0);
457 if (VT == MVT::i64) {
458 Zero = SDValue(
459 CurDAG->getMachineNode(
460 Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64,
461 Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero,
462 Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
463 0);
464 }
465
466 // Copy flags to the EFLAGS register and glue it to next node.
467 unsigned Opcode = N->getOpcode();
468 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
469 "Unexpected opcode for SBB materialization");
470 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
471 SDValue EFLAGS =
472 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
473 N: N->getOperand(Num: FlagOpIndex), Glue: SDValue());
474
475 // Create a 64-bit instruction if the result is 64-bits otherwise use the
476 // 32-bit version.
477 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
478 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
479 VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32);
480 return SDValue(
481 CurDAG->getMachineNode(Opcode: Opc, dl, VTs,
482 Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}),
483 0);
484 }
485
486 // Helper to detect unneeded and instructions on shift amounts. Called
487 // from PatFrags in tablegen.
488 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
489 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
490 const APInt &Val = N->getConstantOperandAPInt(Num: 1);
491
492 if (Val.countr_one() >= Width)
493 return true;
494
495 APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
496 return Mask.countr_one() >= Width;
497 }
498
499 /// Return an SDNode that returns the value of the global base register.
500 /// Output instructions required to initialize the global base register,
501 /// if necessary.
502 SDNode *getGlobalBaseReg();
503
504 /// Return a reference to the TargetMachine, casted to the target-specific
505 /// type.
506 const X86TargetMachine &getTargetMachine() const {
507 return static_cast<const X86TargetMachine &>(TM);
508 }
509
510 /// Return a reference to the TargetInstrInfo, casted to the target-specific
511 /// type.
512 const X86InstrInfo *getInstrInfo() const {
513 return Subtarget->getInstrInfo();
514 }
515
516 /// Return a condition code of the given SDNode
517 X86::CondCode getCondFromNode(SDNode *N) const;
518
519 /// Address-mode matching performs shift-of-and to and-of-shift
520 /// reassociation in order to expose more scaled addressing
521 /// opportunities.
522 bool ComplexPatternFuncMutatesDAG() const override {
523 return true;
524 }
525
526 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
527
528 // Indicates we should prefer to use a non-temporal load for this load.
529 bool useNonTemporalLoad(LoadSDNode *N) const {
530 if (!N->isNonTemporal())
531 return false;
532
533 unsigned StoreSize = N->getMemoryVT().getStoreSize();
534
535 if (N->getAlign().value() < StoreSize)
536 return false;
537
538 switch (StoreSize) {
539 default: llvm_unreachable("Unsupported store size");
540 case 4:
541 case 8:
542 return false;
543 case 16:
544 return Subtarget->hasSSE41();
545 case 32:
546 return Subtarget->hasAVX2();
547 case 64:
548 return Subtarget->hasAVX512();
549 }
550 }
551
552 bool foldLoadStoreIntoMemOperand(SDNode *Node);
553 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
554 bool matchBitExtract(SDNode *Node);
555 bool shrinkAndImmediate(SDNode *N);
556 bool isMaskZeroExtended(SDNode *N) const;
557 bool tryShiftAmountMod(SDNode *N);
558 bool tryShrinkShlLogicImm(SDNode *N);
559 bool tryVPTERNLOG(SDNode *N);
560 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
561 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
562 uint8_t Imm);
563 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
564 bool tryMatchBitSelect(SDNode *N);
565
566 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
567 const SDLoc &dl, MVT VT, SDNode *Node);
568 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569 const SDLoc &dl, MVT VT, SDNode *Node,
570 SDValue &InGlue);
571
572 bool tryOptimizeRem8Extend(SDNode *N);
573
574 bool onlyUsesZeroFlag(SDValue Flags) const;
575 bool hasNoSignFlagUses(SDValue Flags) const;
576 bool hasNoCarryFlagUses(SDValue Flags) const;
577 };
578
579 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
580 public:
581 static char ID;
582 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
583 CodeGenOptLevel OptLevel)
584 : SelectionDAGISelLegacy(
585 ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {}
586 };
587}
588
// Out-of-class definition of the static ID member of the legacy pass wrapper;
// the wrapper's constructor passes it to SelectionDAGISelLegacy.
589char X86DAGToDAGISelLegacy::ID = 0;
590
// Registers the legacy pass under DEBUG_TYPE ("x86-isel") with PASS_NAME as
// its description.
591INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
592
593// Returns true if this masked compare can be implemented legally with this
594// type.
595static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
596 unsigned Opcode = N->getOpcode();
597 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
598 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
599 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
600 // We can get 256-bit 8 element types here without VLX being enabled. When
601 // this happens we will use 512-bit operations and the mask will not be
602 // zero extended.
603 EVT OpVT = N->getOperand(Num: 0).getValueType();
604 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
605 // second operand.
606 if (Opcode == X86ISD::STRICT_CMPM)
607 OpVT = N->getOperand(Num: 1).getValueType();
608 if (OpVT.is256BitVector() || OpVT.is128BitVector())
609 return Subtarget->hasVLX();
610
611 return true;
612 }
613 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
614 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
615 Opcode == X86ISD::FSETCCM_SAE)
616 return true;
617
618 return false;
619}
620
621// Returns true if we can assume the writer of the mask has zero extended it
622// for us.
623bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
624 // If this is an AND, check if we have a compare on either side. As long as
625 // one side guarantees the mask is zero extended, the AND will preserve those
626 // zeros.
627 if (N->getOpcode() == ISD::AND)
628 return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) ||
629 isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget);
630
631 return isLegalMaskCompare(N, Subtarget);
632}
633
634bool
635X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
636 if (OptLevel == CodeGenOptLevel::None)
637 return false;
638
639 if (!N.hasOneUse())
640 return false;
641
642 if (N.getOpcode() != ISD::LOAD)
643 return true;
644
645 // Don't fold non-temporal loads if we have an instruction for them.
646 if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N)))
647 return false;
648
649 // If N is a load, do additional profitability checks.
650 if (U == Root) {
651 switch (U->getOpcode()) {
652 default: break;
653 case X86ISD::ADD:
654 case X86ISD::ADC:
655 case X86ISD::SUB:
656 case X86ISD::SBB:
657 case X86ISD::AND:
658 case X86ISD::XOR:
659 case X86ISD::OR:
660 case ISD::ADD:
661 case ISD::UADDO_CARRY:
662 case ISD::AND:
663 case ISD::OR:
664 case ISD::XOR: {
665 SDValue Op1 = U->getOperand(Num: 1);
666
667 // If the other operand is a 8-bit immediate we should fold the immediate
668 // instead. This reduces code size.
669 // e.g.
670 // movl 4(%esp), %eax
671 // addl $4, %eax
672 // vs.
673 // movl $4, %eax
674 // addl 4(%esp), %eax
675 // The former is 2 bytes shorter. In case where the increment is 1, then
676 // the saving can be 4 bytes (by using incl %eax).
677 if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) {
678 if (Imm->getAPIntValue().isSignedIntN(N: 8))
679 return false;
680
681 // If this is a 64-bit AND with an immediate that fits in 32-bits,
682 // prefer using the smaller and over folding the load. This is needed to
683 // make sure immediates created by shrinkAndImmediate are always folded.
684 // Ideally we would narrow the load during DAG combine and get the
685 // best of both worlds.
686 if (U->getOpcode() == ISD::AND &&
687 Imm->getAPIntValue().getBitWidth() == 64 &&
688 Imm->getAPIntValue().isIntN(N: 32))
689 return false;
690
691 // If this really a zext_inreg that can be represented with a movzx
692 // instruction, prefer that.
693 // TODO: We could shrink the load and fold if it is non-volatile.
694 if (U->getOpcode() == ISD::AND &&
695 (Imm->getAPIntValue() == UINT8_MAX ||
696 Imm->getAPIntValue() == UINT16_MAX ||
697 Imm->getAPIntValue() == UINT32_MAX))
698 return false;
699
700 // ADD/SUB with can negate the immediate and use the opposite operation
701 // to fit 128 into a sign extended 8 bit immediate.
702 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
703 (-Imm->getAPIntValue()).isSignedIntN(N: 8))
704 return false;
705
706 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
707 (-Imm->getAPIntValue()).isSignedIntN(N: 8) &&
708 hasNoCarryFlagUses(Flags: SDValue(U, 1)))
709 return false;
710 }
711
712 // If the other operand is a TLS address, we should fold it instead.
713 // This produces
714 // movl %gs:0, %eax
715 // leal i@NTPOFF(%eax), %eax
716 // instead of
717 // movl $i@NTPOFF, %eax
718 // addl %gs:0, %eax
719 // if the block also has an access to a second TLS address this will save
720 // a load.
721 // FIXME: This is probably also true for non-TLS addresses.
722 if (Op1.getOpcode() == X86ISD::Wrapper) {
723 SDValue Val = Op1.getOperand(i: 0);
724 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
725 return false;
726 }
727
728 // Don't fold load if this matches the BTS/BTR/BTC patterns.
729 // BTS: (or X, (shl 1, n))
730 // BTR: (and X, (rotl -2, n))
731 // BTC: (xor X, (shl 1, n))
732 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
733 if (U->getOperand(Num: 0).getOpcode() == ISD::SHL &&
734 isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0)))
735 return false;
736
737 if (U->getOperand(Num: 1).getOpcode() == ISD::SHL &&
738 isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0)))
739 return false;
740 }
741 if (U->getOpcode() == ISD::AND) {
742 SDValue U0 = U->getOperand(Num: 0);
743 SDValue U1 = U->getOperand(Num: 1);
744 if (U0.getOpcode() == ISD::ROTL) {
745 auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0));
746 if (C && C->getSExtValue() == -2)
747 return false;
748 }
749
750 if (U1.getOpcode() == ISD::ROTL) {
751 auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0));
752 if (C && C->getSExtValue() == -2)
753 return false;
754 }
755 }
756
757 break;
758 }
759 case ISD::SHL:
760 case ISD::SRA:
761 case ISD::SRL:
762 // Don't fold a load into a shift by immediate. The BMI2 instructions
763 // support folding a load, but not an immediate. The legacy instructions
764 // support folding an immediate, but can't fold a load. Folding an
765 // immediate is preferable to folding a load.
766 if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
767 return false;
768
769 break;
770 }
771 }
772
773 // Prevent folding a load if this can implemented with an insert_subreg or
774 // a move that implicitly zeroes.
775 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
776 isNullConstant(V: Root->getOperand(Num: 2)) &&
777 (Root->getOperand(Num: 0).isUndef() ||
778 ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode())))
779 return false;
780
781 return true;
782}
783
784// Indicates it is profitable to form an AVX512 masked operation. Returning
785// false will favor a masked register-register masked move or vblendm and the
786// operation will be selected separately.
787bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
788 assert(
789 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
790 "Unexpected opcode!");
791
792 // If the operation has additional users, the operation will be duplicated.
793 // Check the use count to prevent that.
794 // FIXME: Are there cheap opcodes we might want to duplicate?
795 return N->getOperand(Num: 1).hasOneUse();
796}
797
798/// Replace the original chain operand of the call with
799/// load's chain operand and move load below the call's chain operand.
800static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
801 SDValue Call, SDValue OrigChain) {
802 SmallVector<SDValue, 8> Ops;
803 SDValue Chain = OrigChain.getOperand(i: 0);
804 if (Chain.getNode() == Load.getNode())
805 Ops.push_back(Elt: Load.getOperand(i: 0));
806 else {
807 assert(Chain.getOpcode() == ISD::TokenFactor &&
808 "Unexpected chain operand");
809 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
810 if (Chain.getOperand(i).getNode() == Load.getNode())
811 Ops.push_back(Elt: Load.getOperand(i: 0));
812 else
813 Ops.push_back(Elt: Chain.getOperand(i));
814 SDValue NewChain =
815 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops);
816 Ops.clear();
817 Ops.push_back(Elt: NewChain);
818 }
819 Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
820 CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
821 CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
822 Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));
823
824 Ops.clear();
825 Ops.push_back(Elt: SDValue(Load.getNode(), 1));
826 Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
827 CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
828}
829
830/// Return true if call address is a load and it can be
831/// moved below CALLSEQ_START and the chains leading up to the call.
832/// Return the CALLSEQ_START by reference as a second output.
833/// In the case of a tail call, there isn't a callseq node between the call
834/// chain and the load.
835static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
836 // The transformation is somewhat dangerous if the call's chain was glued to
837 // the call. After MoveBelowOrigChain the load is moved between the call and
838 // the chain, this can create a cycle if the load is not folded. So it is
839 // *really* important that we are sure the load will be folded.
840 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
841 return false;
842 auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
843 if (!LD ||
844 !LD->isSimple() ||
845 LD->getAddressingMode() != ISD::UNINDEXED ||
846 LD->getExtensionType() != ISD::NON_EXTLOAD)
847 return false;
848
849 // Now let's find the callseq_start.
850 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
851 if (!Chain.hasOneUse())
852 return false;
853 Chain = Chain.getOperand(i: 0);
854 }
855
856 if (!Chain.getNumOperands())
857 return false;
858 // Since we are not checking for AA here, conservatively abort if the chain
859 // writes to memory. It's not safe to move the callee (a load) across a store.
860 if (isa<MemSDNode>(Val: Chain.getNode()) &&
861 cast<MemSDNode>(Val: Chain.getNode())->writeMem())
862 return false;
863 if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
864 return true;
865 if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
866 Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
867 Callee.getValue(R: 1).hasOneUse())
868 return true;
869 return false;
870}
871
/// Return true if \p Imm contains the byte pattern of an ENDBR64 instruction
/// (F3 0F 1E FA), optionally with prefix bytes between the 0xF3 and the
/// 0x0F1EFA tail, e.g. 0xF3660F1EFA or 0xF3670F1EFA.
///
/// The low 24 bits must equal 0x0F1EFA. The bytes above them are then read
/// from least to most significant: 0xF3 completes the match, a byte from the
/// accepted prefix set is skipped, and any other byte rejects the value.
static bool isEndbrImm64(uint64_t Imm) {
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // The low 24 bits (0x0F1EFA) have matched; inspect the remaining five bytes.
  for (unsigned Shift = 24; Shift < 64; Shift += 8) {
    switch (static_cast<uint8_t>(Imm >> Shift)) {
    case 0xF3:
      return true;
    // Optional prefix bytes that may sit between 0xF3 and 0x0F1EFA.
    case 0x26:
    case 0x2E:
    case 0x36:
    case 0x3E:
    case 0x64:
    case 0x65:
    case 0x66:
    case 0x67:
    case 0xF0:
    case 0xF2:
      break; // Skip the prefix and look at the next byte.
    default:
      return false;
    }
  }

  // Only prefix bytes followed the tail; no 0xF3 was found.
  return false;
}
892
893static bool needBWI(MVT VT) {
894 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
895}
896
/// Rewrite pass over every node of CurDAG, run before instruction selection.
/// In a single forward sweep it:
///  - replaces constants that look like an ENDBR encoding with NOT(~Imm) when
///    the cf-protection-branch module flag or IndirectBranchTracking is set;
///  - turns an X86ISD::AND whose flag result is unused back into ISD::AND;
///  - rewrites vector add/sub of a splat-1 constant as sub/add of all-ones;
///  - maps a number of generic vector / FP nodes onto X86ISD equivalents;
///  - moves a callee load below the call's chain via moveBelowOrigChain;
///  - routes scalar FP_ROUND / FP_EXTEND (and their STRICT forms) that involve
///    a non-SSE type through a stack temporary.
/// If anything changed, dead nodes are removed at the end.
897void X86DAGToDAGISel::PreprocessISelDAG() {
898 bool MadeChange = false;
899 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
900 E = CurDAG->allnodes_end(); I != E; ) {
901 SDNode *N = &*I++; // Step I past N up front to avoid invalidation issues.
 // Each rewrite below backs I up onto N (--I) around the use replacement and
 // then steps it forward again (++I).
902
903 // This is for CET enhancement.
904 //
905 // ENDBR32 and ENDBR64 have specific opcodes:
906 // ENDBR32: F3 0F 1E FB
907 // ENDBR64: F3 0F 1E FA
908 // We do not want attackers to find unintended ENDBR32/64
909 // opcode matches in the binary.
910 // Here's an example:
911 // If the compiler had to generate asm for the following code:
912 // a = 0xF30F1EFA
913 // it could, for example, generate:
914 // mov 0xF30F1EFA, dword ptr[a]
915 // In such a case, the binary would include a gadget that starts
916 // with a fake ENDBR64 opcode. Therefore, we split such a constant
917 // into multiple operations so that it does not show up in the binary.
918 if (N->getOpcode() == ISD::Constant) {
919 MVT VT = N->getSimpleValueType(ResNo: 0);
920 int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
921 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
922 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
923 // Check that the cf-protection-branch is enabled.
924 Metadata *CFProtectionBranch =
925 MF->getFunction().getParent()->getModuleFlag(
926 Key: "cf-protection-branch");
927 if (CFProtectionBranch || IndirectBranchTracking) {
928 SDLoc dl(N);
 // Build the value as NOT(~Imm), with ~Imm created as an opaque constant.
929 SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true);
930 Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT);
931 --I;
932 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement);
933 ++I;
934 MadeChange = true;
935 continue;
936 }
937 }
938 }
939
940 // If this is a target specific AND node with no flag usages, turn it back
941 // into ISD::AND to enable test instruction matching.
942 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) {
943 SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
944 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
945 --I;
946 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
947 ++I;
948 MadeChange = true;
949 continue;
950 }
951
952 // Convert vector increment or decrement to sub/add with an all-ones
953 // constant:
954 // add X, <1, 1...> --> sub X, <-1, -1...>
955 // sub X, <1, 1...> --> add X, <-1, -1...>
956 // The all-ones vector constant can be materialized using a pcmpeq
957 // instruction that is commonly recognized as an idiom (has no register
958 // dependency), so that's better/smaller than loading a splat 1 constant.
959 //
960 // But don't do this if it would inhibit a potentially profitable load
961 // folding opportunity for the other operand. That only occurs with the
962 // intersection of:
963 // (1) The other operand (op0) is load foldable.
964 // (2) The op is an add (otherwise, we are *creating* an add and can still
965 // load fold the other op).
966 // (3) The target has AVX (otherwise, we have a destructive add and can't
967 // load fold the other op without killing the constant op).
968 // (4) The constant 1 vector has multiple uses (so it is profitable to load
969 // into a register anyway).
970 auto mayPreventLoadFold = [&]() {
971 return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) &&
972 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
973 !N->getOperand(Num: 1).hasOneUse();
974 };
975 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
976 N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) {
977 APInt SplatVal;
978 if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) &&
979 SplatVal.isOne()) {
980 SDLoc DL(N);
981
982 MVT VT = N->getSimpleValueType(ResNo: 0);
 // The all-ones constant is created as a vector of i32 covering the same
 // number of bits as VT and then bitcast to VT.
983 unsigned NumElts = VT.getSizeInBits() / 32;
984 SDValue AllOnes =
985 CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts));
986 AllOnes = CurDAG->getBitcast(VT, V: AllOnes);
987
988 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
989 SDValue Res =
990 CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes);
991 --I;
992 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
993 ++I;
994 MadeChange = true;
995 continue;
996 }
997 }
998
 // Per-opcode rewrites. A case that rewrites N continues with the next node;
 // a case that declines breaks out and falls through to the checks below.
999 switch (N->getOpcode()) {
1000 case X86ISD::VBROADCAST: {
1001 MVT VT = N->getSimpleValueType(ResNo: 0);
1002 // Emulate v32i16/v64i8 broadcast without BWI.
1003 if (!Subtarget->hasBWI() && needBWI(VT)) {
 // Broadcast into a half-width vector and insert it into both halves.
1004 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1005 SDLoc dl(N);
1006 SDValue NarrowBCast =
1007 CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0));
1008 SDValue Res =
1009 CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
1010 N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1011 unsigned Index = NarrowVT.getVectorMinNumElements();
1012 Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
1013 N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));
1014
1015 --I;
1016 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1017 ++I;
1018 MadeChange = true;
1019 continue;
1020 }
1021
1022 break;
1023 }
1024 case X86ISD::VBROADCAST_LOAD: {
1025 MVT VT = N->getSimpleValueType(ResNo: 0);
1026 // Emulate v32i16/v64i8 broadcast without BWI.
1027 if (!Subtarget->hasBWI() && needBWI(VT)) {
 // Same scheme as VBROADCAST above, but the narrow node is a memory
 // intrinsic that reuses the original chain, pointer, memory VT and
 // memory operand.
1028 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1029 auto *MemNode = cast<MemSDNode>(Val: N);
1030 SDLoc dl(N);
1031 SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other);
1032 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1033 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1034 Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(),
1035 MMO: MemNode->getMemOperand());
1036 SDValue Res =
1037 CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
1038 N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1039 unsigned Index = NarrowVT.getVectorMinNumElements();
1040 Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
1041 N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));
1042
1043 --I;
 // Result 0 becomes the rebuilt vector, result 1 the narrow node's
 // second (MVT::Other) result.
1044 SDValue To[] = {Res, NarrowBCast.getValue(R: 1)};
1045 CurDAG->ReplaceAllUsesWith(From: N, To);
1046 ++I;
1047 MadeChange = true;
1048 continue;
1049 }
1050
1051 break;
1052 }
1053 case ISD::LOAD: {
1054 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1055 // load, then just extract the lower subvector and avoid the second load.
1056 auto *Ld = cast<LoadSDNode>(Val: N);
1057 MVT VT = N->getSimpleValueType(ResNo: 0);
1058 if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() ||
1059 !(VT.is128BitVector() || VT.is256BitVector()))
1060 break;
1061
 // Among the users of the same pointer, pick the widest 256/512-bit normal
 // load with the same base pointer and chain whose result 1 is unused.
1062 MVT MaxVT = VT;
1063 SDNode *MaxLd = nullptr;
1064 SDValue Ptr = Ld->getBasePtr();
1065 SDValue Chain = Ld->getChain();
1066 for (SDNode *User : Ptr->uses()) {
1067 auto *UserLd = dyn_cast<LoadSDNode>(Val: User);
1068 MVT UserVT = User->getSimpleValueType(ResNo: 0);
1069 if (User != N && UserLd && ISD::isNormalLoad(N: User) &&
1070 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1071 !User->hasAnyUseOfValue(Value: 1) &&
1072 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1073 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1074 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1075 MaxLd = User;
1076 MaxVT = UserVT;
1077 }
1078 }
1079 if (MaxLd) {
1080 SDLoc dl(N);
 // Extract the low VT-sized part of the wide load using the wide load's
 // element type, then bitcast it to VT.
1081 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1082 MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts);
1083 SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
1084 N1: SDValue(MaxLd, 0),
1085 N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1086 SDValue Res = CurDAG->getBitcast(VT, V: Extract);
1087
1088 --I;
 // Result 0 is replaced by the extract, result 1 by the wide load's
 // result 1.
1089 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1090 CurDAG->ReplaceAllUsesWith(From: N, To);
1091 ++I;
1092 MadeChange = true;
1093 continue;
1094 }
1095 break;
1096 }
1097 case ISD::VSELECT: {
1098 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1099 EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType();
1100 if (EleVT == MVT::i1)
1101 break;
1102
1103 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1104 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1105 "We can't replace VSELECT with BLENDV in vXi16!");
1106 SDValue R;
 // VPTERNLOG is used only with VLX and when every bit of each condition
 // element is a sign bit; otherwise fall back to BLENDV.
1107 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) ==
1108 EleVT.getSizeInBits()) {
1109 R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1110 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2),
1111 N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8));
1112 } else {
1113 R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1114 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1),
1115 N3: N->getOperand(Num: 2));
1116 }
1117 --I;
1118 CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode());
1119 ++I;
1120 MadeChange = true;
1121 continue;
1122 }
1123 case ISD::FP_ROUND:
1124 case ISD::STRICT_FP_ROUND:
1125 case ISD::FP_TO_SINT:
1126 case ISD::FP_TO_UINT:
1127 case ISD::STRICT_FP_TO_SINT:
1128 case ISD::STRICT_FP_TO_UINT: {
1129 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1130 // don't need 2 sets of patterns.
1131 if (!N->getSimpleValueType(ResNo: 0).isVector())
1132 break;
1133
1134 unsigned NewOpc;
1135 switch (N->getOpcode()) {
1136 default: llvm_unreachable("Unexpected opcode!");
1137 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1138 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1139 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1140 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1141 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1142 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1143 }
1144 SDValue Res;
 // Strict nodes carry operands 0 and 1 over and also produce an
 // MVT::Other result; non-strict nodes pass only operand 0.
1145 if (N->isStrictFPOpcode())
1146 Res =
1147 CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
1148 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
1149 else
1150 Res =
1151 CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1152 Operand: N->getOperand(Num: 0));
1153 --I;
1154 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1155 ++I;
1156 MadeChange = true;
1157 continue;
1158 }
1159 case ISD::SHL:
1160 case ISD::SRA:
1161 case ISD::SRL: {
1162 // Replace vector shifts with their X86 specific equivalent so we don't
1163 // need 2 sets of patterns.
1164 if (!N->getValueType(ResNo: 0).isVector())
1165 break;
1166
1167 unsigned NewOpc;
1168 switch (N->getOpcode()) {
1169 default: llvm_unreachable("Unexpected opcode!");
1170 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1171 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1172 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1173 }
1174 SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1175 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
1176 --I;
1177 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1178 ++I;
1179 MadeChange = true;
1180 continue;
1181 }
1182 case ISD::ANY_EXTEND:
1183 case ISD::ANY_EXTEND_VECTOR_INREG: {
1184 // Replace vector any extend with the zero extend equivalents so we don't
1185 // need 2 sets of patterns. Ignore vXi1 extensions.
1186 if (!N->getValueType(ResNo: 0).isVector())
1187 break;
1188
1189 unsigned NewOpc;
 // Sources with 1-bit elements are sign-extended instead; everything else
 // becomes the matching zero-extend opcode.
1190 if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) {
1191 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1192 "Unexpected opcode for mask vector!");
1193 NewOpc = ISD::SIGN_EXTEND;
1194 } else {
1195 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1196 ? ISD::ZERO_EXTEND
1197 : ISD::ZERO_EXTEND_VECTOR_INREG;
1198 }
1199
1200 SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1201 Operand: N->getOperand(Num: 0));
1202 --I;
1203 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1204 ++I;
1205 MadeChange = true;
1206 continue;
1207 }
1208 case ISD::FCEIL:
1209 case ISD::STRICT_FCEIL:
1210 case ISD::FFLOOR:
1211 case ISD::STRICT_FFLOOR:
1212 case ISD::FTRUNC:
1213 case ISD::STRICT_FTRUNC:
1214 case ISD::FROUNDEVEN:
1215 case ISD::STRICT_FROUNDEVEN:
1216 case ISD::FNEARBYINT:
1217 case ISD::STRICT_FNEARBYINT:
1218 case ISD::FRINT:
1219 case ISD::STRICT_FRINT: {
1220 // Replace fp rounding with their X86 specific equivalent so we don't
1221 // need 2 sets of patterns.
 // Each opcode (strict or not) selects the immediate passed to VRNDSCALE.
1222 unsigned Imm;
1223 switch (N->getOpcode()) {
1224 default: llvm_unreachable("Unexpected opcode!");
1225 case ISD::STRICT_FCEIL:
1226 case ISD::FCEIL: Imm = 0xA; break;
1227 case ISD::STRICT_FFLOOR:
1228 case ISD::FFLOOR: Imm = 0x9; break;
1229 case ISD::STRICT_FTRUNC:
1230 case ISD::FTRUNC: Imm = 0xB; break;
1231 case ISD::STRICT_FROUNDEVEN:
1232 case ISD::FROUNDEVEN: Imm = 0x8; break;
1233 case ISD::STRICT_FNEARBYINT:
1234 case ISD::FNEARBYINT: Imm = 0xC; break;
1235 case ISD::STRICT_FRINT:
1236 case ISD::FRINT: Imm = 0x4; break;
1237 }
1238 SDLoc dl(N);
1239 bool IsStrict = N->isStrictFPOpcode();
1240 SDValue Res;
1241 if (IsStrict)
1242 Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl,
1243 ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
1244 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
1245 CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)});
1246 else
1247 Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0),
1248 N1: N->getOperand(Num: 0),
1249 N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32));
1250 --I;
1251 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1252 ++I;
1253 MadeChange = true;
1254 continue;
1255 }
1256 case X86ISD::FANDN:
1257 case X86ISD::FAND:
1258 case X86ISD::FOR:
1259 case X86ISD::FXOR: {
1260 // Widen scalar fp logic ops to vector to reduce isel patterns.
1261 // FIXME: Can we do this during lowering/combine.
1262 MVT VT = N->getSimpleValueType(ResNo: 0);
1263 if (VT.isVector() || VT == MVT::f128)
1264 break;
1265
 // f64 -> v2f64, f32 -> v4f32, any other remaining scalar type -> v8f16.
1266 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1267 : VT == MVT::f32 ? MVT::v4f32
1268 : MVT::v8f16;
1269
1270 SDLoc dl(N);
1271 SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
1272 Operand: N->getOperand(Num: 0));
1273 SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
1274 Operand: N->getOperand(Num: 1));
1275
1276 SDValue Res;
 // With SSE2 the operation is done on the integer-typed vector and bitcast
 // back; without it the original opcode is reused on the FP vector type.
1277 if (Subtarget->hasSSE2()) {
1278 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1279 Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0);
1280 Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1);
1281 unsigned Opc;
1282 switch (N->getOpcode()) {
1283 default: llvm_unreachable("Unexpected opcode!");
1284 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1285 case X86ISD::FAND: Opc = ISD::AND; break;
1286 case X86ISD::FOR: Opc = ISD::OR; break;
1287 case X86ISD::FXOR: Opc = ISD::XOR; break;
1288 }
1289 Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1);
1290 Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res);
1291 } else {
1292 Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1);
1293 }
 // Element 0 of the vector result is the scalar answer.
1294 Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res,
1295 N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1296 --I;
1297 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1298 ++I;
1299 MadeChange = true;
1300 continue;
1301 }
1302 }
1303
1304 if (OptLevel != CodeGenOptLevel::None &&
1305 // Only do this when the target can fold the load into the call or
1306 // jmp.
1307 !Subtarget->useIndirectThunkCalls() &&
1308 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1309 (N->getOpcode() == X86ISD::TC_RETURN &&
1310 (Subtarget->is64Bit() ||
1311 !getTargetMachine().isPositionIndependent())))) {
1312 /// Also try moving call address load from outside callseq_start to just
1313 /// before the call to allow it to be folded.
1314 ///
1315 /// [Load chain]
1316 /// ^
1317 /// |
1318 /// [Load]
1319 /// ^ ^
1320 /// | |
1321 /// / \--
1322 /// / |
1323 ///[CALLSEQ_START] |
1324 /// ^ |
1325 /// | |
1326 /// [LOAD/C2Reg] |
1327 /// | |
1328 /// \ /
1329 /// \ /
1330 /// [CALL]
 // Only X86ISD::CALL has a callseq to walk through; TC_RETURN does not.
1331 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1332 SDValue Chain = N->getOperand(Num: 0);
1333 SDValue Load = N->getOperand(Num: 1);
 // isCalleeLoad may advance Chain; the updated value is what gets passed on.
1334 if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq))
1335 continue;
1336 moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain);
1337 ++NumLoadMoved;
1338 MadeChange = true;
1339 continue;
1340 }
1341
1342 // Lower fpround and fpextend nodes that target the FP stack to be store and
1343 // load to the stack. This is a gross hack. We would like to simply mark
1344 // these as being illegal, but when we do that, legalize produces these when
1345 // it expands calls, then expands these in the same legalize pass. We would
1346 // like dag combine to be able to hack on these between the call expansion
1347 // and the node legalization. As such this pass basically does "really
1348 // late" legalization of these inline with the X86 isel pass.
1349 // FIXME: This should only happen when not compiled with -O0.
1350 switch (N->getOpcode()) {
1351 default: continue;
1352 case ISD::FP_ROUND:
1353 case ISD::FP_EXTEND:
1354 {
1355 MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType();
1356 MVT DstVT = N->getSimpleValueType(ResNo: 0);
1357
1358 // If any of the sources are vectors, no fp stack involved.
1359 if (SrcVT.isVector() || DstVT.isVector())
1360 continue;
1361
1362 // If the source and destination are SSE registers, then this is a legal
1363 // conversion that should not be lowered.
1364 const X86TargetLowering *X86Lowering =
1365 static_cast<const X86TargetLowering *>(TLI);
1366 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
1367 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
1368 if (SrcIsSSE && DstIsSSE)
1369 continue;
1370
1371 if (!SrcIsSSE && !DstIsSSE) {
1372 // If this is an FPStack extension, it is a noop.
1373 if (N->getOpcode() == ISD::FP_EXTEND)
1374 continue;
1375 // If this is a value-preserving FPStack truncation, it is a noop.
1376 if (N->getConstantOperandVal(Num: 1))
1377 continue;
1378 }
1379
1380 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1381 // FPStack has extload and truncstore. SSE can fold direct loads into other
1382 // operations. Based on this, decide what we want to do.
 // The temporary uses the destination type for FP_ROUND and the source
 // type for FP_EXTEND.
1383 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1384 SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
1385 int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
1386 MachinePointerInfo MPI =
1387 MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
1388 SDLoc dl(N);
1389
1390 // FIXME: optimize the case where the src/dest is a load or store?
1391
 // The store is chained on the entry node; the load is chained on the store.
1392 SDValue Store = CurDAG->getTruncStore(
1393 Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT);
1394 SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store,
1395 Ptr: MemTmp, PtrInfo: MPI, MemVT);
1396
1397 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1398 // extload we created. This will cause general havoc on the dag because
1399 // anything below the conversion could be folded into other existing nodes.
1400 // To avoid invalidating 'I', back it up to the convert node.
1401 --I;
1402 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result);
1403 break;
1404 }
1405
1406 // The sequence of events for lowering STRICT_FP versions of these nodes
1407 // requires dealing with the chain differently, as there is already a
 // preexisting chain.
1408 case ISD::STRICT_FP_ROUND:
1409 case ISD::STRICT_FP_EXTEND:
1410 {
 // For the strict nodes operand 0 is used as the chain below, so the
 // source value is operand 1 and the truncation flag is operand 2.
1411 MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType();
1412 MVT DstVT = N->getSimpleValueType(ResNo: 0);
1413
1414 // If any of the sources are vectors, no fp stack involved.
1415 if (SrcVT.isVector() || DstVT.isVector())
1416 continue;
1417
1418 // If the source and destination are SSE registers, then this is a legal
1419 // conversion that should not be lowered.
1420 const X86TargetLowering *X86Lowering =
1421 static_cast<const X86TargetLowering *>(TLI);
1422 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
1423 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
1424 if (SrcIsSSE && DstIsSSE)
1425 continue;
1426
1427 if (!SrcIsSSE && !DstIsSSE) {
1428 // If this is an FPStack extension, it is a noop.
1429 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1430 continue;
1431 // If this is a value-preserving FPStack truncation, it is a noop.
1432 if (N->getConstantOperandVal(Num: 2))
1433 continue;
1434 }
1435
1436 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1437 // FPStack has extload and truncstore. SSE can fold direct loads into other
1438 // operations. Based on this, decide what we want to do.
1439 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1440 SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
1441 int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
1442 MachinePointerInfo MPI =
1443 MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
1444 SDLoc dl(N);
1445
1446 // FIXME: optimize the case where the src/dest is a load or store?
1447
1448 // Since the operation is StrictFP, use the preexisting chain.
1449 SDValue Store, Result;
 // A non-SSE source is stored with X86ISD::FST, an SSE source with a plain
 // store. The node's no-FP-exception flag is copied onto the FST node.
1450 if (!SrcIsSSE) {
1451 SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
1452 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp};
1453 Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT,
1454 PtrInfo: MPI, /*Align*/ Alignment: std::nullopt,
1455 Flags: MachineMemOperand::MOStore);
1456 if (N->getFlags().hasNoFPExcept()) {
1457 SDNodeFlags Flags = Store->getFlags();
1458 Flags.setNoFPExcept(true);
1459 Store->setFlags(Flags);
1460 }
1461 } else {
1462 assert(SrcVT == MemVT && "Unexpected VT!");
1463 Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp,
1464 PtrInfo: MPI);
1465 }
1466
 // Likewise a non-SSE destination is reloaded with X86ISD::FLD, an SSE
 // destination with a plain load; both are chained on the store.
1467 if (!DstIsSSE) {
1468 SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other);
1469 SDValue Ops[] = {Store, MemTmp};
1470 Result = CurDAG->getMemIntrinsicNode(
1471 Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI,
1472 /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad);
1473 if (N->getFlags().hasNoFPExcept()) {
1474 SDNodeFlags Flags = Result->getFlags();
1475 Flags.setNoFPExcept(true);
1476 Result->setFlags(Flags);
1477 }
1478 } else {
1479 assert(DstVT == MemVT && "Unexpected VT!");
1480 Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI);
1481 }
1482
1483 // We're about to replace all uses of the STRICT_FP_ROUND/STRICT_FP_EXTEND
1484 // with the load we created. This will cause general havoc on the dag because
1485 // anything below the conversion could be folded into other existing nodes.
1486 // To avoid invalidating 'I', back it up to the convert node.
1487 --I;
1488 CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode());
1489 break;
1490 }
1491 }
1492
1493
1494 // Only the two conversion cases above reach this point, with I backed up
1495 // onto the replaced node. Step past it; RemoveDeadNodes() below cleans up.
1496 ++I;
1497 MadeChange = true;
1498 }
1499
1500 // Remove any dead nodes that may have been left behind.
1501 if (MadeChange)
1502 CurDAG->RemoveDeadNodes();
1503}
1504
1505// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1506bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1507 unsigned Opc = N->getMachineOpcode();
1508 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1509 Opc != X86::MOVSX64rr8)
1510 return false;
1511
1512 SDValue N0 = N->getOperand(Num: 0);
1513
1514 // We need to be extracting the lower bit of an extend.
1515 if (!N0.isMachineOpcode() ||
1516 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1517 N0.getConstantOperandVal(i: 1) != X86::sub_8bit)
1518 return false;
1519
1520 // We're looking for either a movsx or movzx to match the original opcode.
1521 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1522 : X86::MOVSX32rr8_NOREX;
1523 SDValue N00 = N0.getOperand(i: 0);
1524 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1525 return false;
1526
1527 if (Opc == X86::MOVSX64rr8) {
1528 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1529 // to 64.
1530 MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N),
1531 VT: MVT::i64, Op1: N00);
1532 ReplaceUses(F: N, T: Extend);
1533 } else {
1534 // Ok we can drop this extend and just use the original extend.
1535 ReplaceUses(F: N, T: N00.getNode());
1536 }
1537
1538 return true;
1539}
1540
/// Peephole pass over the selected machine nodes, run after instruction
/// selection and skipped entirely at -O0. Nodes are visited from the end of
/// the node list towards the beginning. Handled patterns:
///  - the extend cleanup done by tryOptimizeRem8Extend;
///  - AND feeding TEST/CTEST of the same value -> TEST/CTEST of the AND's
///    operands (register or memory form);
///  - KAND feeding KORTEST of the same value -> KTEST, when only the zero
///    flag is used;
///  - SUBREG_TO_REG of a recognized vector register move -> SUBREG_TO_REG of
///    the move's input.
/// Dead nodes are removed at the end if anything changed.
1541void X86DAGToDAGISel::PostprocessISelDAG() {
1542 // Skip peepholes at -O0.
1543 if (TM.getOptLevel() == CodeGenOptLevel::None)
1544 return;
1545
1546 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1547
1548 bool MadeChange = false;
1549 while (Position != CurDAG->allnodes_begin()) {
1550 SDNode *N = &*--Position;
1551 // Skip dead nodes and any non-machine opcodes.
1552 if (N->use_empty() || !N->isMachineOpcode())
1553 continue;
1554
1555 if (tryOptimizeRem8Extend(N)) {
1556 MadeChange = true;
1557 continue;
1558 }
1559
1560 unsigned Opc = N->getMachineOpcode();
1561 switch (Opc) {
1562 default:
1563 continue;
1564 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1565 case X86::TEST8rr:
1566 case X86::TEST16rr:
1567 case X86::TEST32rr:
1568 case X86::TEST64rr:
1569 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1570 case X86::CTEST8rr:
1571 case X86::CTEST16rr:
1572 case X86::CTEST32rr:
1573 case X86::CTEST64rr: {
 // Both operands must be the same machine-node value, and that value must
 // have exactly two uses (i.e. only this node's two operands).
1574 auto &Op0 = N->getOperand(Num: 0);
1575 if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) ||
1576 !Op0.isMachineOpcode())
1577 continue;
1578 SDValue And = N->getOperand(Num: 0);
 // CASE_ND(OP) expands to case labels for both OP and OP_ND.
1579#define CASE_ND(OP) \
1580 case X86::OP: \
1581 case X86::OP##_ND:
 // Every path through this inner switch ends in 'continue', so control
 // never falls through into the KORTEST case below.
1582 switch (And.getMachineOpcode()) {
1583 default:
1584 continue;
1585 CASE_ND(AND8rr)
1586 CASE_ND(AND16rr)
1587 CASE_ND(AND32rr)
1588 CASE_ND(AND64rr) {
 // Result 1 of the AND must be unused.
1589 if (And->hasAnyUseOfValue(Value: 1))
1590 continue;
 // Keep N's operand list but substitute the AND's two inputs for the
 // first two operands, and rebuild the node with the same opcode.
1591 SmallVector<SDValue> Ops(N->op_values());
1592 Ops[0] = And.getOperand(i: 0);
1593 Ops[1] = And.getOperand(i: 1);
1594 MachineSDNode *Test =
1595 CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops);
1596 ReplaceUses(F: N, T: Test);
1597 MadeChange = true;
1598 continue;
1599 }
1600 CASE_ND(AND8rm)
1601 CASE_ND(AND16rm)
1602 CASE_ND(AND32rm)
1603 CASE_ND(AND64rm) {
 // Result 1 of the AND must be unused.
1604 if (And->hasAnyUseOfValue(Value: 1))
1605 continue;
1606 unsigned NewOpc;
1607 bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc);
 // FROM_TO(A, B) maps AND opcode A (and A_ND) to the memory-form opcode B,
 // or to C##B when the original node was a CTEST.
1608#define FROM_TO(A, B) \
1609 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1610 break;
1611 switch (And.getMachineOpcode()) {
1612 FROM_TO(AND8rm, TEST8mr);
1613 FROM_TO(AND16rm, TEST16mr);
1614 FROM_TO(AND32rm, TEST32mr);
1615 FROM_TO(AND64rm, TEST64mr);
1616 }
1617#undef FROM_TO
1618#undef CASE_ND
1619 // Need to swap the memory and register operand.
 // AND operands 1..5 go first, then AND operand 0.
1620 SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2),
1621 And.getOperand(i: 3), And.getOperand(i: 4),
1622 And.getOperand(i: 5), And.getOperand(i: 0)};
1623 // CC, Cflags.
1624 if (IsCTESTCC) {
1625 Ops.push_back(Elt: N->getOperand(Num: 2));
1626 Ops.push_back(Elt: N->getOperand(Num: 3));
1627 }
1628 // Chain of memory load
1629 Ops.push_back(Elt: And.getOperand(i: 6));
1630 // Glue
1631 if (IsCTESTCC)
1632 Ops.push_back(Elt: N->getOperand(Num: 4));
1633
1634 MachineSDNode *Test = CurDAG->getMachineNode(
1635 Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops);
 // Carry the AND's memory operands over to the new node.
1636 CurDAG->setNodeMemRefs(
1637 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
 // Result 2 of the AND is replaced by the new node's MVT::Other result,
 // and N's result 0 by the new node's result 0.
1638 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1639 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1640 MadeChange = true;
1641 continue;
1642 }
1643 }
1644 }
1645 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1646 // used. We're doing this late so we can prefer to fold the AND into masked
1647 // comparisons. Doing that can be better for the live range of the mask
1648 // register.
1649 case X86::KORTESTBrr:
1650 case X86::KORTESTWrr:
1651 case X86::KORTESTDrr:
1652 case X86::KORTESTQrr: {
1653 SDValue Op0 = N->getOperand(Num: 0);
1654 if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) ||
1655 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0)))
1656 continue;
 // CASE(A) is an accept-list entry: a case label that just breaks. The macro
 // stays defined and is reused by the SUBREG_TO_REG case further down.
1657#define CASE(A) \
1658 case X86::A: \
1659 break;
1660 switch (Op0.getMachineOpcode()) {
1661 default:
1662 continue;
1663 CASE(KANDBrr)
1664 CASE(KANDWrr)
1665 CASE(KANDDrr)
1666 CASE(KANDQrr)
1667 }
1668 unsigned NewOpc;
1669#define FROM_TO(A, B) \
1670 case X86::A: \
1671 NewOpc = X86::B; \
1672 break;
 // Opc is one of the four KORTEST opcodes here, so NewOpc is always set.
1673 switch (Opc) {
1674 FROM_TO(KORTESTBrr, KTESTBrr)
1675 FROM_TO(KORTESTWrr, KTESTWrr)
1676 FROM_TO(KORTESTDrr, KTESTDrr)
1677 FROM_TO(KORTESTQrr, KTESTQrr)
1678 }
1679 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1680 // KAND instructions and KTEST use the same ISA feature.
1681 if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI())
1682 continue;
1683#undef FROM_TO
1684 MachineSDNode *KTest = CurDAG->getMachineNode(
1685 Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1));
1686 ReplaceUses(F: N, T: KTest);
1687 MadeChange = true;
1688 continue;
1689 }
1690 // Attempt to remove vector moves that were inserted to zero upper bits.
1691 case TargetOpcode::SUBREG_TO_REG: {
1692 unsigned SubRegIdx = N->getConstantOperandVal(Num: 2);
1693 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1694 continue;
1695
1696 SDValue Move = N->getOperand(Num: 1);
1697 if (!Move.isMachineOpcode())
1698 continue;
1699
1700 // Make sure it's one of the move opcodes we recognize.
1701 switch (Move.getMachineOpcode()) {
1702 default:
1703 continue;
1704 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1705 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1706 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1707 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1708 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1709 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1710 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1711 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1712 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1713 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1714 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1715 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1716 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1717 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1718 }
1719#undef CASE
1720
 // The move's input must be a target machine instruction, not a generic
 // opcode.
1721 SDValue In = Move.getOperand(i: 0);
1722 if (!In.isMachineOpcode() ||
1723 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1724 continue;
1725
1726 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1727 // the SHA instructions which use a legacy encoding.
1728 uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags;
1729 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1730 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1731 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1732 continue;
1733
1734 // Producing instruction is another vector instruction. We can drop the
1735 // move.
 // N is updated in place to take the move's input as operand 1.
1736 CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2));
1737 MadeChange = true;
1738 }
1739 }
1740 }
1741
1742 if (MadeChange)
1743 CurDAG->RemoveDeadNodes();
1744}
1745
1746
1747/// Emit any code that needs to be executed only in the main function.
1748void X86DAGToDAGISel::emitSpecialCodeForMain() {
1749 if (Subtarget->isTargetCygMing()) {
1750 TargetLowering::ArgListTy Args;
1751 auto &DL = CurDAG->getDataLayout();
1752
1753 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1754 CLI.setChain(CurDAG->getRoot())
1755 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1756 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1757 ArgsList: std::move(Args));
1758 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1759 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1760 CurDAG->setRoot(Result.second);
1761 }
1762}
1763
1764void X86DAGToDAGISel::emitFunctionEntryCode() {
1765 // If this is main, emit special code for main.
1766 const Function &F = MF->getFunction();
1767 if (F.hasExternalLinkage() && F.getName() == "main")
1768 emitSpecialCodeForMain();
1769}
1770
1771static bool isDispSafeForFrameIndex(int64_t Val) {
1772 // On 64-bit platforms, we can run into an issue where a frame index
1773 // includes a displacement that, when added to the explicit displacement,
1774 // will overflow the displacement field. Assuming that the frame index
1775 // displacement fits into a 31-bit integer (which is only slightly more
1776 // aggressive than the current fundamental assumption that it fits into
1777 // a 32-bit integer), a 31-bit disp should always be safe.
1778 return isInt<31>(x: Val);
1779}
1780
1781bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1782 X86ISelAddressMode &AM) {
1783 // We may have already matched a displacement and the caller just added the
1784 // symbolic displacement. So we still need to do the checks even if Offset
1785 // is zero.
1786
1787 int64_t Val = AM.Disp + Offset;
1788
1789 // Cannot combine ExternalSymbol displacements with integer offsets.
1790 if (Val != 0 && (AM.ES || AM.MCSym))
1791 return true;
1792
1793 CodeModel::Model M = TM.getCodeModel();
1794 if (Subtarget->is64Bit()) {
1795 if (Val != 0 &&
1796 !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
1797 hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
1798 return true;
1799 // In addition to the checks required for a register base, check that
1800 // we do not try to use an unsafe Disp with a frame index.
1801 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1802 !isDispSafeForFrameIndex(Val))
1803 return true;
1804 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1805 // 64 bits. Instructions with 32-bit register addresses perform this zero
1806 // extension for us and we can safely ignore the high bits of Offset.
1807 // Instructions with only a 32-bit immediate address do not, though: they
1808 // sign extend instead. This means only address the low 2GB of address space
1809 // is directly addressable, we need indirect addressing for the high 2GB of
1810 // address space.
1811 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1812 // implicit zero extension of instructions would cover up any problem.
1813 // However, we have asserts elsewhere that get triggered if we do, so keep
1814 // the checks for now.
1815 // TODO: We would actually be able to accept these, as well as the same
1816 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1817 // to get an address size override to be emitted. However, this
1818 // pseudo-register is not part of any register class and therefore causes
1819 // MIR verification to fail.
1820 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(x: Val) &&
1821 !AM.hasBaseOrIndexReg())
1822 return true;
1823 }
1824 AM.Disp = Val;
1825 return false;
1826}
1827
1828bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1829 bool AllowSegmentRegForX32) {
1830 SDValue Address = N->getOperand(Num: 1);
1831
1832 // load gs:0 -> GS segment register.
1833 // load fs:0 -> FS segment register.
1834 //
1835 // This optimization is generally valid because the GNU TLS model defines that
1836 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1837 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1838 // zero-extended to 64 bits and then added it to the base address, which gives
1839 // unwanted results when the register holds a negative value.
1840 // For more information see http://people.redhat.com/drepper/tls.pdf
1841 if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
1842 !IndirectTlsSegRefs &&
1843 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1844 Subtarget->isTargetFuchsia())) {
1845 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1846 return true;
1847 switch (N->getPointerInfo().getAddrSpace()) {
1848 case X86AS::GS:
1849 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
1850 return false;
1851 case X86AS::FS:
1852 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
1853 return false;
1854 // Address space X86AS::SS is not handled here, because it is not used to
1855 // address TLS areas.
1856 }
1857 }
1858
1859 return true;
1860}
1861
1862/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1863/// mode. These wrap things that will resolve down into a symbol reference.
1864/// If no match is possible, this returns true, otherwise it returns false.
1865bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1866 // If the addressing mode already has a symbol as the displacement, we can
1867 // never match another symbol.
1868 if (AM.hasSymbolicDisplacement())
1869 return true;
1870
1871 bool IsRIPRelTLS = false;
1872 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1873 if (IsRIPRel) {
1874 SDValue Val = N.getOperand(i: 0);
1875 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1876 IsRIPRelTLS = true;
1877 }
1878
1879 // We can't use an addressing mode in the 64-bit large code model.
1880 // Global TLS addressing is an exception. In the medium code model,
1881 // we use can use a mode when RIP wrappers are present.
1882 // That signifies access to globals that are known to be "near",
1883 // such as the GOT itself.
1884 CodeModel::Model M = TM.getCodeModel();
1885 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1886 return true;
1887
1888 // Base and index reg must be 0 in order to use %rip as base.
1889 if (IsRIPRel && AM.hasBaseOrIndexReg())
1890 return true;
1891
1892 // Make a local copy in case we can't do this fold.
1893 X86ISelAddressMode Backup = AM;
1894
1895 int64_t Offset = 0;
1896 SDValue N0 = N.getOperand(i: 0);
1897 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
1898 AM.GV = G->getGlobal();
1899 AM.SymbolFlags = G->getTargetFlags();
1900 Offset = G->getOffset();
1901 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
1902 AM.CP = CP->getConstVal();
1903 AM.Alignment = CP->getAlign();
1904 AM.SymbolFlags = CP->getTargetFlags();
1905 Offset = CP->getOffset();
1906 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
1907 AM.ES = S->getSymbol();
1908 AM.SymbolFlags = S->getTargetFlags();
1909 } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
1910 AM.MCSym = S->getMCSymbol();
1911 } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
1912 AM.JT = J->getIndex();
1913 AM.SymbolFlags = J->getTargetFlags();
1914 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
1915 AM.BlockAddr = BA->getBlockAddress();
1916 AM.SymbolFlags = BA->getTargetFlags();
1917 Offset = BA->getOffset();
1918 } else
1919 llvm_unreachable("Unhandled symbol reference node.");
1920
1921 // Can't use an addressing mode with large globals.
1922 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1923 TM.isLargeGlobalValue(GV: AM.GV)) {
1924 AM = Backup;
1925 return true;
1926 }
1927
1928 if (foldOffsetIntoAddress(Offset, AM)) {
1929 AM = Backup;
1930 return true;
1931 }
1932
1933 if (IsRIPRel)
1934 AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64));
1935
1936 // Commit the changes now that we know this fold is safe.
1937 return false;
1938}
1939
1940/// Add the specified node to the specified addressing mode, returning true if
1941/// it cannot be done. This just pattern matches for the addressing mode.
1942bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1943 if (matchAddressRecursively(N, AM, Depth: 0))
1944 return true;
1945
1946 // Post-processing: Make a second attempt to fold a load, if we now know
1947 // that there will not be any other register. This is only performed for
1948 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1949 // any foldable load the first time.
1950 if (Subtarget->isTarget64BitILP32() &&
1951 AM.BaseType == X86ISelAddressMode::RegBase &&
1952 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1953 SDValue Save_Base_Reg = AM.Base_Reg;
1954 if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
1955 AM.Base_Reg = SDValue();
1956 if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
1957 AM.Base_Reg = Save_Base_Reg;
1958 }
1959 }
1960
1961 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1962 // a smaller encoding and avoids a scaled-index.
1963 if (AM.Scale == 2 &&
1964 AM.BaseType == X86ISelAddressMode::RegBase &&
1965 AM.Base_Reg.getNode() == nullptr) {
1966 AM.Base_Reg = AM.IndexReg;
1967 AM.Scale = 1;
1968 }
1969
1970 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1971 // because it has a smaller encoding.
1972 if (TM.getCodeModel() != CodeModel::Large &&
1973 (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
1974 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1975 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1976 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1977 AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64);
1978 }
1979
1980 return false;
1981}
1982
1983bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1984 unsigned Depth) {
1985 // Add an artificial use to this node so that we can keep track of
1986 // it if it gets CSE'd with a different node.
1987 HandleSDNode Handle(N);
1988
1989 X86ISelAddressMode Backup = AM;
1990 if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
1991 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
1992 return false;
1993 AM = Backup;
1994
1995 // Try again after commutating the operands.
1996 if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
1997 Depth: Depth + 1) &&
1998 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
1999 return false;
2000 AM = Backup;
2001
2002 // If we couldn't fold both operands into the address at the same time,
2003 // see if we can just put each operand into a register and fold at least
2004 // the add.
2005 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2006 !AM.Base_Reg.getNode() &&
2007 !AM.IndexReg.getNode()) {
2008 N = Handle.getValue();
2009 AM.Base_Reg = N.getOperand(i: 0);
2010 AM.IndexReg = N.getOperand(i: 1);
2011 AM.Scale = 1;
2012 return false;
2013 }
2014 N = Handle.getValue();
2015 return true;
2016}
2017
2018// Insert a node into the DAG at least before the Pos node's position. This
2019// will reposition the node as needed, and will assign it a node ID that is <=
2020// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2021// IDs! The selection DAG must no longer depend on their uniqueness when this
2022// is used.
2023static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2024 if (N->getNodeId() == -1 ||
2025 (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
2026 SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
2027 DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
2028 // Mark Node as invalid for pruning as after this it may be a successor to a
2029 // selected node but otherwise be in the same position of Pos.
2030 // Conservatively mark it with the same -abs(Id) to assure node id
2031 // invariant is preserved.
2032 N->setNodeId(Pos->getNodeId());
2033 SelectionDAGISel::InvalidateNodeId(N: N.getNode());
2034 }
2035}
2036
2037// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2038// safe. This allows us to convert the shift and and into an h-register
2039// extract and a scaled index. Returns false if the simplification is
2040// performed.
2041static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2042 uint64_t Mask,
2043 SDValue Shift, SDValue X,
2044 X86ISelAddressMode &AM) {
2045 if (Shift.getOpcode() != ISD::SRL ||
2046 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2047 !Shift.hasOneUse())
2048 return true;
2049
2050 int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
2051 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2052 Mask != (0xffu << ScaleLog))
2053 return true;
2054
2055 MVT XVT = X.getSimpleValueType();
2056 MVT VT = N.getSimpleValueType();
2057 SDLoc DL(N);
2058 SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8);
2059 SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
2060 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
2061 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
2062 SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
2063 SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8);
2064 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);
2065
2066 // Insert the new nodes into the topological ordering. We must do this in
2067 // a valid topological ordering as nothing is going to go back and re-sort
2068 // these nodes. We continually insert before 'N' in sequence as this is
2069 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2070 // hierarchy left to express.
2071 insertDAGNode(DAG, Pos: N, N: Eight);
2072 insertDAGNode(DAG, Pos: N, N: NewMask);
2073 insertDAGNode(DAG, Pos: N, N: Srl);
2074 insertDAGNode(DAG, Pos: N, N: And);
2075 insertDAGNode(DAG, Pos: N, N: Ext);
2076 insertDAGNode(DAG, Pos: N, N: ShlCount);
2077 insertDAGNode(DAG, Pos: N, N: Shl);
2078 DAG.ReplaceAllUsesWith(From: N, To: Shl);
2079 DAG.RemoveDeadNode(N: N.getNode());
2080 AM.IndexReg = Ext;
2081 AM.Scale = (1 << ScaleLog);
2082 return false;
2083}
2084
2085// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2086// allows us to fold the shift into this addressing mode. Returns false if the
2087// transform succeeded.
2088static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2089 X86ISelAddressMode &AM) {
2090 SDValue Shift = N.getOperand(i: 0);
2091
2092 // Use a signed mask so that shifting right will insert sign bits. These
2093 // bits will be removed when we shift the result left so it doesn't matter
2094 // what we use. This might allow a smaller immediate encoding.
2095 int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();
2096
2097 // If we have an any_extend feeding the AND, look through it to see if there
2098 // is a shift behind it. But only if the AND doesn't use the extended bits.
2099 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2100 bool FoundAnyExtend = false;
2101 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2102 Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
2103 isUInt<32>(x: Mask)) {
2104 FoundAnyExtend = true;
2105 Shift = Shift.getOperand(i: 0);
2106 }
2107
2108 if (Shift.getOpcode() != ISD::SHL ||
2109 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2110 return true;
2111
2112 SDValue X = Shift.getOperand(i: 0);
2113
2114 // Not likely to be profitable if either the AND or SHIFT node has more
2115 // than one use (unless all uses are for address computation). Besides,
2116 // isel mechanism requires their node ids to be reused.
2117 if (!N.hasOneUse() || !Shift.hasOneUse())
2118 return true;
2119
2120 // Verify that the shift amount is something we can fold.
2121 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2122 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2123 return true;
2124
2125 MVT VT = N.getSimpleValueType();
2126 SDLoc DL(N);
2127 if (FoundAnyExtend) {
2128 SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
2129 insertDAGNode(DAG, Pos: N, N: NewX);
2130 X = NewX;
2131 }
2132
2133 SDValue NewMask = DAG.getConstant(Val: Mask >> ShiftAmt, DL, VT);
2134 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
2135 SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));
2136
2137 // Insert the new nodes into the topological ordering. We must do this in
2138 // a valid topological ordering as nothing is going to go back and re-sort
2139 // these nodes. We continually insert before 'N' in sequence as this is
2140 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2141 // hierarchy left to express.
2142 insertDAGNode(DAG, Pos: N, N: NewMask);
2143 insertDAGNode(DAG, Pos: N, N: NewAnd);
2144 insertDAGNode(DAG, Pos: N, N: NewShift);
2145 DAG.ReplaceAllUsesWith(From: N, To: NewShift);
2146 DAG.RemoveDeadNode(N: N.getNode());
2147
2148 AM.Scale = 1 << ShiftAmt;
2149 AM.IndexReg = NewAnd;
2150 return false;
2151}
2152
2153// Implement some heroics to detect shifts of masked values where the mask can
2154// be replaced by extending the shift and undoing that in the addressing mode
2155// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2156// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2157// the addressing mode. This results in code such as:
2158//
2159// int f(short *y, int *lookup_table) {
2160// ...
2161// return *y + lookup_table[*y >> 11];
2162// }
2163//
2164// Turning into:
2165// movzwl (%rdi), %eax
2166// movl %eax, %ecx
2167// shrl $11, %ecx
2168// addl (%rsi,%rcx,4), %eax
2169//
2170// Instead of:
2171// movzwl (%rdi), %eax
2172// movl %eax, %ecx
2173// shrl $9, %ecx
2174// andl $124, %rcx
2175// addl (%rsi,%rcx), %eax
2176//
2177// Note that this function assumes the mask is provided as a mask *after* the
2178// value is shifted. The input chain may or may not match that, but computing
2179// such a mask is trivial.
2180static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2181 uint64_t Mask,
2182 SDValue Shift, SDValue X,
2183 X86ISelAddressMode &AM) {
2184 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2185 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2186 return true;
2187
2188 // We need to ensure that mask is a continuous run of bits.
2189 unsigned MaskIdx, MaskLen;
2190 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2191 return true;
2192 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2193
2194 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2195
2196 // The amount of shift we're trying to fit into the addressing mode is taken
2197 // from the shifted mask index (number of trailing zeros of the mask).
2198 unsigned AMShiftAmt = MaskIdx;
2199
2200 // There is nothing we can do here unless the mask is removing some bits.
2201 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2202 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2203
2204 // Scale the leading zero count down based on the actual size of the value.
2205 // Also scale it down based on the size of the shift.
2206 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2207 if (MaskLZ < ScaleDown)
2208 return true;
2209 MaskLZ -= ScaleDown;
2210
2211 // The final check is to ensure that any masked out high bits of X are
2212 // already known to be zero. Otherwise, the mask has a semantic impact
2213 // other than masking out a couple of low bits. Unfortunately, because of
2214 // the mask, zero extensions will be removed from operands in some cases.
2215 // This code works extra hard to look through extensions because we can
2216 // replace them with zero extensions cheaply if necessary.
2217 bool ReplacingAnyExtend = false;
2218 if (X.getOpcode() == ISD::ANY_EXTEND) {
2219 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2220 X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
2221 // Assume that we'll replace the any-extend with a zero-extend, and
2222 // narrow the search to the extended value.
2223 X = X.getOperand(i: 0);
2224 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2225 ReplacingAnyExtend = true;
2226 }
2227 APInt MaskedHighBits =
2228 APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
2229 if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
2230 return true;
2231
2232 // We've identified a pattern that can be transformed into a single shift
2233 // and an addressing mode. Make it so.
2234 MVT VT = N.getSimpleValueType();
2235 if (ReplacingAnyExtend) {
2236 assert(X.getValueType() != VT);
2237 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2238 SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
2239 insertDAGNode(DAG, Pos: N, N: NewX);
2240 X = NewX;
2241 }
2242
2243 MVT XVT = X.getSimpleValueType();
2244 SDLoc DL(N);
2245 SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
2246 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2247 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
2248 SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
2249 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2250
2251 // Insert the new nodes into the topological ordering. We must do this in
2252 // a valid topological ordering as nothing is going to go back and re-sort
2253 // these nodes. We continually insert before 'N' in sequence as this is
2254 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2255 // hierarchy left to express.
2256 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2257 insertDAGNode(DAG, Pos: N, N: NewSRL);
2258 insertDAGNode(DAG, Pos: N, N: NewExt);
2259 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2260 insertDAGNode(DAG, Pos: N, N: NewSHL);
2261 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2262 DAG.RemoveDeadNode(N: N.getNode());
2263
2264 AM.Scale = 1 << AMShiftAmt;
2265 AM.IndexReg = NewExt;
2266 return false;
2267}
2268
2269// Transform "(X >> SHIFT) & (MASK << C1)" to
2270// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2271// matched to a BEXTR later. Returns false if the simplification is performed.
2272static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2273 uint64_t Mask,
2274 SDValue Shift, SDValue X,
2275 X86ISelAddressMode &AM,
2276 const X86Subtarget &Subtarget) {
2277 if (Shift.getOpcode() != ISD::SRL ||
2278 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2279 !Shift.hasOneUse() || !N.hasOneUse())
2280 return true;
2281
2282 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2283 if (!Subtarget.hasTBM() &&
2284 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2285 return true;
2286
2287 // We need to ensure that mask is a continuous run of bits.
2288 unsigned MaskIdx, MaskLen;
2289 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2290 return true;
2291
2292 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2293
2294 // The amount of shift we're trying to fit into the addressing mode is taken
2295 // from the shifted mask index (number of trailing zeros of the mask).
2296 unsigned AMShiftAmt = MaskIdx;
2297
2298 // There is nothing we can do here unless the mask is removing some bits.
2299 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2300 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2301
2302 MVT XVT = X.getSimpleValueType();
2303 MVT VT = N.getSimpleValueType();
2304 SDLoc DL(N);
2305 SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
2306 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2307 SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
2308 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
2309 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
2310 SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
2311 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2312
2313 // Insert the new nodes into the topological ordering. We must do this in
2314 // a valid topological ordering as nothing is going to go back and re-sort
2315 // these nodes. We continually insert before 'N' in sequence as this is
2316 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2317 // hierarchy left to express.
2318 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2319 insertDAGNode(DAG, Pos: N, N: NewSRL);
2320 insertDAGNode(DAG, Pos: N, N: NewMask);
2321 insertDAGNode(DAG, Pos: N, N: NewAnd);
2322 insertDAGNode(DAG, Pos: N, N: NewExt);
2323 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2324 insertDAGNode(DAG, Pos: N, N: NewSHL);
2325 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2326 DAG.RemoveDeadNode(N: N.getNode());
2327
2328 AM.Scale = 1 << AMShiftAmt;
2329 AM.IndexReg = NewExt;
2330 return false;
2331}
2332
2333// Attempt to peek further into a scaled index register, collecting additional
2334// extensions / offsets / etc. Returns /p N if we can't peek any further.
2335SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2336 X86ISelAddressMode &AM,
2337 unsigned Depth) {
2338 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2339 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2340 "Illegal index scale");
2341
2342 // Limit recursion.
2343 if (Depth >= SelectionDAG::MaxRecursionDepth)
2344 return N;
2345
2346 EVT VT = N.getValueType();
2347 unsigned Opc = N.getOpcode();
2348
2349 // index: add(x,c) -> index: x, disp + c
2350 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2351 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2352 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2353 if (!foldOffsetIntoAddress(Offset, AM))
2354 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2355 }
2356
2357 // index: add(x,x) -> index: x, scale * 2
2358 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2359 if (AM.Scale <= 4) {
2360 AM.Scale *= 2;
2361 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2362 }
2363 }
2364
2365 // index: shl(x,i) -> index: x, scale * (1 << i)
2366 if (Opc == X86ISD::VSHLI) {
2367 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2368 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2369 if ((AM.Scale * ScaleAmt) <= 8) {
2370 AM.Scale *= ScaleAmt;
2371 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2372 }
2373 }
2374
2375 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2376 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2377 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2378 SDValue Src = N.getOperand(i: 0);
2379 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2380 Src.hasOneUse()) {
2381 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2382 SDValue AddSrc = Src.getOperand(i: 0);
2383 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2384 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2385 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2386 SDLoc DL(N);
2387 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2388 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2389 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2390 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2391 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2392 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2393 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2394 CurDAG->RemoveDeadNode(N: N.getNode());
2395 return ExtSrc;
2396 }
2397 }
2398 }
2399 }
2400
2401 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2402 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2403 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2404 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2405 SDValue Src = N.getOperand(i: 0);
2406 unsigned SrcOpc = Src.getOpcode();
2407 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2408 CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) &&
2409 Src.hasOneUse()) {
2410 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2411 SDValue AddSrc = Src.getOperand(i: 0);
2412 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2413 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2414 SDLoc DL(N);
2415 SDValue Res;
2416 // If we're also scaling, see if we can use that as well.
2417 if (AddSrc.getOpcode() == ISD::SHL &&
2418 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2419 SDValue ShVal = AddSrc.getOperand(i: 0);
2420 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2421 APInt HiBits =
2422 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2423 uint64_t ScaleAmt = 1ULL << ShAmt;
2424 if ((AM.Scale * ScaleAmt) <= 8 &&
2425 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2426 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2427 AM.Scale *= ScaleAmt;
2428 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2429 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2430 N2: AddSrc.getOperand(i: 1));
2431 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2432 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2433 AddSrc = ExtShift;
2434 Res = ExtShVal;
2435 }
2436 }
2437 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2438 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2439 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2440 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2441 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2442 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2443 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2444 CurDAG->RemoveDeadNode(N: N.getNode());
2445 return Res ? Res : ExtSrc;
2446 }
2447 }
2448 }
2449 }
2450
2451 // TODO: Handle extensions, shifted masks etc.
2452 return N;
2453}
2454
/// Recursively try to fold \p N into the addressing mode \p AM.
///
/// Each opcode-specific case returns false as soon as it has folded N into AM.
/// A case that cannot handle N breaks out of the switch, and N is then handed
/// to matchAddressBase, whose result is returned unchanged.
///
/// \param N     The node to fold.
/// \param AM    The addressing mode built so far; updated in place.
/// \param Depth Current recursion depth; once it reaches
///              SelectionDAG::MaxRecursionDepth the switch is skipped and N
///              goes straight to matchAddressBase.
/// \returns false if N was folded into AM, true otherwise.
2455bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2456 unsigned Depth) {
2457 SDLoc dl(N);
2458 LLVM_DEBUG({
2459 dbgs() << "MatchAddress: ";
2460 AM.dump(CurDAG);
2461 });
2462 // Limit recursion.
2463 if (Depth >= SelectionDAG::MaxRecursionDepth)
2464 return matchAddressBase(N, AM);
2465
2466 // If this is already a %rip relative address, we can only merge immediates
2467 // into it. Instead of handling this in every case, we handle it here.
2468 // RIP relative addressing: %rip + 32-bit displacement!
2469 if (AM.isRIPRelative()) {
2470 // FIXME: JumpTable and ExternalSymbol address currently don't like
2471 // displacements. It isn't very important, but this should be fixed for
2472 // consistency.
2473 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2474 return true;
2475
2476 if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
2477 if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
2478 return false;
2479 return true;
2480 }
2481
2482 switch (N.getOpcode()) {
2483 default: break;
 // Only taken while AM has no symbolic displacement and a zero Disp.
2484 case ISD::LOCAL_RECOVER: {
2485 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2486 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
2487 // Use the symbol and don't prefix it.
2488 AM.MCSym = ESNode->getMCSymbol();
2489 return false;
2490 }
2491 break;
2492 }
 // A constant is offered to foldOffsetIntoAddress; a false result from it is
 // treated as success here.
2493 case ISD::Constant: {
2494 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2495 if (!foldOffsetIntoAddress(Offset: Val, AM))
2496 return false;
2497 break;
2498 }
2499
2500 case X86ISD::Wrapper:
2501 case X86ISD::WrapperRIP:
2502 if (!matchWrapper(N, AM))
2503 return false;
2504 break;
2505
2506 case ISD::LOAD:
2507 if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
2508 return false;
2509 break;
2510
 // A frame index is only taken as the base while the base slot is still
 // empty; in 64-bit mode the displacement gathered so far must also pass
 // isDispSafeForFrameIndex.
2511 case ISD::FrameIndex:
2512 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2513 AM.Base_Reg.getNode() == nullptr &&
2514 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(Val: AM.Disp))) {
2515 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2516 AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
2517 return false;
2518 }
2519 break;
2520
2521 case ISD::SHL:
 // Scale must not be used already.
2522 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2523 break;
2524
2525 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
2526 unsigned Val = CN->getZExtValue();
2527 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2528 // that the base operand remains free for further matching. If
2529 // the base doesn't end up getting used, a post-processing step
2530 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2531 if (Val == 1 || Val == 2 || Val == 3) {
2532 SDValue ShVal = N.getOperand(i: 0);
2533 AM.Scale = 1 << Val;
2534 AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
2535 return false;
2536 }
2537 }
2538 break;
2539
2540 case ISD::SRL: {
2541 // Scale must not be used already.
2542 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2543
2544 // We only handle up to 64-bit values here as those are what matter for
2545 // addressing mode optimizations.
2546 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2547 "Unexpected value size!");
2548
2549 SDValue And = N.getOperand(i: 0);
2550 if (And.getOpcode() != ISD::AND) break;
2551 SDValue X = And.getOperand(i: 0);
2552
2553 // The mask used for the transform is expected to be post-shift, but we
2554 // found the shift first so just apply the shift to the mask before passing
2555 // it down.
2556 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
2557 !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
2558 break;
2559 uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);
2560
2561 // Try to fold the mask and shift into the scale, and return false if we
2562 // succeed.
2563 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
2564 return false;
2565 break;
2566 }
2567
2568 case ISD::SMUL_LOHI:
2569 case ISD::UMUL_LOHI:
2570 // A mul_lohi where we need the low part can be folded as a plain multiply.
2571 if (N.getResNo() != 0) break;
2572 [[fallthrough]];
2573 case ISD::MUL:
2574 case X86ISD::MUL_IMM:
2575 // X*[3,5,9] -> X+X*[2,4,8]
2576 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2577 AM.Base_Reg.getNode() == nullptr &&
2578 AM.IndexReg.getNode() == nullptr) {
2579 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
2580 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2581 CN->getZExtValue() == 9) {
2582 AM.Scale = unsigned(CN->getZExtValue())-1;
2583
2584 SDValue MulVal = N.getOperand(i: 0);
2585 SDValue Reg;
2586
2587 // Okay, we know that we have a scale by now. However, if the scaled
2588 // value is an add of something and a constant, we can fold the
2589 // constant into the disp field here.
2590 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2591 isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
2592 Reg = MulVal.getOperand(i: 0);
2593 auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
 // (X + C) * K contributes C * K to the displacement. If that
 // offset is rejected, use the whole add as the register instead.
2594 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2595 if (foldOffsetIntoAddress(Offset: Disp, AM))
2596 Reg = N.getOperand(i: 0);
2597 } else {
2598 Reg = N.getOperand(i: 0);
2599 }
2600
2601 AM.IndexReg = AM.Base_Reg = Reg;
2602 return false;
2603 }
2604 }
2605 break;
2606
2607 case ISD::SUB: {
2608 // Given A-B, if A can be completely folded into the address and
2609 // the index field with the index field unused, use -B as the index.
2610 // This is a win if A has multiple parts that can be folded into
2611 // the address. Also, this saves a mov if the base register has
2612 // other uses, since it avoids a two-address sub instruction, however
2613 // it costs an additional mov if the index register has other uses.
2614
2615 // Add an artificial use to this node so that we can keep track of
2616 // it if it gets CSE'd with a different node.
2617 HandleSDNode Handle(N);
2618
2619 // Test if the LHS of the sub can be folded.
2620 X86ISelAddressMode Backup = AM;
2621 if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
2622 N = Handle.getValue();
2623 AM = Backup;
2624 break;
2625 }
2626 N = Handle.getValue();
2627 // Test if the index field is free for use.
2628 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2629 AM = Backup;
2630 break;
2631 }
2632
2633 int Cost = 0;
2634 SDValue RHS = N.getOperand(i: 1);
2635 // If the RHS involves a register with multiple uses, this
2636 // transformation incurs an extra mov, due to the neg instruction
2637 // clobbering its operand.
2638 if (!RHS.getNode()->hasOneUse() ||
2639 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2640 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2641 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2642 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2643 RHS.getOperand(i: 0).getValueType() == MVT::i32))
2644 ++Cost;
2645 // If the base is a register with multiple uses, this
2646 // transformation may save a mov.
2647 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2648 !AM.Base_Reg.getNode()->hasOneUse()) ||
2649 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2650 --Cost;
2651 // If the folded LHS was interesting, this transformation saves
2652 // address arithmetic.
 // "Interesting" means the LHS newly contributed at least two of: a
 // symbolic displacement, a non-zero displacement, a segment.
2653 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2654 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2655 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2656 --Cost;
2657 // If it doesn't look like it may be an overall win, don't do it.
2658 if (Cost >= 0) {
2659 AM = Backup;
2660 break;
2661 }
2662
2663 // Ok, the transformation is legal and appears profitable. Go for it.
2664 // Negation will be emitted later to avoid creating dangling nodes if this
2665 // was an unprofitable LEA.
2666 AM.IndexReg = RHS;
2667 AM.NegateIndex = true;
2668 AM.Scale = 1;
2669 return false;
2670 }
2671
2672 case ISD::OR:
2673 case ISD::XOR:
2674 // See if we can treat the OR/XOR node as an ADD node.
2675 if (!CurDAG->isADDLike(Op: N))
2676 break;
2677 [[fallthrough]];
2678 case ISD::ADD:
2679 if (!matchAdd(N, AM, Depth))
2680 return false;
2681 break;
2682
2683 case ISD::AND: {
2684 // Perform some heroic transforms on an and of a constant-count shift
2685 // with a constant to enable use of the scaled offset field.
2686
2687 // Scale must not be used already.
2688 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2689
2690 // We only handle up to 64-bit values here as those are what matter for
2691 // addressing mode optimizations.
2692 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2693 "Unexpected value size!");
2694
2695 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
2696 break;
2697
2698 if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
2699 SDValue Shift = N.getOperand(i: 0);
2700 SDValue X = Shift.getOperand(i: 0);
2701
2702 uint64_t Mask = N.getConstantOperandVal(i: 1);
2703
2704 // Try to fold the mask and shift into an extract and scale.
2705 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2706 return false;
2707
2708 // Try to fold the mask and shift directly into the scale.
2709 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2710 return false;
2711
2712 // Try to fold the mask and shift into BEXTR and scale.
2713 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
2714 return false;
2715 }
2716
2717 // Try to swap the mask and shift to place shifts which can be done as
2718 // a scale on the outside of the mask.
2719 if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
2720 return false;
2721
2722 break;
2723 }
2724 case ISD::ZERO_EXTEND: {
2725 // Try to widen a zexted shift left to the same size as its use, so we can
2726 // match the shift as a scale factor.
2727 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2728 break;
2729
2730 SDValue Src = N.getOperand(i: 0);
2731
2732 // See if we can match a zext(addlike(x,c)).
2733 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2734 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2735 if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
2736 if (Index != N) {
2737 AM.IndexReg = Index;
2738 return false;
2739 }
2740
2741 // Peek through mask: zext(and(shl(x,c1),c2))
2742 APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
2743 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2744 if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
2745 Mask = MaskC->getAPIntValue();
2746 Src = Src.getOperand(i: 0);
2747 }
2748
2749 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2750 // Give up if the shift is not a valid scale factor [1,2,3].
2751 SDValue ShlSrc = Src.getOperand(i: 0);
2752 SDValue ShlAmt = Src.getOperand(i: 1);
2753 auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
2754 if (!ShAmtC)
2755 break;
2756 unsigned ShAmtV = ShAmtC->getZExtValue();
2757 if (ShAmtV > 3)
2758 break;
2759
2760 // The narrow shift must only shift out zero bits (it must be 'nuw').
2761 // That makes it safe to widen to the destination type.
2762 APInt HighZeros =
2763 APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
2764 if (!Src->getFlags().hasNoUnsignedWrap() &&
2765 !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
2766 break;
2767
2768 // zext (shl nuw i8 %x, C1) to i32
2769 // --> shl (zext i8 %x to i32), (zext C1)
2770 // zext (and (shl nuw i8 %x, C1), C2) to i32
2771 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2772 MVT SrcVT = ShlSrc.getSimpleValueType();
2773 MVT VT = N.getSimpleValueType();
2774 SDLoc DL(N);
2775
2776 SDValue Res = ShlSrc;
2777 if (!Mask.isAllOnes()) {
2778 Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
2779 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2780 Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
2781 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2782 }
2783 SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
2784 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
2785 SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
2786 insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
2787 CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
2788 CurDAG->RemoveDeadNode(N: N.getNode());
2789
2790 // Convert the shift to scale factor.
2791 AM.Scale = 1 << ShAmtV;
2792 // If matchIndexRecursively is not called here,
2793 // Zext may be replaced by other nodes but later used to call a builder
2794 // method
2795 AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
2796 return false;
2797 }
2798
2799 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2800 // Try to fold the mask and shift into an extract and scale.
2801 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2802 X: Src.getOperand(i: 0), AM))
2803 return false;
2804
2805 // Try to fold the mask and shift directly into the scale.
2806 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2807 X: Src.getOperand(i: 0), AM))
2808 return false;
2809
2810 // Try to fold the mask and shift into BEXTR and scale.
2811 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2812 X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
2813 return false;
2814 }
2815
2816 break;
2817 }
2818 }
2819
 // No case claimed N: place it in a register slot without looking through it.
2820 return matchAddressBase(N, AM);
2821}
2822
2823/// Helper for MatchAddress. Add the specified node to the
2824/// specified addressing mode without any further recursion.
2825bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2826 // Is the base register already occupied?
2827 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2828 // If so, check to see if the scale index register is set.
2829 if (!AM.IndexReg.getNode()) {
2830 AM.IndexReg = N;
2831 AM.Scale = 1;
2832 return false;
2833 }
2834
2835 // Otherwise, we cannot select it.
2836 return true;
2837 }
2838
2839 // Default, generate it as a register.
2840 AM.BaseType = X86ISelAddressMode::RegBase;
2841 AM.Base_Reg = N;
2842 return false;
2843}
2844
2845bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2846 X86ISelAddressMode &AM,
2847 unsigned Depth) {
2848 SDLoc dl(N);
2849 LLVM_DEBUG({
2850 dbgs() << "MatchVectorAddress: ";
2851 AM.dump(CurDAG);
2852 });
2853 // Limit recursion.
2854 if (Depth >= SelectionDAG::MaxRecursionDepth)
2855 return matchAddressBase(N, AM);
2856
2857 // TODO: Support other operations.
2858 switch (N.getOpcode()) {
2859 case ISD::Constant: {
2860 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2861 if (!foldOffsetIntoAddress(Offset: Val, AM))
2862 return false;
2863 break;
2864 }
2865 case X86ISD::Wrapper:
2866 if (!matchWrapper(N, AM))
2867 return false;
2868 break;
2869 case ISD::ADD: {
2870 // Add an artificial use to this node so that we can keep track of
2871 // it if it gets CSE'd with a different node.
2872 HandleSDNode Handle(N);
2873
2874 X86ISelAddressMode Backup = AM;
2875 if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
2876 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2877 Depth: Depth + 1))
2878 return false;
2879 AM = Backup;
2880
2881 // Try again after commuting the operands.
2882 if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2883 Depth: Depth + 1) &&
2884 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
2885 Depth: Depth + 1))
2886 return false;
2887 AM = Backup;
2888
2889 N = Handle.getValue();
2890 break;
2891 }
2892 }
2893
2894 return matchAddressBase(N, AM);
2895}
2896
2897/// Helper for selectVectorAddr. Handles things that can be folded into a
2898/// gather/scatter address. The index register and scale should have already
2899/// been handled.
2900bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2901 return matchVectorAddressRecursively(N, AM, Depth: 0);
2902}
2903
2904bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2905 SDValue IndexOp, SDValue ScaleOp,
2906 SDValue &Base, SDValue &Scale,
2907 SDValue &Index, SDValue &Disp,
2908 SDValue &Segment) {
2909 X86ISelAddressMode AM;
2910 AM.Scale = ScaleOp->getAsZExtVal();
2911
2912 // Attempt to match index patterns, as long as we're not relying on implicit
2913 // sign-extension, which is performed BEFORE scale.
2914 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2915 AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
2916 else
2917 AM.IndexReg = IndexOp;
2918
2919 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2920 if (AddrSpace == X86AS::GS)
2921 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
2922 if (AddrSpace == X86AS::FS)
2923 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
2924 if (AddrSpace == X86AS::SS)
2925 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
2926
2927 SDLoc DL(BasePtr);
2928 MVT VT = BasePtr.getSimpleValueType();
2929
2930 // Try to match into the base and displacement fields.
2931 if (matchVectorAddress(N: BasePtr, AM))
2932 return false;
2933
2934 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2935 return true;
2936}
2937
2938/// Returns true if it is able to pattern match an addressing mode.
2939/// It returns the operands which make up the maximal addressing mode it can
2940/// match by reference.
2941///
2942/// Parent is the parent node of the addr operand that is being matched. It
2943/// is always a load, store, atomic node, or null. It is only null when
2944/// checking memory operands for inline asm nodes.
2945bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2946 SDValue &Scale, SDValue &Index,
2947 SDValue &Disp, SDValue &Segment) {
2948 X86ISelAddressMode AM;
2949
2950 if (Parent &&
2951 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2952 // that are not a MemSDNode, and thus don't have proper addrspace info.
2953 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2954 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2955 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2956 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2957 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2958 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2959 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2960 unsigned AddrSpace =
2961 cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
2962 if (AddrSpace == X86AS::GS)
2963 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
2964 if (AddrSpace == X86AS::FS)
2965 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
2966 if (AddrSpace == X86AS::SS)
2967 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
2968 }
2969
2970 // Save the DL and VT before calling matchAddress, it can invalidate N.
2971 SDLoc DL(N);
2972 MVT VT = N.getSimpleValueType();
2973
2974 if (matchAddress(N, AM))
2975 return false;
2976
2977 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2978 return true;
2979}
2980
2981bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2982 // Cannot use 32 bit constants to reference objects in kernel/large code
2983 // model.
2984 if (TM.getCodeModel() == CodeModel::Kernel ||
2985 TM.getCodeModel() == CodeModel::Large)
2986 return false;
2987
2988 // In static codegen with small code model, we can get the address of a label
2989 // into a register with 'movl'
2990 if (N->getOpcode() != X86ISD::Wrapper)
2991 return false;
2992
2993 N = N.getOperand(i: 0);
2994
2995 // At least GNU as does not accept 'movl' for TPOFF relocations.
2996 // FIXME: We could use 'movl' when we know we are targeting MC.
2997 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2998 return false;
2999
3000 Imm = N;
3001 // Small/medium code model can reference non-TargetGlobalAddress objects with
3002 // 32 bit constants.
3003 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3004 return TM.getCodeModel() == CodeModel::Small ||
3005 TM.getCodeModel() == CodeModel::Medium;
3006 }
3007
3008 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
3009 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3010 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
3011
3012 return !TM.isLargeGlobalValue(GV);
3013}
3014
3015bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3016 SDValue &Scale, SDValue &Index,
3017 SDValue &Disp, SDValue &Segment) {
3018 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3019 SDLoc DL(N);
3020
3021 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3022 return false;
3023
3024 auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
3025 if (RN && RN->getReg() == 0)
3026 Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
3027 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Val: Base)) {
3028 // Base could already be %rip, particularly in the x32 ABI.
3029 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
3030 VT: MVT::i64), 0);
3031 Base = CurDAG->getTargetInsertSubreg(SRIdx: X86::sub_32bit, DL, VT: MVT::i64, Operand: ImplDef,
3032 Subreg: Base);
3033 }
3034
3035 RN = dyn_cast<RegisterSDNode>(Val&: Index);
3036 if (RN && RN->getReg() == 0)
3037 Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
3038 else {
3039 assert(Index.getValueType() == MVT::i32 &&
3040 "Expect to be extending 32-bit registers for use in LEA");
3041 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
3042 VT: MVT::i64), 0);
3043 Index = CurDAG->getTargetInsertSubreg(SRIdx: X86::sub_32bit, DL, VT: MVT::i64, Operand: ImplDef,
3044 Subreg: Index);
3045 }
3046
3047 return true;
3048}
3049
3050/// Calls SelectAddr and determines if the maximal addressing
3051/// mode it matches can be cost effectively emitted as an LEA instruction.
3052bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3053 SDValue &Base, SDValue &Scale,
3054 SDValue &Index, SDValue &Disp,
3055 SDValue &Segment) {
3056 X86ISelAddressMode AM;
3057
3058 // Save the DL and VT before calling matchAddress, it can invalidate N.
3059 SDLoc DL(N);
3060 MVT VT = N.getSimpleValueType();
3061
3062 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3063 // segments.
3064 SDValue Copy = AM.Segment;
3065 SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32);
3066 AM.Segment = T;
3067 if (matchAddress(N, AM))
3068 return false;
3069 assert (T == AM.Segment);
3070 AM.Segment = Copy;
3071
3072 unsigned Complexity = 0;
3073 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3074 Complexity = 1;
3075 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3076 Complexity = 4;
3077
3078 if (AM.IndexReg.getNode())
3079 Complexity++;
3080
3081 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3082 // a simple shift.
3083 if (AM.Scale > 1)
3084 Complexity++;
3085
3086 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3087 // to a LEA. This is determined with some experimentation but is by no means
3088 // optimal (especially for code size consideration). LEA is nice because of
3089 // its three-address nature. Tweak the cost function again when we can run
3090 // convertToThreeAddress() at register allocation time.
3091 if (AM.hasSymbolicDisplacement()) {
3092 // For X86-64, always use LEA to materialize RIP-relative addresses.
3093 if (Subtarget->is64Bit())
3094 Complexity = 4;
3095 else
3096 Complexity += 2;
3097 }
3098
3099 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3100 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3101 // duplicating flag-producing instructions later in the pipeline.
3102 if (N.getOpcode() == ISD::ADD) {
3103 auto isMathWithFlags = [](SDValue V) {
3104 switch (V.getOpcode()) {
3105 case X86ISD::ADD:
3106 case X86ISD::SUB:
3107 case X86ISD::ADC:
3108 case X86ISD::SBB:
3109 case X86ISD::SMUL:
3110 case X86ISD::UMUL:
3111 /* TODO: These opcodes can be added safely, but we may want to justify
3112 their inclusion for different reasons (better for reg-alloc).
3113 case X86ISD::OR:
3114 case X86ISD::XOR:
3115 case X86ISD::AND:
3116 */
3117 // Value 1 is the flag output of the node - verify it's not dead.
3118 return !SDValue(V.getNode(), 1).use_empty();
3119 default:
3120 return false;
3121 }
3122 };
3123 // TODO: We might want to factor in whether there's a load folding
3124 // opportunity for the math op that disappears with LEA.
3125 if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
3126 Complexity++;
3127 }
3128
3129 if (AM.Disp)
3130 Complexity++;
3131
3132 // If it isn't worth using an LEA, reject it.
3133 if (Complexity <= 2)
3134 return false;
3135
3136 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3137 return true;
3138}
3139
3140/// This is only run on TargetGlobalTLSAddress nodes.
3141bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3142 SDValue &Scale, SDValue &Index,
3143 SDValue &Disp, SDValue &Segment) {
3144 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3145 N.getOpcode() == ISD::TargetExternalSymbol);
3146
3147 X86ISelAddressMode AM;
3148 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3149 AM.GV = GA->getGlobal();
3150 AM.Disp += GA->getOffset();
3151 AM.SymbolFlags = GA->getTargetFlags();
3152 } else {
3153 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3154 AM.ES = SA->getSymbol();
3155 AM.SymbolFlags = SA->getTargetFlags();
3156 }
3157
3158 if (Subtarget->is32Bit()) {
3159 AM.Scale = 1;
3160 AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32);
3161 }
3162
3163 MVT VT = N.getSimpleValueType();
3164 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3165 return true;
3166}
3167
3168bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3169 // Keep track of the original value type and whether this value was
3170 // truncated. If we see a truncation from pointer type to VT that truncates
3171 // bits that are known to be zero, we can use a narrow reference.
3172 EVT VT = N.getValueType();
3173 bool WasTruncated = false;
3174 if (N.getOpcode() == ISD::TRUNCATE) {
3175 WasTruncated = true;
3176 N = N.getOperand(i: 0);
3177 }
3178
3179 if (N.getOpcode() != X86ISD::Wrapper)
3180 return false;
3181
3182 // We can only use non-GlobalValues as immediates if they were not truncated,
3183 // as we do not have any range information. If we have a GlobalValue and the
3184 // address was not truncated, we can select it as an operand directly.
3185 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3186 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3187 Op = N.getOperand(i: 0);
3188 // We can only select the operand directly if we didn't have to look past a
3189 // truncate.
3190 return !WasTruncated;
3191 }
3192
3193 // Check that the global's range fits into VT.
3194 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3195 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3196 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3197 return false;
3198
3199 // Okay, we can use a narrow reference.
3200 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3201 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3202 return true;
3203}
3204
3205bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3206 SDValue &Base, SDValue &Scale,
3207 SDValue &Index, SDValue &Disp,
3208 SDValue &Segment) {
3209 assert(Root && P && "Unknown root/parent nodes");
3210 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3211 !IsProfitableToFold(N, U: P, Root) ||
3212 !IsLegalToFold(N, U: P, Root, OptLevel))
3213 return false;
3214
3215 return selectAddr(Parent: N.getNode(),
3216 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3217}
3218
3219bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3220 SDValue &Base, SDValue &Scale,
3221 SDValue &Index, SDValue &Disp,
3222 SDValue &Segment) {
3223 assert(Root && P && "Unknown root/parent nodes");
3224 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3225 !IsProfitableToFold(N, U: P, Root) ||
3226 !IsLegalToFold(N, U: P, Root, OptLevel))
3227 return false;
3228
3229 return selectAddr(Parent: N.getNode(),
3230 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3231}
3232
3233/// Return an SDNode that returns the value of the global base register.
3234/// Output instructions required to initialize the global base register,
3235/// if necessary.
3236SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3237 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3238 auto &DL = MF->getDataLayout();
3239 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3240}
3241
3242bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3243 if (N->getOpcode() == ISD::TRUNCATE)
3244 N = N->getOperand(Num: 0).getNode();
3245 if (N->getOpcode() != X86ISD::Wrapper)
3246 return false;
3247
3248 auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
3249 if (!GA)
3250 return false;
3251
3252 auto *GV = GA->getGlobal();
3253 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3254 if (CR)
3255 return CR->getSignedMin().sge(RHS: -1ull << Width) &&
3256 CR->getSignedMax().slt(RHS: 1ull << Width);
3257 // In the kernel code model, globals are in the negative 2GB of the address
3258 // space, so globals can be a sign extended 32-bit immediate.
3259 // In other code models, small globals are in the low 2GB of the address
3260 // space, so sign extending them is equivalent to zero extending them.
3261 return Width == 32 && !TM.isLargeGlobalValue(GV);
3262}
3263
3264X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3265 assert(N->isMachineOpcode() && "Unexpected node");
3266 unsigned Opc = N->getMachineOpcode();
3267 const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc);
3268 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3269 if (CondNo < 0)
3270 return X86::COND_INVALID;
3271
3272 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3273}
3274
3275/// Test whether the given X86ISD::CMP node has any users that use a flag
3276/// other than ZF.
3277bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3278 // Examine each user of the node.
3279 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3280 UI != UE; ++UI) {
3281 // Only check things that use the flags.
3282 if (UI.getUse().getResNo() != Flags.getResNo())
3283 continue;
3284 // Only examine CopyToReg uses that copy to EFLAGS.
3285 if (UI->getOpcode() != ISD::CopyToReg ||
3286 cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3287 return false;
3288 // Examine each user of the CopyToReg use.
3289 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3290 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3291 // Only examine the Flag result.
3292 if (FlagUI.getUse().getResNo() != 1) continue;
3293 // Anything unusual: assume conservatively.
3294 if (!FlagUI->isMachineOpcode()) return false;
3295 // Examine the condition code of the user.
3296 X86::CondCode CC = getCondFromNode(N: *FlagUI);
3297
3298 switch (CC) {
3299 // Comparisons which only use the zero flag.
3300 case X86::COND_E: case X86::COND_NE:
3301 continue;
3302 // Anything else: assume conservatively.
3303 default:
3304 return false;
3305 }
3306 }
3307 }
3308 return true;
3309}
3310
3311/// Test whether the given X86ISD::CMP node has any uses which require the SF
3312/// flag to be accurate.
3313bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3314 // Examine each user of the node.
3315 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3316 UI != UE; ++UI) {
3317 // Only check things that use the flags.
3318 if (UI.getUse().getResNo() != Flags.getResNo())
3319 continue;
3320 // Only examine CopyToReg uses that copy to EFLAGS.
3321 if (UI->getOpcode() != ISD::CopyToReg ||
3322 cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3323 return false;
3324 // Examine each user of the CopyToReg use.
3325 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3326 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3327 // Only examine the Flag result.
3328 if (FlagUI.getUse().getResNo() != 1) continue;
3329 // Anything unusual: assume conservatively.
3330 if (!FlagUI->isMachineOpcode()) return false;
3331 // Examine the condition code of the user.
3332 X86::CondCode CC = getCondFromNode(N: *FlagUI);
3333
3334 switch (CC) {
3335 // Comparisons which don't examine the SF flag.
3336 case X86::COND_A: case X86::COND_AE:
3337 case X86::COND_B: case X86::COND_BE:
3338 case X86::COND_E: case X86::COND_NE:
3339 case X86::COND_O: case X86::COND_NO:
3340 case X86::COND_P: case X86::COND_NP:
3341 continue;
3342 // Anything else: assume conservatively.
3343 default:
3344 return false;
3345 }
3346 }
3347 }
3348 return true;
3349}
3350
3351static bool mayUseCarryFlag(X86::CondCode CC) {
3352 switch (CC) {
3353 // Comparisons which don't examine the CF flag.
3354 case X86::COND_O: case X86::COND_NO:
3355 case X86::COND_E: case X86::COND_NE:
3356 case X86::COND_S: case X86::COND_NS:
3357 case X86::COND_P: case X86::COND_NP:
3358 case X86::COND_L: case X86::COND_GE:
3359 case X86::COND_G: case X86::COND_LE:
3360 return false;
3361 // Anything else: assume conservatively.
3362 default:
3363 return true;
3364 }
3365}
3366
3367/// Test whether the given node which sets flags has any uses which require the
3368/// CF flag to be accurate.
3369 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3370 // Examine each user of the node.
3371 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3372 UI != UE; ++UI) {
3373 // Only check things that use the flags.
3374 if (UI.getUse().getResNo() != Flags.getResNo())
3375 continue;
3376
3377 unsigned UIOpc = UI->getOpcode();
3378
3379 if (UIOpc == ISD::CopyToReg) {
3380 // Only examine CopyToReg uses that copy to EFLAGS.
3381 if (cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3382 return false;
3383 // Examine each user of the CopyToReg use.
3384 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3385 FlagUI != FlagUE; ++FlagUI) {
3386 // Only examine the Flag result.
3387 if (FlagUI.getUse().getResNo() != 1)
3388 continue;
3389 // Anything unusual: assume conservatively.
3390 if (!FlagUI->isMachineOpcode())
3391 return false;
3392 // Examine the condition code of the user.
3393 X86::CondCode CC = getCondFromNode(N: *FlagUI);
3394
3395 if (mayUseCarryFlag(CC))
3396 return false;
3397 }
3398
3399 // This CopyToReg is ok. Move on to the next user.
3400 continue;
3401 }
3402
3403 // This might be an unselected node. So look for the pre-isel opcodes that
3404 // use flags.
3405 unsigned CCOpNo;
3406 switch (UIOpc) {
3407 default:
3408 // Something unusual. Be conservative.
3409 return false;
3410 case X86ISD::SETCC: CCOpNo = 0; break;
3411 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3412 case X86ISD::CMOV: CCOpNo = 2; break;
3413 case X86ISD::BRCOND: CCOpNo = 2; break;
3414 }
3415
3416 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(Num: CCOpNo);
3417 if (mayUseCarryFlag(CC))
3418 return false;
3419 }
3420 return true;
3421}
3422
3423/// Check whether or not the chain ending in StoreNode is suitable for doing
3424/// the {load; op; store} to modify transformation.
3425static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3426 SDValue StoredVal, SelectionDAG *CurDAG,
3427 unsigned LoadOpNo,
3428 LoadSDNode *&LoadNode,
3429 SDValue &InputChain) {
3430 // Is the stored value result 0 of the operation?
3431 if (StoredVal.getResNo() != 0) return false;
3432
3433 // Are there other uses of the operation other than the store?
3434 if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;
3435
3436 // Is the store non-extending and non-indexed?
3437 if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
3438 return false;
3439
3440 SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
3441 // Is the stored value a non-extending and non-indexed load?
3442 if (!ISD::isNormalLoad(N: Load.getNode())) return false;
3443
3444 // Return LoadNode by reference.
3445 LoadNode = cast<LoadSDNode>(Val&: Load);
3446
3447 // Is store the only read of the loaded value?
3448 if (!Load.hasOneUse())
3449 return false;
3450
3451 // Is the address of the store the same as the load?
3452 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3453 LoadNode->getOffset() != StoreNode->getOffset())
3454 return false;
3455
3456 bool FoundLoad = false;
3457 SmallVector<SDValue, 4> ChainOps;
3458 SmallVector<const SDNode *, 4> LoopWorklist;
3459 SmallPtrSet<const SDNode *, 16> Visited;
3460 const unsigned int Max = 1024;
3461
3462 // Visualization of Load-Op-Store fusion:
3463 // -------------------------
3464 // Legend:
3465 // *-lines = Chain operand dependencies.
3466 // |-lines = Normal operand dependencies.
3467 // Dependencies flow down and right. n-suffix references multiple nodes.
3468 //
3469 // C Xn C
3470 // * * *
3471 // * * *
3472 // Xn A-LD Yn TF Yn
3473 // * * \ | * |
3474 // * * \ | * |
3475 // * * \ | => A--LD_OP_ST
3476 // * * \| \
3477 // TF OP \
3478 // * | \ Zn
3479 // * | \
3480 // A-ST Zn
3481 //
3482
3483 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3484 // #2: Yn -> LD
3485 // #3: ST -> Zn
3486
3487 // Ensure the transform is safe by checking for the dual
3488 // dependencies to make sure we do not induce a loop.
3489
3490 // As LD is a predecessor to both OP and ST we can do this by checking:
3491 // a). if LD is a predecessor to a member of Xn or Yn.
3492 // b). if a Zn is a predecessor to ST.
3493
3494 // However, (b) can only occur through being a chain predecessor to
3495 // ST, which is the same as Zn being a member or predecessor of Xn,
3496 // which is a subset of LD being a predecessor of Xn. So it's
3497 // subsumed by check (a).
3498
3499 SDValue Chain = StoreNode->getChain();
3500
3501 // Gather X elements in ChainOps.
3502 if (Chain == Load.getValue(R: 1)) {
3503 FoundLoad = true;
3504 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3505 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3506 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3507 SDValue Op = Chain.getOperand(i);
3508 if (Op == Load.getValue(R: 1)) {
3509 FoundLoad = true;
3510 // Drop Load, but keep its chain. No cycle check necessary.
3511 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3512 continue;
3513 }
3514 LoopWorklist.push_back(Elt: Op.getNode());
3515 ChainOps.push_back(Elt: Op);
3516 }
3517 }
3518
3519 if (!FoundLoad)
3520 return false;
3521
3522 // Worklist is currently Xn. Add Yn to worklist.
3523 for (SDValue Op : StoredVal->ops())
3524 if (Op.getNode() != LoadNode)
3525 LoopWorklist.push_back(Elt: Op.getNode());
3526
3527 // Check (a) if Load is a predecessor to Xn + Yn
3528 if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
3529 TopologicalPrune: true))
3530 return false;
3531
3532 InputChain =
3533 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps);
3534 return true;
3535}
3536
3537// Change a chain of {load; op; store} of the same value into a simple op
3538// through memory of that value, if the uses of the modified value and its
3539// address are suitable.
3540//
3541// The tablegen pattern memory operand pattern is currently not able to match
3542// the case where the EFLAGS on the original operation are used.
3543//
3544// To move this to tablegen, we'll need to improve tablegen to allow flags to
3545// be transferred from a node in the pattern to the result node, probably with
3546// a new keyword. For example, we have this
3547// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3548// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3549// (implicit EFLAGS)]>;
3550// but maybe need something like this
3551// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3552// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3553// (transferrable EFLAGS)]>;
3554//
3555// Until then, we manually fold these and instruction select the operation
3556// here.
3557bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3558 auto *StoreNode = cast<StoreSDNode>(Val: Node);
3559 SDValue StoredVal = StoreNode->getOperand(Num: 1);
3560 unsigned Opc = StoredVal->getOpcode();
3561
3562 // Before we try to select anything, make sure this is memory operand size
3563 // and opcode we can handle. Note that this must match the code below that
3564 // actually lowers the opcodes.
3565 EVT MemVT = StoreNode->getMemoryVT();
3566 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3567 MemVT != MVT::i8)
3568 return false;
3569
3570 bool IsCommutable = false;
3571 bool IsNegate = false;
3572 switch (Opc) {
3573 default:
3574 return false;
3575 case X86ISD::SUB:
3576 IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
3577 break;
3578 case X86ISD::SBB:
3579 break;
3580 case X86ISD::ADD:
3581 case X86ISD::ADC:
3582 case X86ISD::AND:
3583 case X86ISD::OR:
3584 case X86ISD::XOR:
3585 IsCommutable = true;
3586 break;
3587 }
3588
3589 unsigned LoadOpNo = IsNegate ? 1 : 0;
3590 LoadSDNode *LoadNode = nullptr;
3591 SDValue InputChain;
3592 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3593 LoadNode, InputChain)) {
3594 if (!IsCommutable)
3595 return false;
3596
3597 // This operation is commutable, try the other operand.
3598 LoadOpNo = 1;
3599 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3600 LoadNode, InputChain))
3601 return false;
3602 }
3603
3604 SDValue Base, Scale, Index, Disp, Segment;
3605 if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3606 Segment))
3607 return false;
3608
3609 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3610 unsigned Opc8) {
3611 switch (MemVT.getSimpleVT().SimpleTy) {
3612 case MVT::i64:
3613 return Opc64;
3614 case MVT::i32:
3615 return Opc32;
3616 case MVT::i16:
3617 return Opc16;
3618 case MVT::i8:
3619 return Opc8;
3620 default:
3621 llvm_unreachable("Invalid size!");
3622 }
3623 };
3624
3625 MachineSDNode *Result;
3626 switch (Opc) {
3627 case X86ISD::SUB:
3628 // Handle negate.
3629 if (IsNegate) {
3630 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3631 X86::NEG8m);
3632 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3633 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
3634 VT2: MVT::Other, Ops);
3635 break;
3636 }
3637 [[fallthrough]];
3638 case X86ISD::ADD:
3639 // Try to match inc/dec.
3640 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3641 bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
3642 bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
3643 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
3644 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3645 unsigned NewOpc =
3646 ((Opc == X86ISD::ADD) == IsOne)
3647 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3648 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3649 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3650 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
3651 VT2: MVT::Other, Ops);
3652 break;
3653 }
3654 }
3655 [[fallthrough]];
3656 case X86ISD::ADC:
3657 case X86ISD::SBB:
3658 case X86ISD::AND:
3659 case X86ISD::OR:
3660 case X86ISD::XOR: {
3661 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3662 switch (Opc) {
3663 case X86ISD::ADD:
3664 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3665 X86::ADD8mr);
3666 case X86ISD::ADC:
3667 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3668 X86::ADC8mr);
3669 case X86ISD::SUB:
3670 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3671 X86::SUB8mr);
3672 case X86ISD::SBB:
3673 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3674 X86::SBB8mr);
3675 case X86ISD::AND:
3676 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3677 X86::AND8mr);
3678 case X86ISD::OR:
3679 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3680 case X86ISD::XOR:
3681 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3682 X86::XOR8mr);
3683 default:
3684 llvm_unreachable("Invalid opcode!");
3685 }
3686 };
3687 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3688 switch (Opc) {
3689 case X86ISD::ADD:
3690 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3691 X86::ADD8mi);
3692 case X86ISD::ADC:
3693 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3694 X86::ADC8mi);
3695 case X86ISD::SUB:
3696 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3697 X86::SUB8mi);
3698 case X86ISD::SBB:
3699 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3700 X86::SBB8mi);
3701 case X86ISD::AND:
3702 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3703 X86::AND8mi);
3704 case X86ISD::OR:
3705 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3706 X86::OR8mi);
3707 case X86ISD::XOR:
3708 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3709 X86::XOR8mi);
3710 default:
3711 llvm_unreachable("Invalid opcode!");
3712 }
3713 };
3714
3715 unsigned NewOpc = SelectRegOpcode(Opc);
3716 SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);
3717
3718 // See if the operand is a constant that we can fold into an immediate
3719 // operand.
3720 if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
3721 int64_t OperandV = OperandC->getSExtValue();
3722
3723 // Check if we can shrink the operand enough to fit in an immediate (or
3724 // fit into a smaller immediate) by negating it and switching the
3725 // operation.
3726 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3727 ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) ||
3728 (MemVT == MVT::i64 && !isInt<32>(x: OperandV) &&
3729 isInt<32>(x: -OperandV))) &&
3730 hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3731 OperandV = -OperandV;
3732 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3733 }
3734
3735 if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) {
3736 Operand = CurDAG->getTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
3737 NewOpc = SelectImmOpcode(Opc);
3738 }
3739 }
3740
3741 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3742 SDValue CopyTo =
3743 CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS,
3744 N: StoredVal.getOperand(i: 2), Glue: SDValue());
3745
3746 const SDValue Ops[] = {Base, Scale, Index, Disp,
3747 Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
3748 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
3749 Ops);
3750 } else {
3751 const SDValue Ops[] = {Base, Scale, Index, Disp,
3752 Segment, Operand, InputChain};
3753 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
3754 Ops);
3755 }
3756 break;
3757 }
3758 default:
3759 llvm_unreachable("Invalid opcode!");
3760 }
3761
3762 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3763 LoadNode->getMemOperand()};
3764 CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);
3765
3766 // Update Load Chain uses as well.
3767 ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
3768 ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
3769 ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
3770 CurDAG->RemoveDeadNode(N: Node);
3771 return true;
3772}
3773
3774// See if this is an X & Mask that we can match to BEXTR/BZHI.
3775// Where Mask is one of the following patterns:
3776// a) x & (1 << nbits) - 1
3777// b) x & ~(-1 << nbits)
3778// c) x & (-1 >> (32 - y))
3779// d) x << (32 - y) >> (32 - y)
3780// e) (1 << nbits) - 1
3781bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3782 assert(
3783 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3784 Node->getOpcode() == ISD::SRL) &&
3785 "Should be either an and-mask, or right-shift after clearing high bits.");
3786
3787 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3788 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3789 return false;
3790
3791 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3792
3793 // Only supported for 32 and 64 bits.
3794 if (NVT != MVT::i32 && NVT != MVT::i64)
3795 return false;
3796
3797 SDValue NBits;
3798 bool NegateNBits;
3799
3800 // If we have BMI2's BZHI, we are ok with muti-use patterns.
3801 // Else, if we only have BMI1's BEXTR, we require one-use.
3802 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3803 auto checkUses = [AllowExtraUsesByDefault](
3804 SDValue Op, unsigned NUses,
3805 std::optional<bool> AllowExtraUses) {
3806 return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) ||
3807 Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo());
3808 };
3809 auto checkOneUse = [checkUses](SDValue Op,
3810 std::optional<bool> AllowExtraUses =
3811 std::nullopt) {
3812 return checkUses(Op, 1, AllowExtraUses);
3813 };
3814 auto checkTwoUse = [checkUses](SDValue Op,
3815 std::optional<bool> AllowExtraUses =
3816 std::nullopt) {
3817 return checkUses(Op, 2, AllowExtraUses);
3818 };
3819
3820 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3821 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3822 assert(V.getSimpleValueType() == MVT::i32 &&
3823 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3824 "Expected i64 -> i32 truncation");
3825 V = V.getOperand(i: 0);
3826 }
3827 return V;
3828 };
3829
3830 // a) x & ((1 << nbits) + (-1))
3831 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3832 &NegateNBits](SDValue Mask) -> bool {
3833 // Match `add`. Must only have one use!
3834 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3835 return false;
3836 // We should be adding all-ones constant (i.e. subtracting one.)
3837 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3838 return false;
3839 // Match `1 << nbits`. Might be truncated. Must only have one use!
3840 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3841 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3842 return false;
3843 if (!isOneConstant(V: M0->getOperand(Num: 0)))
3844 return false;
3845 NBits = M0->getOperand(Num: 1);
3846 NegateNBits = false;
3847 return true;
3848 };
3849
3850 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3851 V = peekThroughOneUseTruncation(V);
3852 return CurDAG->MaskedValueIsAllOnes(
3853 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
3854 loBitsSet: NVT.getSizeInBits()));
3855 };
3856
3857 // b) x & ~(-1 << nbits)
3858 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3859 &NBits, &NegateNBits](SDValue Mask) -> bool {
3860 // Match `~()`. Must only have one use!
3861 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3862 return false;
3863 // The -1 only has to be all-ones for the final Node's NVT.
3864 if (!isAllOnes(Mask->getOperand(Num: 1)))
3865 return false;
3866 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3867 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3868 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3869 return false;
3870 // The -1 only has to be all-ones for the final Node's NVT.
3871 if (!isAllOnes(M0->getOperand(Num: 0)))
3872 return false;
3873 NBits = M0->getOperand(Num: 1);
3874 NegateNBits = false;
3875 return true;
3876 };
3877
3878 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3879 // or leave the shift amount as-is, but then we'll have to negate it.
3880 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3881 unsigned Bitwidth) {
3882 NBits = ShiftAmt;
3883 NegateNBits = true;
3884 // Skip over a truncate of the shift amount, if any.
3885 if (NBits.getOpcode() == ISD::TRUNCATE)
3886 NBits = NBits.getOperand(i: 0);
3887 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3888 // If it doesn't match, that's fine, we'll just negate it ourselves.
3889 if (NBits.getOpcode() != ISD::SUB)
3890 return;
3891 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
3892 if (!V0 || V0->getZExtValue() != Bitwidth)
3893 return;
3894 NBits = NBits.getOperand(i: 1);
3895 NegateNBits = false;
3896 };
3897
3898 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3899 // or
3900 // c) x & (-1 >> (32 - y))
3901 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3902 canonicalizeShiftAmt](SDValue Mask) -> bool {
3903 // The mask itself may be truncated.
3904 Mask = peekThroughOneUseTruncation(Mask);
3905 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3906 // Match `l>>`. Must only have one use!
3907 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3908 return false;
3909 // We should be shifting truly all-ones constant.
3910 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
3911 return false;
3912 SDValue M1 = Mask.getOperand(i: 1);
3913 // The shift amount should not be used externally.
3914 if (!checkOneUse(M1))
3915 return false;
3916 canonicalizeShiftAmt(M1, Bitwidth);
3917 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3918 // is no extra use of the mask. Clearly, there was one since we are here.
3919 // But at the same time, if we need to negate the shift amount,
3920 // then we don't want the mask to stick around, else it's unprofitable.
3921 return !NegateNBits;
3922 };
3923
3924 SDValue X;
3925
3926 // d) x << z >> z but then we'll have to subtract z from bitwidth
3927 // or
3928 // d) x << (32 - y) >> (32 - y)
3929 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3930 AllowExtraUsesByDefault, &NegateNBits,
3931 &X](SDNode *Node) -> bool {
3932 if (Node->getOpcode() != ISD::SRL)
3933 return false;
3934 SDValue N0 = Node->getOperand(Num: 0);
3935 if (N0->getOpcode() != ISD::SHL)
3936 return false;
3937 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3938 SDValue N1 = Node->getOperand(Num: 1);
3939 SDValue N01 = N0->getOperand(Num: 1);
3940 // Both of the shifts must be by the exact same value.
3941 if (N1 != N01)
3942 return false;
3943 canonicalizeShiftAmt(N1, Bitwidth);
3944 // There should not be any external uses of the inner shift / shift amount.
3945 // Note that while we are generally okay with external uses given BMI2,
3946 // iff we need to negate the shift amount, we are not okay with extra uses.
3947 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3948 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3949 return false;
3950 X = N0->getOperand(Num: 0);
3951 return true;
3952 };
3953
3954 auto matchLowBitMask = [matchPatternA, matchPatternB,
3955 matchPatternC](SDValue Mask) -> bool {
3956 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3957 };
3958
3959 if (Node->getOpcode() == ISD::AND) {
3960 X = Node->getOperand(Num: 0);
3961 SDValue Mask = Node->getOperand(Num: 1);
3962
3963 if (matchLowBitMask(Mask)) {
3964 // Great.
3965 } else {
3966 std::swap(a&: X, b&: Mask);
3967 if (!matchLowBitMask(Mask))
3968 return false;
3969 }
3970 } else if (matchLowBitMask(SDValue(Node, 0))) {
3971 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
3972 } else if (!matchPatternD(Node))
3973 return false;
3974
3975 // If we need to negate the shift amount, require BMI2 BZHI support.
3976 // It's just too unprofitable for BMI1 BEXTR.
3977 if (NegateNBits && !Subtarget->hasBMI2())
3978 return false;
3979
3980 SDLoc DL(Node);
3981
3982 // Truncate the shift amount.
3983 NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits);
3984 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
3985
3986 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3987 // All the other bits are undefined, we do not care about them.
3988 SDValue ImplDef = SDValue(
3989 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0);
3990 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
3991
3992 SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32);
3993 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
3994 NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL,
3995 VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal),
3996 0);
3997 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
3998
3999 // We might have matched the amount of high bits to be cleared,
4000 // but we want the amount of low bits to be kept, so negate it then.
4001 if (NegateNBits) {
4002 SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32);
4003 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
4004
4005 NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits);
4006 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4007 }
4008
4009 if (Subtarget->hasBMI2()) {
4010 // Great, just emit the BZHI..
4011 if (NVT != MVT::i32) {
4012 // But have to place the bit count into the wide-enough register first.
4013 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
4014 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4015 }
4016
4017 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4018 ReplaceNode(F: Node, T: Extract.getNode());
4019 SelectCode(N: Extract.getNode());
4020 return true;
4021 }
4022
4023 // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is
4024 // *logically* shifted (potentially with one-use trunc inbetween),
4025 // and the truncation was the only use of the shift,
4026 // and if so look past one-use truncation.
4027 {
4028 SDValue RealX = peekThroughOneUseTruncation(X);
4029 // FIXME: only if the shift is one-use?
4030 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4031 X = RealX;
4032 }
4033
4034 MVT XVT = X.getSimpleValueType();
4035
4036 // Else, emitting BEXTR requires one more step.
4037 // The 'control' of BEXTR has the pattern of:
4038 // [15...8 bit][ 7...0 bit] location
4039 // [ bit count][ shift] name
4040 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4041
4042 // Shift NBits left by 8 bits, thus producing 'control'.
4043 // This makes the low 8 bits to be zero.
4044 SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8);
4045 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
4046 SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8);
4047 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4048
4049 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4050 // FIXME: only if the shift is one-use?
4051 if (X.getOpcode() == ISD::SRL) {
4052 SDValue ShiftAmt = X.getOperand(i: 1);
4053 X = X.getOperand(i: 0);
4054
4055 assert(ShiftAmt.getValueType() == MVT::i8 &&
4056 "Expected shift amount to be i8");
4057
4058 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4059 // We could zext to i16 in some form, but we intentionally don't do that.
4060 SDValue OrigShiftAmt = ShiftAmt;
4061 ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt);
4062 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4063
4064 // And now 'or' these low 8 bits of shift amount into the 'control'.
4065 Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt);
4066 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4067 }
4068
4069 // But have to place the 'control' into the wide-enough register first.
4070 if (XVT != MVT::i32) {
4071 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4072 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4073 }
4074
4075 // And finally, form the BEXTR itself.
4076 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4077
4078 // The 'X' was originally truncated. Do that now.
4079 if (XVT != NVT) {
4080 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4081 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4082 }
4083
4084 ReplaceNode(F: Node, T: Extract.getNode());
4085 SelectCode(N: Extract.getNode());
4086
4087 return true;
4088}
4089
4090// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4091MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4092 MVT NVT = Node->getSimpleValueType(ResNo: 0);
4093 SDLoc dl(Node);
4094
4095 SDValue N0 = Node->getOperand(Num: 0);
4096 SDValue N1 = Node->getOperand(Num: 1);
4097
4098 // If we have TBM we can use an immediate for the control. If we have BMI
4099 // we should only do this if the BEXTR instruction is implemented well.
4100 // Otherwise moving the control into a register makes this more costly.
4101 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4102 // hoisting the move immediate would make it worthwhile with a less optimal
4103 // BEXTR?
4104 bool PreferBEXTR =
4105 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4106 if (!PreferBEXTR && !Subtarget->hasBMI2())
4107 return nullptr;
4108
4109 // Must have a shift right.
4110 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4111 return nullptr;
4112
4113 // Shift can't have additional users.
4114 if (!N0->hasOneUse())
4115 return nullptr;
4116
4117 // Only supported for 32 and 64 bits.
4118 if (NVT != MVT::i32 && NVT != MVT::i64)
4119 return nullptr;
4120
4121 // Shift amount and RHS of and must be constant.
4122 auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
4123 auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
4124 if (!MaskCst || !ShiftCst)
4125 return nullptr;
4126
4127 // And RHS must be a mask.
4128 uint64_t Mask = MaskCst->getZExtValue();
4129 if (!isMask_64(Value: Mask))
4130 return nullptr;
4131
4132 uint64_t Shift = ShiftCst->getZExtValue();
4133 uint64_t MaskSize = llvm::popcount(Value: Mask);
4134
4135 // Don't interfere with something that can be handled by extracting AH.
4136 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4137 if (Shift == 8 && MaskSize == 8)
4138 return nullptr;
4139
4140 // Make sure we are only using bits that were in the original value, not
4141 // shifted in.
4142 if (Shift + MaskSize > NVT.getSizeInBits())
4143 return nullptr;
4144
4145 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4146 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4147 // does not fit into 32 bits. Load folding is not a sufficient reason.
4148 if (!PreferBEXTR && MaskSize <= 32)
4149 return nullptr;
4150
4151 SDValue Control;
4152 unsigned ROpc, MOpc;
4153
4154#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4155 if (!PreferBEXTR) {
4156 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4157 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4158 // Let's perform the mask first, and apply shift later. Note that we need to
4159 // widen the mask to account for the fact that we'll apply shift afterwards!
4160 Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
4161 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4162 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4163 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4164 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4165 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4166 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4167 } else {
4168 // The 'control' of BEXTR has the pattern of:
4169 // [15...8 bit][ 7...0 bit] location
4170 // [ bit count][ shift] name
4171 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4172 Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
4173 if (Subtarget->hasTBM()) {
4174 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4175 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4176 } else {
4177 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4178 // BMI requires the immediate to placed in a register.
4179 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4180 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4181 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4182 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4183 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4184 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4185 }
4186 }
4187
4188 MachineSDNode *NewNode;
4189 SDValue Input = N0->getOperand(Num: 0);
4190 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4191 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4192 SDValue Ops[] = {
4193 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
4194 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
4195 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4196 // Update the chain.
4197 ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
4198 // Record the mem-refs
4199 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
4200 } else {
4201 NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control);
4202 }
4203
4204 if (!PreferBEXTR) {
4205 // We still need to apply the shift.
4206 SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
4207 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4208 : GET_ND_IF_ENABLED(X86::SHR32ri);
4209 NewNode =
4210 CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
4211 }
4212
4213 return NewNode;
4214}
4215
4216// Emit a PCMISTR(I/M) instruction.
4217MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4218 bool MayFoldLoad, const SDLoc &dl,
4219 MVT VT, SDNode *Node) {
4220 SDValue N0 = Node->getOperand(Num: 0);
4221 SDValue N1 = Node->getOperand(Num: 1);
4222 SDValue Imm = Node->getOperand(Num: 2);
4223 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4224 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4225
4226 // Try to fold a load. No need to check alignment.
4227 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4228 if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4229 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4230 N1.getOperand(i: 0) };
4231 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other);
4232 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4233 // Update the chain.
4234 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
4235 // Record the mem-refs
4236 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
4237 return CNode;
4238 }
4239
4240 SDValue Ops[] = { N0, N1, Imm };
4241 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32);
4242 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4243 return CNode;
4244}
4245
4246// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
4247// to emit a second instruction after this one. This is needed since we have two
4248// copyToReg nodes glued before this and we need to continue that glue through.
4249MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4250 bool MayFoldLoad, const SDLoc &dl,
4251 MVT VT, SDNode *Node,
4252 SDValue &InGlue) {
4253 SDValue N0 = Node->getOperand(Num: 0);
4254 SDValue N2 = Node->getOperand(Num: 2);
4255 SDValue Imm = Node->getOperand(Num: 4);
4256 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4257 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4258
4259 // Try to fold a load. No need to check alignment.
4260 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4261 if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4262 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4263 N2.getOperand(i: 0), InGlue };
4264 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
4265 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4266 InGlue = SDValue(CNode, 3);
4267 // Update the chain.
4268 ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
4269 // Record the mem-refs
4270 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
4271 return CNode;
4272 }
4273
4274 SDValue Ops[] = { N0, N2, Imm, InGlue };
4275 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue);
4276 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4277 InGlue = SDValue(CNode, 2);
4278 return CNode;
4279}
4280
4281bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4282 EVT VT = N->getValueType(ResNo: 0);
4283
4284 // Only handle scalar shifts.
4285 if (VT.isVector())
4286 return false;
4287
4288 // Narrower shifts only mask to 5 bits in hardware.
4289 unsigned Size = VT == MVT::i64 ? 64 : 32;
4290
4291 SDValue OrigShiftAmt = N->getOperand(Num: 1);
4292 SDValue ShiftAmt = OrigShiftAmt;
4293 SDLoc DL(N);
4294
4295 // Skip over a truncate of the shift amount.
4296 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4297 ShiftAmt = ShiftAmt->getOperand(Num: 0);
4298
4299 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4300 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4301
4302 SDValue NewShiftAmt;
4303 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4304 ShiftAmt->getOpcode() == ISD::XOR) {
4305 SDValue Add0 = ShiftAmt->getOperand(Num: 0);
4306 SDValue Add1 = ShiftAmt->getOperand(Num: 1);
4307 auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
4308 auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
4309 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4310 // to avoid the ADD/SUB/XOR.
4311 if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
4312 NewShiftAmt = Add0;
4313
4314 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4315 ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
4316 (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
4317 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4318 // we can replace it with a NOT. In the XOR case it may save some code
4319 // size, in the SUB case it also may save a move.
4320 assert(Add0C == nullptr || Add1C == nullptr);
4321
4322 // We can only do N-X, not X-N
4323 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4324 return false;
4325
4326 EVT OpVT = ShiftAmt.getValueType();
4327
4328 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
4329 NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
4330 N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
4331 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
4332 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4333 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4334 // -X to generate a NEG instead of a SUB of a constant.
4335 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4336 Add0C->getZExtValue() != 0) {
4337 EVT SubVT = ShiftAmt.getValueType();
4338 SDValue X;
4339 if (Add0C->getZExtValue() % Size == 0)
4340 X = Add1;
4341 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4342 Add0C->getZExtValue() % 32 == 0) {
4343 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4344 // This is mainly beneficial if we already compute (x+n*32).
4345 if (Add1.getOpcode() == ISD::TRUNCATE) {
4346 Add1 = Add1.getOperand(i: 0);
4347 SubVT = Add1.getValueType();
4348 }
4349 if (Add0.getValueType() != SubVT) {
4350 Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
4351 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
4352 }
4353
4354 X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
4355 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
4356 } else
4357 return false;
4358 // Insert a negate op.
4359 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4360 // that uses it that's not a shift.
4361 SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
4362 SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
4363 NewShiftAmt = Neg;
4364
4365 // Insert these operands into a valid topological order so they can
4366 // get selected independently.
4367 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
4368 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
4369 } else
4370 return false;
4371 } else
4372 return false;
4373
4374 if (NewShiftAmt.getValueType() != MVT::i8) {
4375 // Need to truncate the shift amount.
4376 NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt);
4377 // Add to a correct topological ordering.
4378 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4379 }
4380
4381 // Insert a new mask to keep the shift amount legal. This should be removed
4382 // by isel patterns.
4383 NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt,
4384 N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8));
4385 // Place in a correct topological ordering.
4386 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4387
4388 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
4389 Op2: NewShiftAmt);
4390 if (UpdatedNode != N) {
4391 // If we found an existing node, we should replace ourselves with that node
4392 // and wait for it to be selected after its other users.
4393 ReplaceNode(F: N, T: UpdatedNode);
4394 return true;
4395 }
4396
4397 // If the original shift amount is now dead, delete it so that we don't run
4398 // it through isel.
4399 if (OrigShiftAmt.getNode()->use_empty())
4400 CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());
4401
4402 // Now that we've optimized the shift amount, defer to normal isel to get
4403 // load folding and legacy vs BMI2 selection without repeating it here.
4404 SelectCode(N);
4405 return true;
4406}
4407
4408bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4409 MVT NVT = N->getSimpleValueType(ResNo: 0);
4410 unsigned Opcode = N->getOpcode();
4411 SDLoc dl(N);
4412
4413 // For operations of the form (x << C1) op C2, check if we can use a smaller
4414 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
4415 SDValue Shift = N->getOperand(Num: 0);
4416 SDValue N1 = N->getOperand(Num: 1);
4417
4418 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
4419 if (!Cst)
4420 return false;
4421
4422 int64_t Val = Cst->getSExtValue();
4423
4424 // If we have an any_extend feeding the AND, look through it to see if there
4425 // is a shift behind it. But only if the AND doesn't use the extended bits.
4426 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4427 bool FoundAnyExtend = false;
4428 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4429 Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
4430 isUInt<32>(x: Val)) {
4431 FoundAnyExtend = true;
4432 Shift = Shift.getOperand(i: 0);
4433 }
4434
4435 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4436 return false;
4437
4438 // i8 is unshrinkable, i16 should be promoted to i32.
4439 if (NVT != MVT::i32 && NVT != MVT::i64)
4440 return false;
4441
4442 auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
4443 if (!ShlCst)
4444 return false;
4445
4446 uint64_t ShAmt = ShlCst->getZExtValue();
4447
4448 // Make sure that we don't change the operation by removing bits.
4449 // This only matters for OR and XOR, AND is unaffected.
4450 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4451 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4452 return false;
4453
4454 // Check the minimum bitwidth for the new constant.
4455 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4456 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4457 if (Opcode == ISD::AND) {
4458 // AND32ri is the same as AND64ri32 with zext imm.
4459 // Try this before sign extended immediates below.
4460 ShiftedVal = (uint64_t)Val >> ShAmt;
4461 if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
4462 return true;
4463 // Also swap order when the AND can become MOVZX.
4464 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4465 return true;
4466 }
4467 ShiftedVal = Val >> ShAmt;
4468 if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
4469 (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
4470 return true;
4471 if (Opcode != ISD::AND) {
4472 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4473 ShiftedVal = (uint64_t)Val >> ShAmt;
4474 if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
4475 return true;
4476 }
4477 return false;
4478 };
4479
4480 int64_t ShiftedVal;
4481 if (!CanShrinkImmediate(ShiftedVal))
4482 return false;
4483
4484 // Ok, we can reorder to get a smaller immediate.
4485
4486 // But, its possible the original immediate allowed an AND to become MOVZX.
4487 // Doing this late due to avoid the MakedValueIsZero call as late as
4488 // possible.
4489 if (Opcode == ISD::AND) {
4490 // Find the smallest zext this could possibly be.
4491 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4492 ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));
4493
4494 // Figure out which bits need to be zero to achieve that mask.
4495 APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
4496 loBitsSet: ZExtWidth);
4497 NeededMask &= ~Cst->getAPIntValue();
4498
4499 if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
4500 return false;
4501 }
4502
4503 SDValue X = Shift.getOperand(i: 0);
4504 if (FoundAnyExtend) {
4505 SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
4506 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
4507 X = NewX;
4508 }
4509
4510 SDValue NewCst = CurDAG->getConstant(Val: ShiftedVal, DL: dl, VT: NVT);
4511 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
4512 SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
4513 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
4514 SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
4515 N2: Shift.getOperand(i: 1));
4516 ReplaceNode(F: N, T: NewSHL.getNode());
4517 SelectCode(N: NewSHL.getNode());
4518 return true;
4519}
4520
4521bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4522 SDNode *ParentB, SDNode *ParentC,
4523 SDValue A, SDValue B, SDValue C,
4524 uint8_t Imm) {
4525 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4526 C.isOperandOf(ParentC) && "Incorrect parent node");
4527
4528 auto tryFoldLoadOrBCast =
4529 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4530 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4531 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4532 return true;
4533
4534 // Not a load, check for broadcast which may be behind a bitcast.
4535 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4536 P = L.getNode();
4537 L = L.getOperand(i: 0);
4538 }
4539
4540 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4541 return false;
4542
4543 // Only 32 and 64 bit broadcasts are supported.
4544 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4545 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4546 if (Size != 32 && Size != 64)
4547 return false;
4548
4549 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
4550 };
4551
4552 bool FoldedLoad = false;
4553 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4554 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4555 FoldedLoad = true;
4556 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4557 Tmp4)) {
4558 FoldedLoad = true;
4559 std::swap(a&: A, b&: C);
4560 // Swap bits 1/4 and 3/6.
4561 uint8_t OldImm = Imm;
4562 Imm = OldImm & 0xa5;
4563 if (OldImm & 0x02) Imm |= 0x10;
4564 if (OldImm & 0x10) Imm |= 0x02;
4565 if (OldImm & 0x08) Imm |= 0x40;
4566 if (OldImm & 0x40) Imm |= 0x08;
4567 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4568 Tmp4)) {
4569 FoldedLoad = true;
4570 std::swap(a&: B, b&: C);
4571 // Swap bits 1/2 and 5/6.
4572 uint8_t OldImm = Imm;
4573 Imm = OldImm & 0x99;
4574 if (OldImm & 0x02) Imm |= 0x04;
4575 if (OldImm & 0x04) Imm |= 0x02;
4576 if (OldImm & 0x20) Imm |= 0x40;
4577 if (OldImm & 0x40) Imm |= 0x20;
4578 }
4579
4580 SDLoc DL(Root);
4581
4582 SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
4583
4584 MVT NVT = Root->getSimpleValueType(ResNo: 0);
4585
4586 MachineSDNode *MNode;
4587 if (FoldedLoad) {
4588 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
4589
4590 unsigned Opc;
4591 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4592 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
4593 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4594 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4595
4596 bool UseD = EltSize == 32;
4597 if (NVT.is128BitVector())
4598 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4599 else if (NVT.is256BitVector())
4600 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4601 else if (NVT.is512BitVector())
4602 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4603 else
4604 llvm_unreachable("Unexpected vector size!");
4605 } else {
4606 bool UseD = NVT.getVectorElementType() == MVT::i32;
4607 if (NVT.is128BitVector())
4608 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4609 else if (NVT.is256BitVector())
4610 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4611 else if (NVT.is512BitVector())
4612 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4613 else
4614 llvm_unreachable("Unexpected vector size!");
4615 }
4616
4617 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
4618 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);
4619
4620 // Update the chain.
4621 ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
4622 // Record the mem-refs
4623 CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
4624 } else {
4625 bool UseD = NVT.getVectorElementType() == MVT::i32;
4626 unsigned Opc;
4627 if (NVT.is128BitVector())
4628 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4629 else if (NVT.is256BitVector())
4630 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4631 else if (NVT.is512BitVector())
4632 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4633 else
4634 llvm_unreachable("Unexpected vector size!");
4635
4636 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
4637 }
4638
4639 ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
4640 CurDAG->RemoveDeadNode(N: Root);
4641 return true;
4642}
4643
4644// Try to match two logic ops to a VPTERNLOG.
4645// FIXME: Handle more complex patterns that use an operand more than once?
4646bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4647 MVT NVT = N->getSimpleValueType(ResNo: 0);
4648
4649 // Make sure we support VPTERNLOG.
4650 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4651 NVT.getVectorElementType() == MVT::i1)
4652 return false;
4653
4654 // We need VLX for 128/256-bit.
4655 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4656 return false;
4657
4658 SDValue N0 = N->getOperand(Num: 0);
4659 SDValue N1 = N->getOperand(Num: 1);
4660
4661 auto getFoldableLogicOp = [](SDValue Op) {
4662 // Peek through single use bitcast.
4663 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4664 Op = Op.getOperand(i: 0);
4665
4666 if (!Op.hasOneUse())
4667 return SDValue();
4668
4669 unsigned Opc = Op.getOpcode();
4670 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4671 Opc == X86ISD::ANDNP)
4672 return Op;
4673
4674 return SDValue();
4675 };
4676
4677 SDValue A, FoldableOp;
4678 if ((FoldableOp = getFoldableLogicOp(N1))) {
4679 A = N0;
4680 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4681 A = N1;
4682 } else
4683 return false;
4684
4685 SDValue B = FoldableOp.getOperand(i: 0);
4686 SDValue C = FoldableOp.getOperand(i: 1);
4687 SDNode *ParentA = N;
4688 SDNode *ParentB = FoldableOp.getNode();
4689 SDNode *ParentC = FoldableOp.getNode();
4690
4691 // We can build the appropriate control immediate by performing the logic
4692 // operation we're matching using these constants for A, B, and C.
4693 uint8_t TernlogMagicA = 0xf0;
4694 uint8_t TernlogMagicB = 0xcc;
4695 uint8_t TernlogMagicC = 0xaa;
4696
4697 // Some of the inputs may be inverted, peek through them and invert the
4698 // magic values accordingly.
4699 // TODO: There may be a bitcast before the xor that we should peek through.
4700 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4701 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4702 ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
4703 Magic = ~Magic;
4704 Parent = Op.getNode();
4705 Op = Op.getOperand(i: 0);
4706 }
4707 };
4708
4709 PeekThroughNot(A, ParentA, TernlogMagicA);
4710 PeekThroughNot(B, ParentB, TernlogMagicB);
4711 PeekThroughNot(C, ParentC, TernlogMagicC);
4712
4713 uint8_t Imm;
4714 switch (FoldableOp.getOpcode()) {
4715 default: llvm_unreachable("Unexpected opcode!");
4716 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4717 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4718 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4719 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4720 }
4721
4722 switch (N->getOpcode()) {
4723 default: llvm_unreachable("Unexpected opcode!");
4724 case X86ISD::ANDNP:
4725 if (A == N0)
4726 Imm &= ~TernlogMagicA;
4727 else
4728 Imm = ~(Imm) & TernlogMagicA;
4729 break;
4730 case ISD::AND: Imm &= TernlogMagicA; break;
4731 case ISD::OR: Imm |= TernlogMagicA; break;
4732 case ISD::XOR: Imm ^= TernlogMagicA; break;
4733 }
4734
4735 return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm);
4736}
4737
4738/// If the high bits of an 'and' operand are known zero, try setting the
4739/// high bits of an 'and' constant operand to produce a smaller encoding by
4740/// creating a small, sign-extended negative immediate rather than a large
4741/// positive one. This reverses a transform in SimplifyDemandedBits that
4742/// shrinks mask constants by clearing bits. There is also a possibility that
4743/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4744/// case, just replace the 'and'. Return 'true' if the node is replaced.
4745bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4746 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4747 // have immediate operands.
4748 MVT VT = And->getSimpleValueType(ResNo: 0);
4749 if (VT != MVT::i32 && VT != MVT::i64)
4750 return false;
4751
4752 auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
4753 if (!And1C)
4754 return false;
4755
4756 // Bail out if the mask constant is already negative. It's can't shrink more.
4757 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4758 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4759 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4760 // are negative too.
4761 APInt MaskVal = And1C->getAPIntValue();
4762 unsigned MaskLZ = MaskVal.countl_zero();
4763 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4764 return false;
4765
4766 // Don't extend into the upper 32 bits of a 64 bit mask.
4767 if (VT == MVT::i64 && MaskLZ >= 32) {
4768 MaskLZ -= 32;
4769 MaskVal = MaskVal.trunc(width: 32);
4770 }
4771
4772 SDValue And0 = And->getOperand(Num: 0);
4773 APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
4774 APInt NegMaskVal = MaskVal | HighZeros;
4775
4776 // If a negative constant would not allow a smaller encoding, there's no need
4777 // to continue. Only change the constant when we know it's a win.
4778 unsigned MinWidth = NegMaskVal.getSignificantBits();
4779 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4780 return false;
4781
4782 // Extend masks if we truncated above.
4783 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4784 NegMaskVal = NegMaskVal.zext(width: 64);
4785 HighZeros = HighZeros.zext(width: 64);
4786 }
4787
4788 // The variable operand must be all zeros in the top bits to allow using the
4789 // new, negative constant as the mask.
4790 if (!CurDAG->MaskedValueIsZero(Op: And0, Mask: HighZeros))
4791 return false;
4792
4793 // Check if the mask is -1. In that case, this is an unnecessary instruction
4794 // that escaped earlier analysis.
4795 if (NegMaskVal.isAllOnes()) {
4796 ReplaceNode(F: And, T: And0.getNode());
4797 return true;
4798 }
4799
4800 // A negative mask allows a smaller encoding. Create a new 'and' node.
4801 SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
4802 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
4803 SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
4804 ReplaceNode(F: And, T: NewAnd.getNode());
4805 SelectCode(N: NewAnd.getNode());
4806 return true;
4807}
4808
4809static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4810 bool FoldedBCast, bool Masked) {
4811#define VPTESTM_CASE(VT, SUFFIX) \
4812case MVT::VT: \
4813 if (Masked) \
4814 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4815 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4816
4817
4818#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4819default: llvm_unreachable("Unexpected VT!"); \
4820VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4821VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4822VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4823VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4824VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4825VPTESTM_CASE(v8i64, QZ##SUFFIX)
4826
4827#define VPTESTM_FULL_CASES(SUFFIX) \
4828VPTESTM_BROADCAST_CASES(SUFFIX) \
4829VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4830VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4831VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4832VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4833VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4834VPTESTM_CASE(v32i16, WZ##SUFFIX)
4835
4836 if (FoldedBCast) {
4837 switch (TestVT.SimpleTy) {
4838 VPTESTM_BROADCAST_CASES(rmb)
4839 }
4840 }
4841
4842 if (FoldedLoad) {
4843 switch (TestVT.SimpleTy) {
4844 VPTESTM_FULL_CASES(rm)
4845 }
4846 }
4847
4848 switch (TestVT.SimpleTy) {
4849 VPTESTM_FULL_CASES(rr)
4850 }
4851
4852#undef VPTESTM_FULL_CASES
4853#undef VPTESTM_BROADCAST_CASES
4854#undef VPTESTM_CASE
4855}
4856
4857// Try to create VPTESTM instruction. If InMask is not null, it will be used
4858// to form a masked operation.
4859bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4860 SDValue InMask) {
4861 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4862 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4863 "Unexpected VT!");
4864
4865 // Look for equal and not equal compares.
4866 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
4867 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4868 return false;
4869
4870 SDValue SetccOp0 = Setcc.getOperand(i: 0);
4871 SDValue SetccOp1 = Setcc.getOperand(i: 1);
4872
4873 // Canonicalize the all zero vector to the RHS.
4874 if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
4875 std::swap(a&: SetccOp0, b&: SetccOp1);
4876
4877 // See if we're comparing against zero.
4878 if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
4879 return false;
4880
4881 SDValue N0 = SetccOp0;
4882
4883 MVT CmpVT = N0.getSimpleValueType();
4884 MVT CmpSVT = CmpVT.getVectorElementType();
4885
4886 // Start with both operands the same. We'll try to refine this.
4887 SDValue Src0 = N0;
4888 SDValue Src1 = N0;
4889
4890 {
4891 // Look through single use bitcasts.
4892 SDValue N0Temp = N0;
4893 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4894 N0Temp = N0.getOperand(i: 0);
4895
4896 // Look for single use AND.
4897 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4898 Src0 = N0Temp.getOperand(i: 0);
4899 Src1 = N0Temp.getOperand(i: 1);
4900 }
4901 }
4902
4903 // Without VLX we need to widen the operation.
4904 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4905
4906 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4907 SDValue &Base, SDValue &Scale, SDValue &Index,
4908 SDValue &Disp, SDValue &Segment) {
4909 // If we need to widen, we can't fold the load.
4910 if (!Widen)
4911 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4912 return true;
4913
4914 // If we didn't fold a load, try to match broadcast. No widening limitation
4915 // for this. But only 32 and 64 bit types are supported.
4916 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4917 return false;
4918
4919 // Look through single use bitcasts.
4920 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4921 P = L.getNode();
4922 L = L.getOperand(i: 0);
4923 }
4924
4925 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4926 return false;
4927
4928 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4929 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4930 return false;
4931
4932 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
4933 };
4934
4935 // We can only fold loads if the sources are unique.
4936 bool CanFoldLoads = Src0 != Src1;
4937
4938 bool FoldedLoad = false;
4939 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4940 if (CanFoldLoads) {
4941 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4942 Tmp3, Tmp4);
4943 if (!FoldedLoad) {
4944 // And is commutative.
4945 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4946 Tmp2, Tmp3, Tmp4);
4947 if (FoldedLoad)
4948 std::swap(a&: Src0, b&: Src1);
4949 }
4950 }
4951
4952 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4953
4954 bool IsMasked = InMask.getNode() != nullptr;
4955
4956 SDLoc dl(Root);
4957
4958 MVT ResVT = Setcc.getSimpleValueType();
4959 MVT MaskVT = ResVT;
4960 if (Widen) {
4961 // Widen the inputs using insert_subreg or copy_to_regclass.
4962 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4963 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4964 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4965 CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
4966 MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts);
4967 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl,
4968 VT: CmpVT), 0);
4969 Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);
4970
4971 if (!FoldedBCast)
4972 Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);
4973
4974 if (IsMasked) {
4975 // Widen the mask.
4976 unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
4977 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
4978 InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
4979 dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
4980 }
4981 }
4982
4983 bool IsTestN = CC == ISD::SETEQ;
4984 unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4985 Masked: IsMasked);
4986
4987 MachineSDNode *CNode;
4988 if (FoldedLoad) {
4989 SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other);
4990
4991 if (IsMasked) {
4992 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4993 Src1.getOperand(i: 0) };
4994 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
4995 } else {
4996 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4997 Src1.getOperand(i: 0) };
4998 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
4999 }
5000
5001 // Update the chain.
5002 ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
5003 // Record the mem-refs
5004 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
5005 } else {
5006 if (IsMasked)
5007 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
5008 else
5009 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
5010 }
5011
5012 // If we widened, we need to shrink the mask VT.
5013 if (Widen) {
5014 unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
5015 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5016 CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5017 dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
5018 }
5019
5020 ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
5021 CurDAG->RemoveDeadNode(N: Root);
5022 return true;
5023}
5024
5025// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5026// into vpternlog.
5027bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5028 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5029
5030 MVT NVT = N->getSimpleValueType(ResNo: 0);
5031
5032 // Make sure we support VPTERNLOG.
5033 if (!NVT.isVector() || !Subtarget->hasAVX512())
5034 return false;
5035
5036 // We need VLX for 128/256-bit.
5037 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5038 return false;
5039
5040 SDValue N0 = N->getOperand(Num: 0);
5041 SDValue N1 = N->getOperand(Num: 1);
5042
5043 // Canonicalize AND to LHS.
5044 if (N1.getOpcode() == ISD::AND)
5045 std::swap(a&: N0, b&: N1);
5046
5047 if (N0.getOpcode() != ISD::AND ||
5048 N1.getOpcode() != X86ISD::ANDNP ||
5049 !N0.hasOneUse() || !N1.hasOneUse())
5050 return false;
5051
5052 // ANDN is not commutable, use it to pick down A and C.
5053 SDValue A = N1.getOperand(i: 0);
5054 SDValue C = N1.getOperand(i: 1);
5055
5056 // AND is commutable, if one operand matches A, the other operand is B.
5057 // Otherwise this isn't a match.
5058 SDValue B;
5059 if (N0.getOperand(i: 0) == A)
5060 B = N0.getOperand(i: 1);
5061 else if (N0.getOperand(i: 1) == A)
5062 B = N0.getOperand(i: 0);
5063 else
5064 return false;
5065
5066 SDLoc dl(N);
5067 SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8);
5068 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5069 ReplaceNode(F: N, T: Ternlog.getNode());
5070
5071 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5072 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5073}
5074
5075void X86DAGToDAGISel::Select(SDNode *Node) {
5076 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5077 unsigned Opcode = Node->getOpcode();
5078 SDLoc dl(Node);
5079
5080 if (Node->isMachineOpcode()) {
5081 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5082 Node->setNodeId(-1);
5083 return; // Already selected.
5084 }
5085
5086 switch (Opcode) {
5087 default: break;
5088 case ISD::INTRINSIC_W_CHAIN: {
5089 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5090 switch (IntNo) {
5091 default: break;
5092 case Intrinsic::x86_encodekey128:
5093 case Intrinsic::x86_encodekey256: {
5094 if (!Subtarget->hasKL())
5095 break;
5096
5097 unsigned Opcode;
5098 switch (IntNo) {
5099 default: llvm_unreachable("Impossible intrinsic");
5100 case Intrinsic::x86_encodekey128:
5101 Opcode = X86::ENCODEKEY128;
5102 break;
5103 case Intrinsic::x86_encodekey256:
5104 Opcode = X86::ENCODEKEY256;
5105 break;
5106 }
5107
5108 SDValue Chain = Node->getOperand(Num: 0);
5109 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3),
5110 Glue: SDValue());
5111 if (Opcode == X86::ENCODEKEY256)
5112 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4),
5113 Glue: Chain.getValue(R: 1));
5114
5115 MachineSDNode *Res = CurDAG->getMachineNode(
5116 Opcode, dl, VTs: Node->getVTList(),
5117 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5118 ReplaceNode(F: Node, T: Res);
5119 return;
5120 }
5121 case Intrinsic::x86_tileloadd64_internal:
5122 case Intrinsic::x86_tileloaddt164_internal: {
5123 if (!Subtarget->hasAMXTILE())
5124 break;
5125 auto *MFI =
5126 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5127 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5128 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5129 ? X86::PTILELOADDV
5130 : X86::PTILELOADDT1V;
5131 // _tile_loadd_internal(row, col, buf, STRIDE)
5132 SDValue Base = Node->getOperand(Num: 4);
5133 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5134 SDValue Index = Node->getOperand(Num: 5);
5135 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5136 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5137 SDValue Chain = Node->getOperand(Num: 0);
5138 MachineSDNode *CNode;
5139 SDValue Ops[] = {Node->getOperand(Num: 2),
5140 Node->getOperand(Num: 3),
5141 Base,
5142 Scale,
5143 Index,
5144 Disp,
5145 Segment,
5146 Chain};
5147 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops);
5148 ReplaceNode(F: Node, T: CNode);
5149 return;
5150 }
5151 }
5152 break;
5153 }
5154 case ISD::INTRINSIC_VOID: {
5155 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5156 switch (IntNo) {
5157 default: break;
5158 case Intrinsic::x86_sse3_monitor:
5159 case Intrinsic::x86_monitorx:
5160 case Intrinsic::x86_clzero: {
5161 bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64;
5162
5163 unsigned Opc = 0;
5164 switch (IntNo) {
5165 default: llvm_unreachable("Unexpected intrinsic!");
5166 case Intrinsic::x86_sse3_monitor:
5167 if (!Subtarget->hasSSE3())
5168 break;
5169 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5170 break;
5171 case Intrinsic::x86_monitorx:
5172 if (!Subtarget->hasMWAITX())
5173 break;
5174 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5175 break;
5176 case Intrinsic::x86_clzero:
5177 if (!Subtarget->hasCLZERO())
5178 break;
5179 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5180 break;
5181 }
5182
5183 if (Opc) {
5184 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5185 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5186 N: Node->getOperand(Num: 2), Glue: SDValue());
5187 SDValue InGlue = Chain.getValue(R: 1);
5188
5189 if (IntNo == Intrinsic::x86_sse3_monitor ||
5190 IntNo == Intrinsic::x86_monitorx) {
5191 // Copy the other two operands to ECX and EDX.
5192 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3),
5193 Glue: InGlue);
5194 InGlue = Chain.getValue(R: 1);
5195 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4),
5196 Glue: InGlue);
5197 InGlue = Chain.getValue(R: 1);
5198 }
5199
5200 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other,
5201 Ops: { Chain, InGlue});
5202 ReplaceNode(F: Node, T: CNode);
5203 return;
5204 }
5205
5206 break;
5207 }
5208 case Intrinsic::x86_tilestored64_internal: {
5209 auto *MFI =
5210 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5211 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5212 unsigned Opc = X86::PTILESTOREDV;
5213 // _tile_stored_internal(row, col, buf, STRIDE, c)
5214 SDValue Base = Node->getOperand(Num: 4);
5215 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5216 SDValue Index = Node->getOperand(Num: 5);
5217 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5218 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5219 SDValue Chain = Node->getOperand(Num: 0);
5220 MachineSDNode *CNode;
5221 SDValue Ops[] = {Node->getOperand(Num: 2),
5222 Node->getOperand(Num: 3),
5223 Base,
5224 Scale,
5225 Index,
5226 Disp,
5227 Segment,
5228 Node->getOperand(Num: 6),
5229 Chain};
5230 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5231 ReplaceNode(F: Node, T: CNode);
5232 return;
5233 }
5234 case Intrinsic::x86_tileloadd64:
5235 case Intrinsic::x86_tileloaddt164:
5236 case Intrinsic::x86_tilestored64: {
5237 if (!Subtarget->hasAMXTILE())
5238 break;
5239 auto *MFI =
5240 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5241 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5242 unsigned Opc;
5243 switch (IntNo) {
5244 default: llvm_unreachable("Unexpected intrinsic!");
5245 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5246 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5247 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5248 }
5249 // FIXME: Match displacement and scale.
5250 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5251 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5252 SDValue Base = Node->getOperand(Num: 3);
5253 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5254 SDValue Index = Node->getOperand(Num: 4);
5255 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5256 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5257 SDValue Chain = Node->getOperand(Num: 0);
5258 MachineSDNode *CNode;
5259 if (Opc == X86::PTILESTORED) {
5260 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5261 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5262 } else {
5263 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5264 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5265 }
5266 ReplaceNode(F: Node, T: CNode);
5267 return;
5268 }
5269 }
5270 break;
5271 }
5272 case ISD::BRIND:
5273 case X86ISD::NT_BRIND: {
5274 if (Subtarget->isTargetNaCl())
5275 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
5276 // leave the instruction alone.
5277 break;
5278 if (Subtarget->isTarget64BitILP32()) {
5279 // Converts a 32-bit register to a 64-bit, zero-extended version of
5280 // it. This is needed because x86-64 can do many things, but jmp %r32
5281 // ain't one of them.
5282 SDValue Target = Node->getOperand(Num: 1);
5283 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5284 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64);
5285 SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other,
5286 N1: Node->getOperand(Num: 0), N2: ZextTarget);
5287 ReplaceNode(F: Node, T: Brind.getNode());
5288 SelectCode(N: ZextTarget.getNode());
5289 SelectCode(N: Brind.getNode());
5290 return;
5291 }
5292 break;
5293 }
5294 case X86ISD::GlobalBaseReg:
5295 ReplaceNode(F: Node, T: getGlobalBaseReg());
5296 return;
5297
5298 case ISD::BITCAST:
5299 // Just drop all 128/256/512-bit bitcasts.
5300 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5301 NVT == MVT::f128) {
5302 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5303 CurDAG->RemoveDeadNode(N: Node);
5304 return;
5305 }
5306 break;
5307
5308 case ISD::SRL:
5309 if (matchBitExtract(Node))
5310 return;
5311 [[fallthrough]];
5312 case ISD::SRA:
5313 case ISD::SHL:
5314 if (tryShiftAmountMod(N: Node))
5315 return;
5316 break;
5317
5318 case X86ISD::VPTERNLOG: {
5319 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5320 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5321 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5322 return;
5323 break;
5324 }
5325
5326 case X86ISD::ANDNP:
5327 if (tryVPTERNLOG(N: Node))
5328 return;
5329 break;
5330
5331 case ISD::AND:
5332 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5333 // Try to form a masked VPTESTM. Operands can be in either order.
5334 SDValue N0 = Node->getOperand(Num: 0);
5335 SDValue N1 = Node->getOperand(Num: 1);
5336 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5337 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5338 return;
5339 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5340 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5341 return;
5342 }
5343
5344 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5345 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5346 CurDAG->RemoveDeadNode(N: Node);
5347 return;
5348 }
5349 if (matchBitExtract(Node))
5350 return;
5351 if (AndImmShrink && shrinkAndImmediate(And: Node))
5352 return;
5353
5354 [[fallthrough]];
5355 case ISD::OR:
5356 case ISD::XOR:
5357 if (tryShrinkShlLogicImm(N: Node))
5358 return;
5359 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5360 return;
5361 if (tryVPTERNLOG(N: Node))
5362 return;
5363
5364 [[fallthrough]];
5365 case ISD::ADD:
5366 if (Opcode == ISD::ADD && matchBitExtract(Node))
5367 return;
5368 [[fallthrough]];
5369 case ISD::SUB: {
5370 // Try to avoid folding immediates with multiple uses for optsize.
5371 // This code tries to select to register form directly to avoid going
5372 // through the isel table which might fold the immediate. We can't change
    // the patterns on the add/sub/and/or/xor with immediate patterns in the
5374 // tablegen files to check immediate use count without making the patterns
5375 // unavailable to the fast-isel table.
5376 if (!CurDAG->shouldOptForSize())
5377 break;
5378
5379 // Only handle i8/i16/i32/i64.
5380 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5381 break;
5382
5383 SDValue N0 = Node->getOperand(Num: 0);
5384 SDValue N1 = Node->getOperand(Num: 1);
5385
5386 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5387 if (!Cst)
5388 break;
5389
5390 int64_t Val = Cst->getSExtValue();
5391
    // Make sure it's an immediate that is considered foldable.
5393 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5394 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5395 break;
5396
5397 // If this can match to INC/DEC, let it go.
5398 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5399 break;
5400
5401 // Check if we should avoid folding this immediate.
5402 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5403 break;
5404
5405 // We should not fold the immediate. So we need a register form instead.
5406 unsigned ROpc, MOpc;
5407 switch (NVT.SimpleTy) {
5408 default: llvm_unreachable("Unexpected VT!");
5409 case MVT::i8:
5410 switch (Opcode) {
5411 default: llvm_unreachable("Unexpected opcode!");
5412 case ISD::ADD:
5413 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5414 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5415 break;
5416 case ISD::SUB:
5417 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5418 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5419 break;
5420 case ISD::AND:
5421 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5422 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5423 break;
5424 case ISD::OR:
5425 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5426 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5427 break;
5428 case ISD::XOR:
5429 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5430 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5431 break;
5432 }
5433 break;
5434 case MVT::i16:
5435 switch (Opcode) {
5436 default: llvm_unreachable("Unexpected opcode!");
5437 case ISD::ADD:
5438 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5439 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5440 break;
5441 case ISD::SUB:
5442 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5443 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5444 break;
5445 case ISD::AND:
5446 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5447 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5448 break;
5449 case ISD::OR:
5450 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5451 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5452 break;
5453 case ISD::XOR:
5454 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5455 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5456 break;
5457 }
5458 break;
5459 case MVT::i32:
5460 switch (Opcode) {
5461 default: llvm_unreachable("Unexpected opcode!");
5462 case ISD::ADD:
5463 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5464 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5465 break;
5466 case ISD::SUB:
5467 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5468 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5469 break;
5470 case ISD::AND:
5471 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5472 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5473 break;
5474 case ISD::OR:
5475 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5476 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5477 break;
5478 case ISD::XOR:
5479 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5480 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5481 break;
5482 }
5483 break;
5484 case MVT::i64:
5485 switch (Opcode) {
5486 default: llvm_unreachable("Unexpected opcode!");
5487 case ISD::ADD:
5488 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5489 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5490 break;
5491 case ISD::SUB:
5492 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5493 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5494 break;
5495 case ISD::AND:
5496 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5497 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5498 break;
5499 case ISD::OR:
5500 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5501 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5502 break;
5503 case ISD::XOR:
5504 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5505 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5506 break;
5507 }
5508 break;
5509 }
5510
5511 // Ok this is a AND/OR/XOR/ADD/SUB with constant.
5512
    // If this is not a subtract, we can still try to fold a load.
5514 if (Opcode != ISD::SUB) {
5515 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5516 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5517 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5518 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5519 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5520 // Update the chain.
5521 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5522 // Record the mem-refs
5523 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5524 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5525 CurDAG->RemoveDeadNode(N: Node);
5526 return;
5527 }
5528 }
5529
5530 CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1);
5531 return;
5532 }
5533
5534 case X86ISD::SMUL:
5535 // i16/i32/i64 are handled with isel patterns.
5536 if (NVT != MVT::i8)
5537 break;
5538 [[fallthrough]];
5539 case X86ISD::UMUL: {
5540 SDValue N0 = Node->getOperand(Num: 0);
5541 SDValue N1 = Node->getOperand(Num: 1);
5542
5543 unsigned LoReg, ROpc, MOpc;
5544 switch (NVT.SimpleTy) {
5545 default: llvm_unreachable("Unsupported VT!");
5546 case MVT::i8:
5547 LoReg = X86::AL;
5548 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5549 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5550 break;
5551 case MVT::i16:
5552 LoReg = X86::AX;
5553 ROpc = X86::MUL16r;
5554 MOpc = X86::MUL16m;
5555 break;
5556 case MVT::i32:
5557 LoReg = X86::EAX;
5558 ROpc = X86::MUL32r;
5559 MOpc = X86::MUL32m;
5560 break;
5561 case MVT::i64:
5562 LoReg = X86::RAX;
5563 ROpc = X86::MUL64r;
5564 MOpc = X86::MUL64m;
5565 break;
5566 }
5567
5568 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5569 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5570 // Multiply is commutative.
5571 if (!FoldedLoad) {
5572 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5573 if (FoldedLoad)
5574 std::swap(a&: N0, b&: N1);
5575 }
5576
5577 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5578 N: N0, Glue: SDValue()).getValue(R: 1);
5579
5580 MachineSDNode *CNode;
5581 if (FoldedLoad) {
5582 // i16/i32/i64 use an instruction that produces a low and high result even
5583 // though only the low result is used.
5584 SDVTList VTs;
5585 if (NVT == MVT::i8)
5586 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5587 else
5588 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other);
5589
5590 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5591 InGlue };
5592 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5593
5594 // Update the chain.
5595 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5596 // Record the mem-refs
5597 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5598 } else {
5599 // i16/i32/i64 use an instruction that produces a low and high result even
5600 // though only the low result is used.
5601 SDVTList VTs;
5602 if (NVT == MVT::i8)
5603 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32);
5604 else
5605 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32);
5606
5607 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5608 }
5609
5610 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5611 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5612 CurDAG->RemoveDeadNode(N: Node);
5613 return;
5614 }
5615
5616 case ISD::SMUL_LOHI:
5617 case ISD::UMUL_LOHI: {
5618 SDValue N0 = Node->getOperand(Num: 0);
5619 SDValue N1 = Node->getOperand(Num: 1);
5620
5621 unsigned Opc, MOpc;
5622 unsigned LoReg, HiReg;
5623 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5624 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5625 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5626 switch (NVT.SimpleTy) {
5627 default: llvm_unreachable("Unsupported VT!");
5628 case MVT::i32:
5629 Opc = UseMULXHi ? X86::MULX32Hrr
5630 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5631 : IsSigned ? X86::IMUL32r
5632 : X86::MUL32r;
5633 MOpc = UseMULXHi ? X86::MULX32Hrm
5634 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5635 : IsSigned ? X86::IMUL32m
5636 : X86::MUL32m;
5637 LoReg = UseMULX ? X86::EDX : X86::EAX;
5638 HiReg = X86::EDX;
5639 break;
5640 case MVT::i64:
5641 Opc = UseMULXHi ? X86::MULX64Hrr
5642 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5643 : IsSigned ? X86::IMUL64r
5644 : X86::MUL64r;
5645 MOpc = UseMULXHi ? X86::MULX64Hrm
5646 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5647 : IsSigned ? X86::IMUL64m
5648 : X86::MUL64m;
5649 LoReg = UseMULX ? X86::RDX : X86::RAX;
5650 HiReg = X86::RDX;
5651 break;
5652 }
5653
5654 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5655 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5656 // Multiply is commutative.
5657 if (!foldedLoad) {
5658 foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5659 if (foldedLoad)
5660 std::swap(a&: N0, b&: N1);
5661 }
5662
5663 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5664 N: N0, Glue: SDValue()).getValue(R: 1);
5665 SDValue ResHi, ResLo;
5666 if (foldedLoad) {
5667 SDValue Chain;
5668 MachineSDNode *CNode = nullptr;
5669 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5670 InGlue };
5671 if (UseMULXHi) {
5672 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
5673 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5674 ResHi = SDValue(CNode, 0);
5675 Chain = SDValue(CNode, 1);
5676 } else if (UseMULX) {
5677 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other);
5678 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5679 ResHi = SDValue(CNode, 0);
5680 ResLo = SDValue(CNode, 1);
5681 Chain = SDValue(CNode, 2);
5682 } else {
5683 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
5684 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5685 Chain = SDValue(CNode, 0);
5686 InGlue = SDValue(CNode, 1);
5687 }
5688
5689 // Update the chain.
5690 ReplaceUses(F: N1.getValue(R: 1), T: Chain);
5691 // Record the mem-refs
5692 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5693 } else {
5694 SDValue Ops[] = { N1, InGlue };
5695 if (UseMULXHi) {
5696 SDVTList VTs = CurDAG->getVTList(VT: NVT);
5697 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5698 ResHi = SDValue(CNode, 0);
5699 } else if (UseMULX) {
5700 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
5701 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5702 ResHi = SDValue(CNode, 0);
5703 ResLo = SDValue(CNode, 1);
5704 } else {
5705 SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue);
5706 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5707 InGlue = SDValue(CNode, 0);
5708 }
5709 }
5710
5711 // Copy the low half of the result, if it is needed.
5712 if (!SDValue(Node, 0).use_empty()) {
5713 if (!ResLo) {
5714 assert(LoReg && "Register for low half is not defined!");
5715 ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5716 VT: NVT, Glue: InGlue);
5717 InGlue = ResLo.getValue(R: 2);
5718 }
5719 ReplaceUses(F: SDValue(Node, 0), T: ResLo);
5720 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5721 dbgs() << '\n');
5722 }
5723 // Copy the high half of the result, if it is needed.
5724 if (!SDValue(Node, 1).use_empty()) {
5725 if (!ResHi) {
5726 assert(HiReg && "Register for high half is not defined!");
5727 ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
5728 VT: NVT, Glue: InGlue);
5729 InGlue = ResHi.getValue(R: 2);
5730 }
5731 ReplaceUses(F: SDValue(Node, 1), T: ResHi);
5732 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5733 dbgs() << '\n');
5734 }
5735
5736 CurDAG->RemoveDeadNode(N: Node);
5737 return;
5738 }
5739
5740 case ISD::SDIVREM:
5741 case ISD::UDIVREM: {
5742 SDValue N0 = Node->getOperand(Num: 0);
5743 SDValue N1 = Node->getOperand(Num: 1);
5744
5745 unsigned ROpc, MOpc;
5746 bool isSigned = Opcode == ISD::SDIVREM;
5747 if (!isSigned) {
5748 switch (NVT.SimpleTy) {
5749 default: llvm_unreachable("Unsupported VT!");
5750 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5751 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5752 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5753 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5754 }
5755 } else {
5756 switch (NVT.SimpleTy) {
5757 default: llvm_unreachable("Unsupported VT!");
5758 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5759 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5760 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5761 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5762 }
5763 }
5764
5765 unsigned LoReg, HiReg, ClrReg;
5766 unsigned SExtOpcode;
5767 switch (NVT.SimpleTy) {
5768 default: llvm_unreachable("Unsupported VT!");
5769 case MVT::i8:
5770 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5771 SExtOpcode = 0; // Not used.
5772 break;
5773 case MVT::i16:
5774 LoReg = X86::AX; HiReg = X86::DX;
5775 ClrReg = X86::DX;
5776 SExtOpcode = X86::CWD;
5777 break;
5778 case MVT::i32:
5779 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5780 SExtOpcode = X86::CDQ;
5781 break;
5782 case MVT::i64:
5783 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5784 SExtOpcode = X86::CQO;
5785 break;
5786 }
5787
5788 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5789 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5790 bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);
5791
5792 SDValue InGlue;
5793 if (NVT == MVT::i8) {
5794 // Special case for div8, just use a move with zero extension to AX to
5795 // clear the upper 8 bits (AH).
5796 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5797 MachineSDNode *Move;
5798 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5799 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5800 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5801 : X86::MOVZX16rm8;
5802 Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops);
5803 Chain = SDValue(Move, 1);
5804 ReplaceUses(F: N0.getValue(R: 1), T: Chain);
5805 // Record the mem-refs
5806 CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5807 } else {
5808 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5809 : X86::MOVZX16rr8;
5810 Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0);
5811 Chain = CurDAG->getEntryNode();
5812 }
5813 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0),
5814 Glue: SDValue());
5815 InGlue = Chain.getValue(R: 1);
5816 } else {
5817 InGlue =
5818 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
5819 Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
5820 if (isSigned && !signBitIsZero) {
5821 // Sign extend the low part into the high part.
5822 InGlue =
5823 SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0);
5824 } else {
5825 // Zero out the high part, effectively zero extending the input.
5826 SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
5827 SDValue ClrNode = SDValue(
5828 CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: std::nullopt), 0);
5829 switch (NVT.SimpleTy) {
5830 case MVT::i16:
5831 ClrNode =
5832 SDValue(CurDAG->getMachineNode(
5833 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode,
5834 Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl,
5835 VT: MVT::i32)),
5836 0);
5837 break;
5838 case MVT::i32:
5839 break;
5840 case MVT::i64:
5841 ClrNode =
5842 SDValue(CurDAG->getMachineNode(
5843 Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64,
5844 Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode,
5845 Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl,
5846 VT: MVT::i32)),
5847 0);
5848 break;
5849 default:
5850 llvm_unreachable("Unexpected division source");
5851 }
5852
5853 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
5854 N: ClrNode, Glue: InGlue).getValue(R: 1);
5855 }
5856 }
5857
5858 if (foldedLoad) {
5859 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5860 InGlue };
5861 MachineSDNode *CNode =
5862 CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops);
5863 InGlue = SDValue(CNode, 1);
5864 // Update the chain.
5865 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
5866 // Record the mem-refs
5867 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5868 } else {
5869 InGlue =
5870 SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0);
5871 }
5872
5873 // Prevent use of AH in a REX instruction by explicitly copying it to
5874 // an ABCD_L register.
5875 //
5876 // The current assumption of the register allocator is that isel
5877 // won't generate explicit references to the GR8_ABCD_H registers. If
5878 // the allocator and/or the backend get enhanced to be more robust in
5879 // that regard, this can be, and should be, removed.
5880 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5881 SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8);
5882 unsigned AHExtOpcode =
5883 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5884
5885 SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32,
5886 VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue);
5887 SDValue Result(RNode, 0);
5888 InGlue = SDValue(RNode, 1);
5889
5890 Result =
5891 CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result);
5892
5893 ReplaceUses(F: SDValue(Node, 1), T: Result);
5894 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5895 dbgs() << '\n');
5896 }
5897 // Copy the division (low) result, if it is needed.
5898 if (!SDValue(Node, 0).use_empty()) {
5899 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
5900 Reg: LoReg, VT: NVT, Glue: InGlue);
5901 InGlue = Result.getValue(R: 2);
5902 ReplaceUses(F: SDValue(Node, 0), T: Result);
5903 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5904 dbgs() << '\n');
5905 }
5906 // Copy the remainder (high) result, if it is needed.
5907 if (!SDValue(Node, 1).use_empty()) {
5908 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
5909 Reg: HiReg, VT: NVT, Glue: InGlue);
5910 InGlue = Result.getValue(R: 2);
5911 ReplaceUses(F: SDValue(Node, 1), T: Result);
5912 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5913 dbgs() << '\n');
5914 }
5915 CurDAG->RemoveDeadNode(N: Node);
5916 return;
5917 }
5918
5919 case X86ISD::FCMP:
5920 case X86ISD::STRICT_FCMP:
5921 case X86ISD::STRICT_FCMPS: {
5922 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5923 Node->getOpcode() == X86ISD::STRICT_FCMPS;
5924 SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
5925 SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);
5926
5927 // Save the original VT of the compare.
5928 MVT CmpVT = N0.getSimpleValueType();
5929
5930 // Floating point needs special handling if we don't have FCOMI.
5931 if (Subtarget->canUseCMOV())
5932 break;
5933
5934 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5935
5936 unsigned Opc;
5937 switch (CmpVT.SimpleTy) {
5938 default: llvm_unreachable("Unexpected type!");
5939 case MVT::f32:
5940 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5941 break;
5942 case MVT::f64:
5943 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5944 break;
5945 case MVT::f80:
5946 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5947 break;
5948 }
5949
5950 SDValue Chain =
5951 IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
5952 SDValue Glue;
5953 if (IsStrictCmp) {
5954 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
5955 Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
5956 Glue = Chain.getValue(R: 1);
5957 } else {
5958 Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0);
5959 }
5960
5961 // Move FPSW to AX.
5962 SDValue FNSTSW =
5963 SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0);
5964
5965 // Extract upper 8-bits of AX.
5966 SDValue Extract =
5967 CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW);
5968
5969 // Move AH into flags.
5970 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5971 assert(Subtarget->canUseLAHFSAHF() &&
5972 "Target doesn't support SAHF or FCOMI?");
5973 SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue());
5974 Chain = AH;
5975 SDValue SAHF = SDValue(
5976 CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0);
5977
5978 if (IsStrictCmp)
5979 ReplaceUses(F: SDValue(Node, 1), T: Chain);
5980
5981 ReplaceUses(F: SDValue(Node, 0), T: SAHF);
5982 CurDAG->RemoveDeadNode(N: Node);
5983 return;
5984 }
5985
5986 case X86ISD::CMP: {
5987 SDValue N0 = Node->getOperand(Num: 0);
5988 SDValue N1 = Node->getOperand(Num: 1);
5989
5990 // Optimizations for TEST compares.
5991 if (!isNullConstant(V: N1))
5992 break;
5993
5994 // Save the original VT of the compare.
5995 MVT CmpVT = N0.getSimpleValueType();
5996
    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5998 // by a test instruction. The test should be removed later by
5999 // analyzeCompare if we are using only the zero flag.
6000 // TODO: Should we check the users and use the BEXTR flags directly?
6001 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6002 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
6003 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6004 : X86::TEST32rr;
6005 SDValue BEXTR = SDValue(NewNode, 0);
6006 NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR);
6007 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6008 CurDAG->RemoveDeadNode(N: Node);
6009 return;
6010 }
6011 }
6012
6013 // We can peek through truncates, but we need to be careful below.
6014 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6015 N0 = N0.getOperand(i: 0);
6016
6017 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6018 // use a smaller encoding.
6019 // Look past the truncate if CMP is the only use of it.
6020 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6021 N0.getValueType() != MVT::i8) {
6022 auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
6023 if (!MaskC)
6024 break;
6025
6026 // We may have looked through a truncate so mask off any bits that
6027 // shouldn't be part of the compare.
6028 uint64_t Mask = MaskC->getZExtValue();
6029 Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());
6030
6031 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6032 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6033 // zero flag.
6034 if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) &&
6035 onlyUsesZeroFlag(Flags: SDValue(Node, 0))) {
6036 unsigned ShiftOpcode = ISD::DELETED_NODE;
6037 unsigned ShiftAmt;
6038 unsigned SubRegIdx;
6039 MVT SubRegVT;
6040 unsigned TestOpcode;
6041 unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
6042 unsigned TrailingZeros = llvm::countr_zero(Val: Mask);
6043
6044 // With leading/trailing zeros, the transform is profitable if we can
6045 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6046 // incurring any extra register moves.
6047 bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
6048 if (LeadingZeros == 0 && SavesBytes) {
6049 // If the mask covers the most significant bit, then we can replace
6050 // TEST+AND with a SHR and check eflags.
6051 // This emits a redundant TEST which is subsequently eliminated.
6052 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6053 ShiftAmt = TrailingZeros;
6054 SubRegIdx = 0;
6055 TestOpcode = X86::TEST64rr;
6056 } else if (TrailingZeros == 0 && SavesBytes) {
6057 // If the mask covers the least significant bit, then we can replace
6058 // TEST+AND with a SHL and check eflags.
6059 // This emits a redundant TEST which is subsequently eliminated.
6060 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6061 ShiftAmt = LeadingZeros;
6062 SubRegIdx = 0;
6063 TestOpcode = X86::TEST64rr;
6064 } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
6065 // If the shifted mask extends into the high half and is 8/16/32 bits
6066 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6067 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6068 if (PopCount == 8) {
6069 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6070 ShiftAmt = TrailingZeros;
6071 SubRegIdx = X86::sub_8bit;
6072 SubRegVT = MVT::i8;
6073 TestOpcode = X86::TEST8rr;
6074 } else if (PopCount == 16) {
6075 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6076 ShiftAmt = TrailingZeros;
6077 SubRegIdx = X86::sub_16bit;
6078 SubRegVT = MVT::i16;
6079 TestOpcode = X86::TEST16rr;
6080 } else if (PopCount == 32) {
6081 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6082 ShiftAmt = TrailingZeros;
6083 SubRegIdx = X86::sub_32bit;
6084 SubRegVT = MVT::i32;
6085 TestOpcode = X86::TEST32rr;
6086 }
6087 }
6088 if (ShiftOpcode != ISD::DELETED_NODE) {
6089 SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64);
6090 SDValue Shift = SDValue(
6091 CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32,
6092 Op1: N0.getOperand(i: 0), Op2: ShiftC),
6093 0);
6094 if (SubRegIdx != 0) {
6095 Shift =
6096 CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
6097 }
6098 MachineSDNode *Test =
6099 CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift);
6100 ReplaceNode(F: Node, T: Test);
6101 return;
6102 }
6103 }
6104
6105 MVT VT;
6106 int SubRegOp;
6107 unsigned ROpc, MOpc;
6108
6109 // For each of these checks we need to be careful if the sign flag is
6110 // being used. It is only safe to use the sign flag in two conditions,
6111 // either the sign bit in the shrunken mask is zero or the final test
6112 // size is equal to the original compare size.
6113
6114 if (isUInt<8>(x: Mask) &&
6115 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6116 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6117 // For example, convert "testl %eax, $8" to "testb %al, $8"
6118 VT = MVT::i8;
6119 SubRegOp = X86::sub_8bit;
6120 ROpc = X86::TEST8ri;
6121 MOpc = X86::TEST8mi;
6122 } else if (OptForMinSize && isUInt<16>(x: Mask) &&
6123 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6124 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6125 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6126 // NOTE: We only want to form TESTW instructions if optimizing for
6127 // min size. Otherwise we only save one byte and possibly get a length
6128 // changing prefix penalty in the decoders.
6129 VT = MVT::i16;
6130 SubRegOp = X86::sub_16bit;
6131 ROpc = X86::TEST16ri;
6132 MOpc = X86::TEST16mi;
6133 } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 &&
6134 ((!(Mask & 0x80000000) &&
6135 // Without minsize 16-bit Cmps can get here so we need to
6136 // be sure we calculate the correct sign flag if needed.
6137 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6138 CmpVT == MVT::i32 ||
6139 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6140 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6141 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
6143 // promotion. If previous passes did not promote the and, we assume
6144 // they had a good reason not to and do not promote here.
6145 VT = MVT::i32;
6146 SubRegOp = X86::sub_32bit;
6147 ROpc = X86::TEST32ri;
6148 MOpc = X86::TEST32mi;
6149 } else {
6150 // No eligible transformation was found.
6151 break;
6152 }
6153
6154 SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
6155 SDValue Reg = N0.getOperand(i: 0);
6156
6157 // Emit a testl or testw.
6158 MachineSDNode *NewNode;
6159 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6160 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
6161 if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
6162 if (!LoadN->isSimple()) {
6163 unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
6164 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6165 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6166 (MOpc == X86::TEST32mi && NumVolBits != 32))
6167 break;
6168 }
6169 }
6170 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6171 Reg.getOperand(i: 0) };
6172 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops);
6173 // Update the chain.
6174 ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
6175 // Record the mem-refs
6176 CurDAG->setNodeMemRefs(N: NewNode,
6177 NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
6178 } else {
6179 // Extract the subregister if necessary.
6180 if (N0.getValueType() != VT)
6181 Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);
6182
6183 NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm);
6184 }
6185 // Replace CMP with TEST.
6186 ReplaceNode(F: Node, T: NewNode);
6187 return;
6188 }
6189 break;
6190 }
6191 case X86ISD::PCMPISTR: {
6192 if (!Subtarget->hasSSE42())
6193 break;
6194
6195 bool NeedIndex = !SDValue(Node, 0).use_empty();
6196 bool NeedMask = !SDValue(Node, 1).use_empty();
6197 // We can't fold a load if we are going to make two instructions.
6198 bool MayFoldLoad = !NeedIndex || !NeedMask;
6199
6200 MachineSDNode *CNode;
6201 if (NeedMask) {
6202 unsigned ROpc =
6203 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6204 unsigned MOpc =
6205 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6206 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node);
6207 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6208 }
6209 if (NeedIndex || !NeedMask) {
6210 unsigned ROpc =
6211 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6212 unsigned MOpc =
6213 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6214 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node);
6215 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6216 }
6217
6218 // Connect the flag usage to the last instruction created.
6219 ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6220 CurDAG->RemoveDeadNode(N: Node);
6221 return;
6222 }
6223 case X86ISD::PCMPESTR: {
6224 if (!Subtarget->hasSSE42())
6225 break;
6226
6227 // Copy the two implicit register inputs.
6228 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX,
6229 N: Node->getOperand(Num: 1),
6230 Glue: SDValue()).getValue(R: 1);
6231 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX,
6232 N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1);
6233
6234 bool NeedIndex = !SDValue(Node, 0).use_empty();
6235 bool NeedMask = !SDValue(Node, 1).use_empty();
6236 // We can't fold a load if we are going to make two instructions.
6237 bool MayFoldLoad = !NeedIndex || !NeedMask;
6238
6239 MachineSDNode *CNode;
6240 if (NeedMask) {
6241 unsigned ROpc =
6242 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6243 unsigned MOpc =
6244 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6245 CNode =
6246 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue);
6247 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6248 }
6249 if (NeedIndex || !NeedMask) {
6250 unsigned ROpc =
6251 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6252 unsigned MOpc =
6253 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6254 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue);
6255 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6256 }
6257 // Connect the flag usage to the last instruction created.
6258 ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6259 CurDAG->RemoveDeadNode(N: Node);
6260 return;
6261 }
6262
6263 case ISD::SETCC: {
6264 if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue()))
6265 return;
6266
6267 break;
6268 }
6269
6270 case ISD::STORE:
6271 if (foldLoadStoreIntoMemOperand(Node))
6272 return;
6273 break;
6274
6275 case X86ISD::SETCC_CARRY: {
6276 MVT VT = Node->getSimpleValueType(ResNo: 0);
6277 SDValue Result;
6278 if (Subtarget->hasSBBDepBreaking()) {
6279 // We have to do this manually because tblgen will put the eflags copy in
6280 // the wrong place if we use an extract_subreg in the pattern.
6281 // Copy flags to the EFLAGS register and glue it to next node.
6282 SDValue EFLAGS =
6283 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
6284 N: Node->getOperand(Num: 1), Glue: SDValue());
6285
6286 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6287 // 32-bit version.
6288 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6289 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6290 Result = SDValue(
6291 CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)),
6292 0);
6293 } else {
6294 // The target does not recognize sbb with the same reg operand as a
6295 // no-source idiom, so we explicitly zero the input values.
6296 Result = getSBBZero(N: Node);
6297 }
6298
6299 // For less than 32-bits we need to extract from the 32-bit node.
6300 if (VT == MVT::i8 || VT == MVT::i16) {
6301 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6302 Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6303 }
6304
6305 ReplaceUses(F: SDValue(Node, 0), T: Result);
6306 CurDAG->RemoveDeadNode(N: Node);
6307 return;
6308 }
6309 case X86ISD::SBB: {
6310 if (isNullConstant(V: Node->getOperand(Num: 0)) &&
6311 isNullConstant(V: Node->getOperand(Num: 1))) {
6312 SDValue Result = getSBBZero(N: Node);
6313
6314 // Replace the flag use.
6315 ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1));
6316
6317 // Replace the result use.
6318 if (!SDValue(Node, 0).use_empty()) {
6319 // For less than 32-bits we need to extract from the 32-bit node.
6320 MVT VT = Node->getSimpleValueType(ResNo: 0);
6321 if (VT == MVT::i8 || VT == MVT::i16) {
6322 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6323 Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6324 }
6325 ReplaceUses(F: SDValue(Node, 0), T: Result);
6326 }
6327
6328 CurDAG->RemoveDeadNode(N: Node);
6329 return;
6330 }
6331 break;
6332 }
6333 case X86ISD::MGATHER: {
6334 auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node);
6335 SDValue IndexOp = Mgt->getIndex();
6336 SDValue Mask = Mgt->getMask();
6337 MVT IndexVT = IndexOp.getSimpleValueType();
6338 MVT ValueVT = Node->getSimpleValueType(ResNo: 0);
6339 MVT MaskVT = Mask.getSimpleValueType();
6340
6341 // This is just to prevent crashes if the nodes are malformed somehow. We're
6342 // otherwise only doing loose type checking in here based on type what
6343 // a type constraint would say just like table based isel.
6344 if (!ValueVT.isVector() || !MaskVT.isVector())
6345 break;
6346
6347 unsigned NumElts = ValueVT.getVectorNumElements();
6348 MVT ValueSVT = ValueVT.getVectorElementType();
6349
6350 bool IsFP = ValueSVT.isFloatingPoint();
6351 unsigned EltSize = ValueSVT.getSizeInBits();
6352
6353 unsigned Opc = 0;
6354 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6355 if (AVX512Gather) {
6356 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6357 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6358 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6359 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6360 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6361 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6362 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6363 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6364 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6365 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6366 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6367 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6368 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6369 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6370 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6371 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6372 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6373 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6374 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6375 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6376 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6377 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6378 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6379 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6380 } else {
6381 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6382 "Unexpected mask VT!");
6383 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6384 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6385 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6386 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6387 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6388 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6389 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6390 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6391 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6392 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6393 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6394 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6395 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6396 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6397 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6398 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6399 }
6400
6401 if (!Opc)
6402 break;
6403
6404 SDValue Base, Scale, Index, Disp, Segment;
6405 if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(),
6406 Base, Scale, Index, Disp, Segment))
6407 break;
6408
6409 SDValue PassThru = Mgt->getPassThru();
6410 SDValue Chain = Mgt->getChain();
6411 // Gather instructions have a mask output not in the ISD node.
6412 SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other);
6413
6414 MachineSDNode *NewNode;
6415 if (AVX512Gather) {
6416 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6417 Index, Disp, Segment, Chain};
6418 NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6419 } else {
6420 SDValue Ops[] = {PassThru, Base, Scale, Index,
6421 Disp, Segment, Mask, Chain};
6422 NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6423 }
6424 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()});
6425 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6426 ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2));
6427 CurDAG->RemoveDeadNode(N: Node);
6428 return;
6429 }
6430 case X86ISD::MSCATTER: {
6431 auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node);
6432 SDValue Value = Sc->getValue();
6433 SDValue IndexOp = Sc->getIndex();
6434 MVT IndexVT = IndexOp.getSimpleValueType();
6435 MVT ValueVT = Value.getSimpleValueType();
6436
6437 // This is just to prevent crashes if the nodes are malformed somehow. We're
6438 // otherwise only doing loose type checking in here based on type what
6439 // a type constraint would say just like table based isel.
6440 if (!ValueVT.isVector())
6441 break;
6442
6443 unsigned NumElts = ValueVT.getVectorNumElements();
6444 MVT ValueSVT = ValueVT.getVectorElementType();
6445
6446 bool IsFP = ValueSVT.isFloatingPoint();
6447 unsigned EltSize = ValueSVT.getSizeInBits();
6448
6449 unsigned Opc;
6450 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6451 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6452 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6453 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6454 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6455 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6456 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6457 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6458 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6459 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6460 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6461 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6462 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6463 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6464 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6465 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6466 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6467 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6468 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6469 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6470 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6471 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6472 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6473 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6474 else
6475 break;
6476
6477 SDValue Base, Scale, Index, Disp, Segment;
6478 if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(),
6479 Base, Scale, Index, Disp, Segment))
6480 break;
6481
6482 SDValue Mask = Sc->getMask();
6483 SDValue Chain = Sc->getChain();
6484 // Scatter instructions have a mask output not in the ISD node.
6485 SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other);
6486 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6487
6488 MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6489 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()});
6490 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1));
6491 CurDAG->RemoveDeadNode(N: Node);
6492 return;
6493 }
6494 case ISD::PREALLOCATED_SETUP: {
6495 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6496 auto CallId = MFI->getPreallocatedIdForCallSite(
6497 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6498 SDValue Chain = Node->getOperand(Num: 0);
6499 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6500 MachineSDNode *New = CurDAG->getMachineNode(
6501 Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain);
6502 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain
6503 CurDAG->RemoveDeadNode(N: Node);
6504 return;
6505 }
6506 case ISD::PREALLOCATED_ARG: {
6507 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6508 auto CallId = MFI->getPreallocatedIdForCallSite(
6509 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6510 SDValue Chain = Node->getOperand(Num: 0);
6511 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6512 SDValue ArgIndex = Node->getOperand(Num: 2);
6513 SDValue Ops[3];
6514 Ops[0] = CallIdValue;
6515 Ops[1] = ArgIndex;
6516 Ops[2] = Chain;
6517 MachineSDNode *New = CurDAG->getMachineNode(
6518 Opcode: TargetOpcode::PREALLOCATED_ARG, dl,
6519 VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()),
6520 VT2: MVT::Other),
6521 Ops);
6522 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer
6523 ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain
6524 CurDAG->RemoveDeadNode(N: Node);
6525 return;
6526 }
6527 case X86ISD::AESENCWIDE128KL:
6528 case X86ISD::AESDECWIDE128KL:
6529 case X86ISD::AESENCWIDE256KL:
6530 case X86ISD::AESDECWIDE256KL: {
6531 if (!Subtarget->hasWIDEKL())
6532 break;
6533
6534 unsigned Opcode;
6535 switch (Node->getOpcode()) {
6536 default:
6537 llvm_unreachable("Unexpected opcode!");
6538 case X86ISD::AESENCWIDE128KL:
6539 Opcode = X86::AESENCWIDE128KL;
6540 break;
6541 case X86ISD::AESDECWIDE128KL:
6542 Opcode = X86::AESDECWIDE128KL;
6543 break;
6544 case X86ISD::AESENCWIDE256KL:
6545 Opcode = X86::AESENCWIDE256KL;
6546 break;
6547 case X86ISD::AESDECWIDE256KL:
6548 Opcode = X86::AESDECWIDE256KL;
6549 break;
6550 }
6551
6552 SDValue Chain = Node->getOperand(Num: 0);
6553 SDValue Addr = Node->getOperand(Num: 1);
6554
6555 SDValue Base, Scale, Index, Disp, Segment;
6556 if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment))
6557 break;
6558
6559 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2),
6560 Glue: SDValue());
6561 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3),
6562 Glue: Chain.getValue(R: 1));
6563 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4),
6564 Glue: Chain.getValue(R: 1));
6565 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5),
6566 Glue: Chain.getValue(R: 1));
6567 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6),
6568 Glue: Chain.getValue(R: 1));
6569 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7),
6570 Glue: Chain.getValue(R: 1));
6571 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8),
6572 Glue: Chain.getValue(R: 1));
6573 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9),
6574 Glue: Chain.getValue(R: 1));
6575
6576 MachineSDNode *Res = CurDAG->getMachineNode(
6577 Opcode, dl, VTs: Node->getVTList(),
6578 Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)});
6579 CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand());
6580 ReplaceNode(F: Node, T: Res);
6581 return;
6582 }
6583 }
6584
6585 SelectCode(N: Node);
6586}
6587
6588bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6589 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6590 std::vector<SDValue> &OutOps) {
6591 SDValue Op0, Op1, Op2, Op3, Op4;
6592 switch (ConstraintID) {
6593 default:
6594 llvm_unreachable("Unexpected asm memory constraint");
6595 case InlineAsm::ConstraintCode::o: // offsetable ??
6596 case InlineAsm::ConstraintCode::v: // not offsetable ??
6597 case InlineAsm::ConstraintCode::m: // memory
6598 case InlineAsm::ConstraintCode::X:
6599 case InlineAsm::ConstraintCode::p: // address
6600 if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4))
6601 return true;
6602 break;
6603 }
6604
6605 OutOps.push_back(x: Op0);
6606 OutOps.push_back(x: Op1);
6607 OutOps.push_back(x: Op2);
6608 OutOps.push_back(x: Op3);
6609 OutOps.push_back(x: Op4);
6610 return false;
6611}
6612
6613X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6614 : SelectionDAGISelPass(
6615 std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
6616
6617/// This pass converts a legalized DAG into a X86-specific DAG,
6618/// ready for instruction scheduling.
6619FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6620 CodeGenOptLevel OptLevel) {
6621 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6622}
6623