1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to an X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
16#include "X86MachineFunctionInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/KnownBits.h"
33#include "llvm/Support/MathExtras.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(Val: true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
46
47static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(Val: true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
51extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
  /// This corresponds to X86AddressMode, but uses SDValues instead of register
  /// numbers for the leaves of the matched tree.
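  /// As an illustration, this models the general x86 memory operand
  ///   Segment : [Base_Reg + Scale * IndexReg + Disp]
  /// where a symbolic operand (GV, CP, ES, MCSym, JT, or BlockAddr), if
  /// present, is folded into the displacement.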
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 Kind: "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
188 }
189
190 void emitFunctionEntryCode() override;
191
192 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
193
194 void PreprocessISelDAG() override;
195 void PostprocessISelDAG() override;
196
197// Include the pieces autogenerated from the target description.
198#include "X86GenDAGISel.inc"
199
200 private:
201 void Select(SDNode *N) override;
202
203 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
204 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
205 bool AllowSegmentRegForX32 = false);
206 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
207 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
208 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
209 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
210 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
211 unsigned Depth);
212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
217 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
218 SDValue &Scale, SDValue &Index, SDValue &Disp,
219 SDValue &Segment);
220 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
221 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
222 SDValue &Index, SDValue &Disp, SDValue &Segment);
223 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
224 bool selectLEAAddr(SDValue N, SDValue &Base,
225 SDValue &Scale, SDValue &Index, SDValue &Disp,
226 SDValue &Segment);
227 bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
228 SDValue &Index, SDValue &Disp, SDValue &Segment);
229 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectRelocImm(SDValue N, SDValue &Op);
233
234 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
235 SDValue &Base, SDValue &Scale,
236 SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238
239 // Convenience method where P is also root.
240 bool tryFoldLoad(SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment) {
244 return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment);
245 }
246
247 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
248 SDValue &Base, SDValue &Scale,
249 SDValue &Index, SDValue &Disp,
250 SDValue &Segment);
251
252 bool isProfitableToFormMaskedOp(SDNode *N) const;
253
254 /// Implement addressing mode selection for inline asm expressions.
255 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
256 InlineAsm::ConstraintCode ConstraintID,
257 std::vector<SDValue> &OutOps) override;
258
259 void emitSpecialCodeForMain();
260
261 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
262 MVT VT, SDValue &Base, SDValue &Scale,
263 SDValue &Index, SDValue &Disp,
264 SDValue &Segment) {
265 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
266 Base = CurDAG->getTargetFrameIndex(
267 FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout()));
268 else if (AM.Base_Reg.getNode())
269 Base = AM.Base_Reg;
270 else
271 Base = CurDAG->getRegister(Reg: 0, VT);
272
273 Scale = getI8Imm(Imm: AM.Scale, DL);
274
275#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
276 // Negate the index if needed.
277 if (AM.NegateIndex) {
278 unsigned NegOpc;
279 switch (VT.SimpleTy) {
280 default:
281 llvm_unreachable("Unsupported VT!");
282 case MVT::i64:
283 NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
284 break;
285 case MVT::i32:
286 NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
287 break;
288 case MVT::i16:
289 NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
290 break;
291 case MVT::i8:
292 NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
293 break;
294 }
295 SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32,
296 Ops: AM.IndexReg), 0);
297 AM.IndexReg = Neg;
298 }
299
300 if (AM.IndexReg.getNode())
301 Index = AM.IndexReg;
302 else
303 Index = CurDAG->getRegister(Reg: 0, VT);
304
305 // These are 32-bit even in 64-bit mode since RIP-relative offset
306 // is 32-bit.
307 if (AM.GV)
308 Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(),
309 VT: MVT::i32, offset: AM.Disp,
310 TargetFlags: AM.SymbolFlags);
311 else if (AM.CP)
312 Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment,
313 Offset: AM.Disp, TargetFlags: AM.SymbolFlags);
314 else if (AM.ES) {
315 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
316 Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
317 } else if (AM.MCSym) {
318 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
319 assert(AM.SymbolFlags == 0 && "oo");
320 Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32);
321 } else if (AM.JT != -1) {
322 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
323 Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
324 } else if (AM.BlockAddr)
325 Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp,
326 TargetFlags: AM.SymbolFlags);
327 else
328 Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32);
329
330 if (AM.Segment.getNode())
331 Segment = AM.Segment;
332 else
333 Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
334 }
335
    // Utility function to determine whether N is an AMX SDNode right after
    // lowering but before ISel.
338 bool isAMXSDNode(SDNode *N) const {
339 // Check if N is AMX SDNode:
340 // 1. check specific opcode since these carry MVT::Untyped instead of
341 // x86amx_type;
342 // 2. check result type;
343 // 3. check operand type;
344 switch (N->getOpcode()) {
345 default:
346 break;
347 case X86::PT2RPNTLVWZ0V:
348 case X86::PT2RPNTLVWZ0T1V:
349 case X86::PT2RPNTLVWZ1V:
350 case X86::PT2RPNTLVWZ1T1V:
351 case X86::PT2RPNTLVWZ0RSV:
352 case X86::PT2RPNTLVWZ0RST1V:
353 case X86::PT2RPNTLVWZ1RSV:
354 case X86::PT2RPNTLVWZ1RST1V:
355 return true;
356 }
357 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
358 if (N->getValueType(ResNo: Idx) == MVT::x86amx)
359 return true;
360 }
361 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
362 SDValue Op = N->getOperand(Num: Idx);
363 if (Op.getValueType() == MVT::x86amx)
364 return true;
365 }
366 return false;
367 }
368
    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size.
    // At a high level, we'd like to avoid such instructions when the same
    // constant is used by several instructions within a basic block and
    // could instead be kept in a register.
    //
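    // For example (illustrative only): if
    //   andl $0x12345678, %eax
    //   andl $0x12345678, %ebx
    // both appear in a block, each instruction encodes the 4-byte immediate;
    // materializing the constant once in a register and using register forms
    // can be smaller overall when optimizing for size.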
375 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
376 uint32_t UseCount = 0;
377
378 // Do not want to hoist if we're not optimizing for size.
379 // TODO: We'd like to remove this restriction.
380 // See the comment in X86InstrInfo.td for more info.
381 if (!CurDAG->shouldOptForSize())
382 return false;
383
384 // Walk all the users of the immediate.
385 for (const SDNode *User : N->users()) {
386 if (UseCount >= 2)
387 break;
388
389 // This user is already selected. Count it as a legitimate use and
390 // move on.
391 if (User->isMachineOpcode()) {
392 UseCount++;
393 continue;
394 }
395
396 // We want to count stores of immediates as real uses.
397 if (User->getOpcode() == ISD::STORE &&
398 User->getOperand(Num: 1).getNode() == N) {
399 UseCount++;
400 continue;
401 }
402
      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above).
      // Those instructions won't match in ISel, for now, and would
      // be counted incorrectly.
407 // This may change in the future as we add additional instruction
408 // types.
409 if (User->getNumOperands() != 2)
410 continue;
411
412 // If this is a sign-extended 8-bit integer immediate used in an ALU
413 // instruction, there is probably an opcode encoding to save space.
414 auto *C = dyn_cast<ConstantSDNode>(Val: N);
415 if (C && isInt<8>(x: C->getSExtValue()))
416 continue;
417
418 // Immediates that are used for offsets as part of stack
419 // manipulation should be left alone. These are typically
420 // used to indicate SP offsets for argument passing and
421 // will get pulled into stores/pushes (implicitly).
422 if (User->getOpcode() == X86ISD::ADD ||
423 User->getOpcode() == ISD::ADD ||
424 User->getOpcode() == X86ISD::SUB ||
425 User->getOpcode() == ISD::SUB) {
426
427 // Find the other operand of the add/sub.
428 SDValue OtherOp = User->getOperand(Num: 0);
429 if (OtherOp.getNode() == N)
430 OtherOp = User->getOperand(Num: 1);
431
432 // Don't count if the other operand is SP.
433 RegisterSDNode *RegNode;
434 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
435 (RegNode = dyn_cast_or_null<RegisterSDNode>(
436 Val: OtherOp->getOperand(Num: 1).getNode())))
437 if ((RegNode->getReg() == X86::ESP) ||
438 (RegNode->getReg() == X86::RSP))
439 continue;
440 }
441
442 // ... otherwise, count this and move on.
443 UseCount++;
444 }
445
446 // If we have more than 1 use, then recommend for hoisting.
447 return (UseCount > 1);
448 }
449
450 /// Return a target constant with the specified value of type i8.
451 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
452 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
453 }
454
455 /// Return a target constant with the specified value, of type i32.
456 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
457 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32);
458 }
459
460 /// Return a target constant with the specified value, of type i64.
461 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
462 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64);
463 }
464
465 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
466 const SDLoc &DL) {
467 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
468 uint64_t Index = N->getConstantOperandVal(Num: 1);
469 MVT VecVT = N->getOperand(Num: 0).getSimpleValueType();
470 return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
471 }
472
473 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
474 const SDLoc &DL) {
475 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
476 uint64_t Index = N->getConstantOperandVal(Num: 2);
477 MVT VecVT = N->getSimpleValueType(ResNo: 0);
478 return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
479 }
480
481 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
482 const SDLoc &DL) {
483 assert(VecWidth == 128 && "Unexpected vector width");
484 uint64_t Index = N->getConstantOperandVal(Num: 2);
485 MVT VecVT = N->getSimpleValueType(ResNo: 0);
486 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
487 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
488 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
489 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
490 return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL);
491 }
492
493 SDValue getSBBZero(SDNode *N) {
494 SDLoc dl(N);
495 MVT VT = N->getSimpleValueType(ResNo: 0);
496
497 // Create zero.
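      // (MOV32r0 is the xor-zero idiom; for i64 results it is widened with
      // SUBREG_TO_REG below, relying on the implicit zero-extension of 32-bit
      // operations.)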
498 SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
499 SDValue Zero =
500 SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
501 if (VT == MVT::i64) {
502 Zero = SDValue(
503 CurDAG->getMachineNode(
504 Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64,
505 Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero,
506 Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
507 0);
508 }
509
510 // Copy flags to the EFLAGS register and glue it to next node.
511 unsigned Opcode = N->getOpcode();
512 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
513 "Unexpected opcode for SBB materialization");
514 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
515 SDValue EFLAGS =
516 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
517 N: N->getOperand(Num: FlagOpIndex), Glue: SDValue());
518
519 // Create a 64-bit instruction if the result is 64-bits otherwise use the
520 // 32-bit version.
521 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
522 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
523 VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32);
524 return SDValue(
525 CurDAG->getMachineNode(Opcode: Opc, dl, VTs,
526 Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}),
527 0);
528 }
529
    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
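    // For example, with Width == 5 (a 32-bit shift amount), the mask in
    //   (srl X, (and Y, 31))
    // has at least 5 trailing ones, so the AND is unneeded: the shift
    // instruction only consumes the low 5 bits of the amount anyway.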
532 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
533 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
534 const APInt &Val = N->getConstantOperandAPInt(Num: 1);
535
536 if (Val.countr_one() >= Width)
537 return true;
538
539 APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
540 return Mask.countr_one() >= Width;
541 }
542
543 /// Return an SDNode that returns the value of the global base register.
544 /// Output instructions required to initialize the global base register,
545 /// if necessary.
546 SDNode *getGlobalBaseReg();
547
548 /// Return a reference to the TargetMachine, casted to the target-specific
549 /// type.
550 const X86TargetMachine &getTargetMachine() const {
551 return static_cast<const X86TargetMachine &>(TM);
552 }
553
554 /// Return a reference to the TargetInstrInfo, casted to the target-specific
555 /// type.
556 const X86InstrInfo *getInstrInfo() const {
557 return Subtarget->getInstrInfo();
558 }
559
560 /// Return a condition code of the given SDNode
561 X86::CondCode getCondFromNode(SDNode *N) const;
562
563 /// Address-mode matching performs shift-of-and to and-of-shift
564 /// reassociation in order to expose more scaled addressing
565 /// opportunities.
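    /// For example (sketch):
    ///   (shl (and X, 0xff), 3) --> (and (shl X, 3), 0x7f8)
    /// which lets the shift be folded into the addressing-mode scale.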
566 bool ComplexPatternFuncMutatesDAG() const override {
567 return true;
568 }
569
570 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
571
572 // Indicates we should prefer to use a non-temporal load for this load.
573 bool useNonTemporalLoad(LoadSDNode *N) const {
574 if (!N->isNonTemporal())
575 return false;
576
577 unsigned StoreSize = N->getMemoryVT().getStoreSize();
578
579 if (N->getAlign().value() < StoreSize)
580 return false;
581
582 switch (StoreSize) {
583 default: llvm_unreachable("Unsupported store size");
584 case 4:
585 case 8:
586 return false;
587 case 16:
588 return Subtarget->hasSSE41();
589 case 32:
590 return Subtarget->hasAVX2();
591 case 64:
592 return Subtarget->hasAVX512();
593 }
594 }
595
596 bool foldLoadStoreIntoMemOperand(SDNode *Node);
597 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
598 bool matchBitExtract(SDNode *Node);
599 bool shrinkAndImmediate(SDNode *N);
600 bool isMaskZeroExtended(SDNode *N) const;
601 bool tryShiftAmountMod(SDNode *N);
602 bool tryShrinkShlLogicImm(SDNode *N);
603 bool tryVPTERNLOG(SDNode *N);
604 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
605 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
606 uint8_t Imm);
607 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
608 bool tryMatchBitSelect(SDNode *N);
609
610 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
611 const SDLoc &dl, MVT VT, SDNode *Node);
612 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
613 const SDLoc &dl, MVT VT, SDNode *Node,
614 SDValue &InGlue);
615
616 bool tryOptimizeRem8Extend(SDNode *N);
617
618 bool onlyUsesZeroFlag(SDValue Flags) const;
619 bool hasNoSignFlagUses(SDValue Flags) const;
620 bool hasNoCarryFlagUses(SDValue Flags) const;
621 };
622
623 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
624 public:
625 static char ID;
626 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
627 CodeGenOptLevel OptLevel)
628 : SelectionDAGISelLegacy(
629 ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {}
630 };
631}
632
633char X86DAGToDAGISelLegacy::ID = 0;
634
635INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
636
637// Returns true if this masked compare can be implemented legally with this
638// type.
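// For example, a 256-bit X86ISD::CMPM (say, comparing two v8i32 values) only
// yields a properly zero-extended mask when VLX is available; without VLX the
// compare is widened to 512 bits and the upper mask bits are not guaranteed
// to be zero.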
639static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
640 unsigned Opcode = N->getOpcode();
641 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
642 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
643 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
644 // We can get 256-bit 8 element types here without VLX being enabled. When
645 // this happens we will use 512-bit operations and the mask will not be
646 // zero extended.
647 EVT OpVT = N->getOperand(Num: 0).getValueType();
648 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
649 // second operand.
650 if (Opcode == X86ISD::STRICT_CMPM)
651 OpVT = N->getOperand(Num: 1).getValueType();
652 if (OpVT.is256BitVector() || OpVT.is128BitVector())
653 return Subtarget->hasVLX();
654
655 return true;
656 }
657 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
658 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
659 Opcode == X86ISD::FSETCCM_SAE)
660 return true;
661
662 return false;
663}
664
665// Returns true if we can assume the writer of the mask has zero extended it
666// for us.
667bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
668 // If this is an AND, check if we have a compare on either side. As long as
669 // one side guarantees the mask is zero extended, the AND will preserve those
670 // zeros.
671 if (N->getOpcode() == ISD::AND)
672 return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) ||
673 isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget);
674
675 return isLegalMaskCompare(N, Subtarget);
676}
677
678bool
679X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
680 if (OptLevel == CodeGenOptLevel::None)
681 return false;
682
683 if (!N.hasOneUse())
684 return false;
685
686 if (N.getOpcode() != ISD::LOAD)
687 return true;
688
689 // Don't fold non-temporal loads if we have an instruction for them.
690 if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N)))
691 return false;
692
693 // If N is a load, do additional profitability checks.
694 if (U == Root) {
695 switch (U->getOpcode()) {
696 default: break;
697 case X86ISD::ADD:
698 case X86ISD::ADC:
699 case X86ISD::SUB:
700 case X86ISD::SBB:
701 case X86ISD::AND:
702 case X86ISD::XOR:
703 case X86ISD::OR:
704 case ISD::ADD:
705 case ISD::UADDO_CARRY:
706 case ISD::AND:
707 case ISD::OR:
708 case ISD::XOR: {
709 SDValue Op1 = U->getOperand(Num: 1);
710
      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
713 // e.g.
714 // movl 4(%esp), %eax
715 // addl $4, %eax
716 // vs.
717 // movl $4, %eax
718 // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
721 if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) {
722 if (Imm->getAPIntValue().isSignedIntN(N: 8))
723 return false;
724
        // If this is a 64-bit AND with an immediate that fits in 32 bits,
        // prefer using the smaller AND over folding the load. This is needed to
727 // make sure immediates created by shrinkAndImmediate are always folded.
728 // Ideally we would narrow the load during DAG combine and get the
729 // best of both worlds.
730 if (U->getOpcode() == ISD::AND &&
731 Imm->getAPIntValue().getBitWidth() == 64 &&
732 Imm->getAPIntValue().isIntN(N: 32))
733 return false;
734
        // If this is really a zext_inreg that can be represented with a movzx
736 // instruction, prefer that.
737 // TODO: We could shrink the load and fold if it is non-volatile.
738 if (U->getOpcode() == ISD::AND &&
739 (Imm->getAPIntValue() == UINT8_MAX ||
740 Imm->getAPIntValue() == UINT16_MAX ||
741 Imm->getAPIntValue() == UINT32_MAX))
742 return false;
743
        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
746 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
747 (-Imm->getAPIntValue()).isSignedIntN(N: 8))
748 return false;
749
750 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
751 (-Imm->getAPIntValue()).isSignedIntN(N: 8) &&
752 hasNoCarryFlagUses(Flags: SDValue(U, 1)))
753 return false;
754 }
755
756 // If the other operand is a TLS address, we should fold it instead.
757 // This produces
758 // movl %gs:0, %eax
759 // leal i@NTPOFF(%eax), %eax
760 // instead of
761 // movl $i@NTPOFF, %eax
762 // addl %gs:0, %eax
763 // if the block also has an access to a second TLS address this will save
764 // a load.
765 // FIXME: This is probably also true for non-TLS addresses.
766 if (Op1.getOpcode() == X86ISD::Wrapper) {
767 SDValue Val = Op1.getOperand(i: 0);
768 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
769 return false;
770 }
771
772 // Don't fold load if this matches the BTS/BTR/BTC patterns.
773 // BTS: (or X, (shl 1, n))
774 // BTR: (and X, (rotl -2, n))
775 // BTC: (xor X, (shl 1, n))
776 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
777 if (U->getOperand(Num: 0).getOpcode() == ISD::SHL &&
778 isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0)))
779 return false;
780
781 if (U->getOperand(Num: 1).getOpcode() == ISD::SHL &&
782 isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0)))
783 return false;
784 }
785 if (U->getOpcode() == ISD::AND) {
786 SDValue U0 = U->getOperand(Num: 0);
787 SDValue U1 = U->getOperand(Num: 1);
788 if (U0.getOpcode() == ISD::ROTL) {
789 auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0));
790 if (C && C->getSExtValue() == -2)
791 return false;
792 }
793
794 if (U1.getOpcode() == ISD::ROTL) {
795 auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0));
796 if (C && C->getSExtValue() == -2)
797 return false;
798 }
799 }
800
801 break;
802 }
803 case ISD::SHL:
804 case ISD::SRA:
805 case ISD::SRL:
806 // Don't fold a load into a shift by immediate. The BMI2 instructions
807 // support folding a load, but not an immediate. The legacy instructions
808 // support folding an immediate, but can't fold a load. Folding an
809 // immediate is preferable to folding a load.
810 if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
811 return false;
812
813 break;
814 }
815 }
816
  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
819 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
820 isNullConstant(V: Root->getOperand(Num: 2)) &&
821 (Root->getOperand(Num: 0).isUndef() ||
822 ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode())))
823 return false;
824
825 return true;
826}
827
// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a masked register-register move or vblendm, and the
// operation will be selected separately.
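// For example (sketch), (vselect M, (add X, Y), PassThru) can become a single
// masked VPADDD, but if the (add X, Y) node has other users, folding it into
// the masked instruction would duplicate the add, so we return false and let
// a blend-style select handle it instead.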
831bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
832 assert(
833 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
834 "Unexpected opcode!");
835
836 // If the operation has additional users, the operation will be duplicated.
837 // Check the use count to prevent that.
838 // FIXME: Are there cheap opcodes we might want to duplicate?
839 return N->getOperand(Num: 1).hasOneUse();
840}
841
/// Replace the original chain operand of the call with the load's chain
/// operand and move the load below the call's chain operand.
844static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
845 SDValue Call, SDValue OrigChain) {
846 SmallVector<SDValue, 8> Ops;
847 SDValue Chain = OrigChain.getOperand(i: 0);
848 if (Chain.getNode() == Load.getNode())
849 Ops.push_back(Elt: Load.getOperand(i: 0));
850 else {
851 assert(Chain.getOpcode() == ISD::TokenFactor &&
852 "Unexpected chain operand");
853 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
854 if (Chain.getOperand(i).getNode() == Load.getNode())
855 Ops.push_back(Elt: Load.getOperand(i: 0));
856 else
857 Ops.push_back(Elt: Chain.getOperand(i));
858 SDValue NewChain =
859 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops);
860 Ops.clear();
861 Ops.push_back(Elt: NewChain);
862 }
863 Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
864 CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
865 CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
866 Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));
867
868 Ops.clear();
869 Ops.push_back(Elt: SDValue(Load.getNode(), 1));
870 Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
871 CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
872}
873
874/// Return true if call address is a load and it can be
875/// moved below CALLSEQ_START and the chains leading up to the call.
876/// Return the CALLSEQ_START by reference as a second output.
877/// In the case of a tail call, there isn't a callseq node between the call
878/// chain and the load.
879static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
880 // The transformation is somewhat dangerous if the call's chain was glued to
881 // the call. After MoveBelowOrigChain the load is moved between the call and
882 // the chain, this can create a cycle if the load is not folded. So it is
883 // *really* important that we are sure the load will be folded.
884 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
885 return false;
886 auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
887 if (!LD ||
888 !LD->isSimple() ||
889 LD->getAddressingMode() != ISD::UNINDEXED ||
890 LD->getExtensionType() != ISD::NON_EXTLOAD)
891 return false;
892
893 // Now let's find the callseq_start.
894 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
895 if (!Chain.hasOneUse())
896 return false;
897 Chain = Chain.getOperand(i: 0);
898 }
899
900 if (!Chain.getNumOperands())
901 return false;
902 // Since we are not checking for AA here, conservatively abort if the chain
903 // writes to memory. It's not safe to move the callee (a load) across a store.
904 if (isa<MemSDNode>(Val: Chain.getNode()) &&
905 cast<MemSDNode>(Val: Chain.getNode())->writeMem())
906 return false;
907 if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
908 return true;
909 if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
910 Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
911 Callee.getValue(R: 1).hasOneUse())
912 return true;
913 return false;
914}
915
916static bool isEndbrImm64(uint64_t Imm) {
917// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
919 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
920 return false;
921
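  // Legacy prefixes that may legitimately appear before the ENDBR pattern:
  // segment overrides (0x26/0x2e/0x36/0x3e/0x64/0x65), operand-size 0x66,
  // address-size 0x67, LOCK 0xf0 and REPNE 0xf2.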
922 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
923 0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
925 while (i < 64) {
926 uint8_t Byte = (Imm >> i) & 0xFF;
927 if (Byte == 0xF3)
928 return true;
929 if (!llvm::is_contained(Range&: OptionalPrefixBytes, Element: Byte))
930 return false;
931 i += 8;
932 }
933
934 return false;
935}
936
937static bool needBWI(MVT VT) {
938 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
939}
940
941void X86DAGToDAGISel::PreprocessISelDAG() {
942 bool MadeChange = false;
943 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
944 E = CurDAG->allnodes_end(); I != E; ) {
945 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
946
947 // This is for CET enhancement.
948 //
949 // ENDBR32 and ENDBR64 have specific opcodes:
950 // ENDBR32: F3 0F 1E FB
951 // ENDBR64: F3 0F 1E FA
    // We do not want attackers to find unintended ENDBR32/64 opcode
    // matches in the binary.
    // Here's an example:
955 // If the compiler had to generate asm for the following code:
956 // a = 0xF30F1EFA
957 // it could, for example, generate:
958 // mov 0xF30F1EFA, dword ptr[a]
959 // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so that it does not show up in the binary.
962 if (N->getOpcode() == ISD::Constant) {
963 MVT VT = N->getSimpleValueType(ResNo: 0);
964 int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
965 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
966 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
967 // Check that the cf-protection-branch is enabled.
968 Metadata *CFProtectionBranch =
969 MF->getFunction().getParent()->getModuleFlag(
970 Key: "cf-protection-branch");
971 if (CFProtectionBranch || IndirectBranchTracking) {
972 SDLoc dl(N);
973 SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true);
974 Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT);
975 --I;
976 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement);
977 ++I;
978 MadeChange = true;
979 continue;
980 }
981 }
982 }
983
984 // If this is a target specific AND node with no flag usages, turn it back
985 // into ISD::AND to enable test instruction matching.
986 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) {
987 SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
988 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
989 --I;
990 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
991 ++I;
992 MadeChange = true;
993 continue;
994 }
995
996 // Convert vector increment or decrement to sub/add with an all-ones
997 // constant:
998 // add X, <1, 1...> --> sub X, <-1, -1...>
999 // sub X, <1, 1...> --> add X, <-1, -1...>
1000 // The all-ones vector constant can be materialized using a pcmpeq
1001 // instruction that is commonly recognized as an idiom (has no register
1002 // dependency), so that's better/smaller than loading a splat 1 constant.
1003 //
1004 // But don't do this if it would inhibit a potentially profitable load
1005 // folding opportunity for the other operand. That only occurs with the
1006 // intersection of:
1007 // (1) The other operand (op0) is load foldable.
1008 // (2) The op is an add (otherwise, we are *creating* an add and can still
1009 // load fold the other op).
1010 // (3) The target has AVX (otherwise, we have a destructive add and can't
1011 // load fold the other op without killing the constant op).
1012 // (4) The constant 1 vector has multiple uses (so it is profitable to load
1013 // into a register anyway).
1014 auto mayPreventLoadFold = [&]() {
1015 return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) &&
1016 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1017 !N->getOperand(Num: 1).hasOneUse();
1018 };
1019 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1020 N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) {
1021 APInt SplatVal;
1022 if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) &&
1023 SplatVal.isOne()) {
1024 SDLoc DL(N);
1025
1026 MVT VT = N->getSimpleValueType(ResNo: 0);
1027 unsigned NumElts = VT.getSizeInBits() / 32;
1028 SDValue AllOnes =
1029 CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts));
1030 AllOnes = CurDAG->getBitcast(VT, V: AllOnes);
1031
1032 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1033 SDValue Res =
1034 CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes);
1035 --I;
1036 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1037 ++I;
1038 MadeChange = true;
1039 continue;
1040 }
1041 }
1042
1043 switch (N->getOpcode()) {
1044 case X86ISD::VBROADCAST: {
1045 MVT VT = N->getSimpleValueType(ResNo: 0);
1046 // Emulate v32i16/v64i8 broadcast without BWI.
1047 if (!Subtarget->hasBWI() && needBWI(VT)) {
1048 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1049 SDLoc dl(N);
1050 SDValue NarrowBCast =
1051 CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0));
1052 SDValue Res =
1053 CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
1054 N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1055 unsigned Index = NarrowVT.getVectorMinNumElements();
1056 Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
1057 N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));
1058
1059 --I;
1060 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1061 ++I;
1062 MadeChange = true;
1063 continue;
1064 }
1065
1066 break;
1067 }
1068 case X86ISD::VBROADCAST_LOAD: {
1069 MVT VT = N->getSimpleValueType(ResNo: 0);
1070 // Emulate v32i16/v64i8 broadcast without BWI.
1071 if (!Subtarget->hasBWI() && needBWI(VT)) {
1072 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1073 auto *MemNode = cast<MemSDNode>(Val: N);
1074 SDLoc dl(N);
1075 SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other);
1076 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1077 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1078 Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(),
1079 MMO: MemNode->getMemOperand());
1080 SDValue Res =
1081 CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
1082 N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1083 unsigned Index = NarrowVT.getVectorMinNumElements();
1084 Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
1085 N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));
1086
1087 --I;
1088 SDValue To[] = {Res, NarrowBCast.getValue(R: 1)};
1089 CurDAG->ReplaceAllUsesWith(From: N, To);
1090 ++I;
1091 MadeChange = true;
1092 continue;
1093 }
1094
1095 break;
1096 }
1097 case ISD::LOAD: {
      // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1099 // load, then just extract the lower subvector and avoid the second load.
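      // For example (sketch): if both (v4i32 (load p)) and (v16i32 (load p))
      // exist, rewrite the narrow one as
      //   (v4i32 (extract_subvector (v16i32 (load p)), 0))
      // so that only the wider load remains.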
1100 auto *Ld = cast<LoadSDNode>(Val: N);
1101 MVT VT = N->getSimpleValueType(ResNo: 0);
1102 if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() ||
1103 !(VT.is128BitVector() || VT.is256BitVector()))
1104 break;
1105
1106 MVT MaxVT = VT;
1107 SDNode *MaxLd = nullptr;
1108 SDValue Ptr = Ld->getBasePtr();
1109 SDValue Chain = Ld->getChain();
1110 for (SDNode *User : Ptr->users()) {
1111 auto *UserLd = dyn_cast<LoadSDNode>(Val: User);
1112 MVT UserVT = User->getSimpleValueType(ResNo: 0);
1113 if (User != N && UserLd && ISD::isNormalLoad(N: User) &&
1114 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1115 !User->hasAnyUseOfValue(Value: 1) &&
1116 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1117 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1118 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1119 MaxLd = User;
1120 MaxVT = UserVT;
1121 }
1122 }
1123 if (MaxLd) {
1124 SDLoc dl(N);
1125 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1126 MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts);
1127 SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
1128 N1: SDValue(MaxLd, 0),
1129 N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1130 SDValue Res = CurDAG->getBitcast(VT, V: Extract);
1131
1132 --I;
1133 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1134 CurDAG->ReplaceAllUsesWith(From: N, To);
1135 ++I;
1136 MadeChange = true;
1137 continue;
1138 }
1139 break;
1140 }
1141 case ISD::VSELECT: {
      // Replace VSELECT nodes with non-mask conditions with BLENDV/VPTERNLOG.
1143 EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType();
1144 if (EleVT == MVT::i1)
1145 break;
1146
1147 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1148 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1149 "We can't replace VSELECT with BLENDV in vXi16!");
1150 SDValue R;
1151 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) ==
1152 EleVT.getSizeInBits()) {
1153 R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1154 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2),
1155 N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8));
1156 } else {
1157 R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1158 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1),
1159 N3: N->getOperand(Num: 2));
1160 }
1161 --I;
1162 CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode());
1163 ++I;
1164 MadeChange = true;
1165 continue;
1166 }
1167 case ISD::FP_ROUND:
1168 case ISD::STRICT_FP_ROUND:
1169 case ISD::FP_TO_SINT:
1170 case ISD::FP_TO_UINT:
1171 case ISD::STRICT_FP_TO_SINT:
1172 case ISD::STRICT_FP_TO_UINT: {
1173 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1174 // don't need 2 sets of patterns.
1175 if (!N->getSimpleValueType(ResNo: 0).isVector())
1176 break;
1177
1178 unsigned NewOpc;
1179 switch (N->getOpcode()) {
1180 default: llvm_unreachable("Unexpected opcode!");
1181 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1182 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1183 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1184 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1185 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1186 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1187 }
1188 SDValue Res;
1189 if (N->isStrictFPOpcode())
1190 Res =
1191 CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
1192 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
1193 else
1194 Res =
1195 CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1196 Operand: N->getOperand(Num: 0));
1197 --I;
1198 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1199 ++I;
1200 MadeChange = true;
1201 continue;
1202 }
1203 case ISD::SHL:
1204 case ISD::SRA:
1205 case ISD::SRL: {
1206 // Replace vector shifts with their X86 specific equivalent so we don't
1207 // need 2 sets of patterns.
1208 if (!N->getValueType(ResNo: 0).isVector())
1209 break;
1210
1211 unsigned NewOpc;
1212 switch (N->getOpcode()) {
1213 default: llvm_unreachable("Unexpected opcode!");
1214 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1215 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1216 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1217 }
1218 SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1219 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
1220 --I;
1221 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1222 ++I;
1223 MadeChange = true;
1224 continue;
1225 }
1226 case ISD::ANY_EXTEND:
1227 case ISD::ANY_EXTEND_VECTOR_INREG: {
1228 // Replace vector any extend with the zero extend equivalents so we don't
1229 // need 2 sets of patterns. Ignore vXi1 extensions.
1230 if (!N->getValueType(ResNo: 0).isVector())
1231 break;
1232
1233 unsigned NewOpc;
1234 if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) {
1235 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1236 "Unexpected opcode for mask vector!");
1237 NewOpc = ISD::SIGN_EXTEND;
1238 } else {
1239 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1240 ? ISD::ZERO_EXTEND
1241 : ISD::ZERO_EXTEND_VECTOR_INREG;
1242 }
1243
1244 SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1245 Operand: N->getOperand(Num: 0));
1246 --I;
1247 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1248 ++I;
1249 MadeChange = true;
1250 continue;
1251 }
1252 case ISD::FCEIL:
1253 case ISD::STRICT_FCEIL:
1254 case ISD::FFLOOR:
1255 case ISD::STRICT_FFLOOR:
1256 case ISD::FTRUNC:
1257 case ISD::STRICT_FTRUNC:
1258 case ISD::FROUNDEVEN:
1259 case ISD::STRICT_FROUNDEVEN:
1260 case ISD::FNEARBYINT:
1261 case ISD::STRICT_FNEARBYINT:
1262 case ISD::FRINT:
1263 case ISD::STRICT_FRINT: {
1264 // Replace fp rounding with their X86 specific equivalent so we don't
1265 // need 2 sets of patterns.
1266 unsigned Imm;
1267 switch (N->getOpcode()) {
1268 default: llvm_unreachable("Unexpected opcode!");
1269 case ISD::STRICT_FCEIL:
1270 case ISD::FCEIL: Imm = 0xA; break;
1271 case ISD::STRICT_FFLOOR:
1272 case ISD::FFLOOR: Imm = 0x9; break;
1273 case ISD::STRICT_FTRUNC:
1274 case ISD::FTRUNC: Imm = 0xB; break;
1275 case ISD::STRICT_FROUNDEVEN:
1276 case ISD::FROUNDEVEN: Imm = 0x8; break;
1277 case ISD::STRICT_FNEARBYINT:
1278 case ISD::FNEARBYINT: Imm = 0xC; break;
1279 case ISD::STRICT_FRINT:
1280 case ISD::FRINT: Imm = 0x4; break;
1281 }
1282 SDLoc dl(N);
1283 bool IsStrict = N->isStrictFPOpcode();
1284 SDValue Res;
1285 if (IsStrict)
1286 Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl,
1287 ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
1288 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
1289 CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)});
1290 else
1291 Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0),
1292 N1: N->getOperand(Num: 0),
1293 N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32));
1294 --I;
1295 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1296 ++I;
1297 MadeChange = true;
1298 continue;
1299 }
1300 case X86ISD::FANDN:
1301 case X86ISD::FAND:
1302 case X86ISD::FOR:
1303 case X86ISD::FXOR: {
1304 // Widen scalar fp logic ops to vector to reduce isel patterns.
1305 // FIXME: Can we do this during lowering/combine.
1306 MVT VT = N->getSimpleValueType(ResNo: 0);
1307 if (VT.isVector() || VT == MVT::f128)
1308 break;
1309
1310 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1311 : VT == MVT::f32 ? MVT::v4f32
1312 : MVT::v8f16;
1313
1314 SDLoc dl(N);
1315 SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
1316 Operand: N->getOperand(Num: 0));
1317 SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
1318 Operand: N->getOperand(Num: 1));
1319
1320 SDValue Res;
1321 if (Subtarget->hasSSE2()) {
1322 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1323 Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0);
1324 Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1);
1325 unsigned Opc;
1326 switch (N->getOpcode()) {
1327 default: llvm_unreachable("Unexpected opcode!");
1328 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1329 case X86ISD::FAND: Opc = ISD::AND; break;
1330 case X86ISD::FOR: Opc = ISD::OR; break;
1331 case X86ISD::FXOR: Opc = ISD::XOR; break;
1332 }
1333 Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1);
1334 Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res);
1335 } else {
1336 Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1);
1337 }
1338 Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res,
1339 N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1340 --I;
1341 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1342 ++I;
1343 MadeChange = true;
1344 continue;
1345 }
1346 }
1347
1348 if (OptLevel != CodeGenOptLevel::None &&
1349 // Only do this when the target can fold the load into the call or
1350 // jmp.
1351 !Subtarget->useIndirectThunkCalls() &&
1352 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1353 (N->getOpcode() == X86ISD::TC_RETURN &&
1354 (Subtarget->is64Bit() ||
1355 !getTargetMachine().isPositionIndependent())))) {
1356 /// Also try moving call address load from outside callseq_start to just
1357 /// before the call to allow it to be folded.
1358 ///
1359 /// [Load chain]
1360 /// ^
1361 /// |
1362 /// [Load]
1363 /// ^ ^
1364 /// | |
1365 /// / \--
1366 /// / |
1367 ///[CALLSEQ_START] |
1368 /// ^ |
1369 /// | |
1370 /// [LOAD/C2Reg] |
1371 /// | |
1372 /// \ /
1373 /// \ /
1374 /// [CALL]
1375 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1376 SDValue Chain = N->getOperand(Num: 0);
1377 SDValue Load = N->getOperand(Num: 1);
1378 if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq))
1379 continue;
1380 moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain);
1381 ++NumLoadMoved;
1382 MadeChange = true;
1383 continue;
1384 }
1385
    // Lower fpround and fpextend nodes that target the FP stack to a store and
    // a load through the stack. This is a gross hack. We would like to simply
    // mark these as being illegal, but when we do that, legalize produces them
    // when it expands calls, and then expands them again in the same legalize
    // pass. We would like dag combine to be able to hack on these between the
    // call expansion and the node legalization. As such, this pass basically
    // does "really late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
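    // For example (sketch): an f32 -> f64 fp_extend that must go through the
    // FP stack is rewritten below into a store of the f32 value to a stack
    // temporary followed by an f32 -> f64 extending load.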
1394 switch (N->getOpcode()) {
1395 default: continue;
1396 case ISD::FP_ROUND:
1397 case ISD::FP_EXTEND:
1398 {
1399 MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType();
1400 MVT DstVT = N->getSimpleValueType(ResNo: 0);
1401
1402 // If any of the sources are vectors, no fp stack involved.
1403 if (SrcVT.isVector() || DstVT.isVector())
1404 continue;
1405
1406 // If the source and destination are SSE registers, then this is a legal
1407 // conversion that should not be lowered.
1408 const X86TargetLowering *X86Lowering =
1409 static_cast<const X86TargetLowering *>(TLI);
1410 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
1411 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
1412 if (SrcIsSSE && DstIsSSE)
1413 continue;
1414
1415 if (!SrcIsSSE && !DstIsSSE) {
1416 // If this is an FPStack extension, it is a noop.
1417 if (N->getOpcode() == ISD::FP_EXTEND)
1418 continue;
1419 // If this is a value-preserving FPStack truncation, it is a noop.
1420 if (N->getConstantOperandVal(Num: 1))
1421 continue;
1422 }
1423
1424 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1425 // FPStack has extload and truncstore. SSE can fold direct loads into other
1426 // operations. Based on this, decide what we want to do.
1427 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1428 SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
1429 int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
1430 MachinePointerInfo MPI =
1431 MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
1432 SDLoc dl(N);
1433
1434 // FIXME: optimize the case where the src/dest is a load or store?
1435
1436 SDValue Store = CurDAG->getTruncStore(
1437 Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT);
1438 SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store,
1439 Ptr: MemTmp, PtrInfo: MPI, MemVT);
1440
1441 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
1443 // anything below the conversion could be folded into other existing nodes.
1444 // To avoid invalidating 'I', back it up to the convert node.
1445 --I;
1446 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result);
1447 break;
1448 }
1449
    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
1452 case ISD::STRICT_FP_ROUND:
1453 case ISD::STRICT_FP_EXTEND:
1454 {
1455 MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType();
1456 MVT DstVT = N->getSimpleValueType(ResNo: 0);
1457
1458 // If any of the sources are vectors, no fp stack involved.
1459 if (SrcVT.isVector() || DstVT.isVector())
1460 continue;
1461
1462 // If the source and destination are SSE registers, then this is a legal
1463 // conversion that should not be lowered.
1464 const X86TargetLowering *X86Lowering =
1465 static_cast<const X86TargetLowering *>(TLI);
1466 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
1467 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
1468 if (SrcIsSSE && DstIsSSE)
1469 continue;
1470
1471 if (!SrcIsSSE && !DstIsSSE) {
1472 // If this is an FPStack extension, it is a noop.
1473 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1474 continue;
1475 // If this is a value-preserving FPStack truncation, it is a noop.
1476 if (N->getConstantOperandVal(Num: 2))
1477 continue;
1478 }
1479
1480 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1481 // FPStack has extload and truncstore. SSE can fold direct loads into other
1482 // operations. Based on this, decide what we want to do.
1483 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1484 SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
1485 int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
1486 MachinePointerInfo MPI =
1487 MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
1488 SDLoc dl(N);
1489
1490 // FIXME: optimize the case where the src/dest is a load or store?
1491
      // Since the operation is StrictFP, use the preexisting chain.
1493 SDValue Store, Result;
1494 if (!SrcIsSSE) {
1495 SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
1496 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp};
1497 Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT,
1498 PtrInfo: MPI, /*Align*/ Alignment: std::nullopt,
1499 Flags: MachineMemOperand::MOStore);
1500 if (N->getFlags().hasNoFPExcept()) {
1501 SDNodeFlags Flags = Store->getFlags();
1502 Flags.setNoFPExcept(true);
1503 Store->setFlags(Flags);
1504 }
1505 } else {
1506 assert(SrcVT == MemVT && "Unexpected VT!");
1507 Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp,
1508 PtrInfo: MPI);
1509 }
1510
1511 if (!DstIsSSE) {
1512 SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other);
1513 SDValue Ops[] = {Store, MemTmp};
1514 Result = CurDAG->getMemIntrinsicNode(
1515 Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI,
1516 /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad);
1517 if (N->getFlags().hasNoFPExcept()) {
1518 SDNodeFlags Flags = Result->getFlags();
1519 Flags.setNoFPExcept(true);
1520 Result->setFlags(Flags);
1521 }
1522 } else {
1523 assert(DstVT == MemVT && "Unexpected VT!");
1524 Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI);
1525 }
1526
1527 // We're about to replace all uses of the STRICT_FP_ROUND/STRICT_FP_EXTEND
1528 // with the load we created. This will cause general havoc on the DAG because
1529 // anything below the conversion could be folded into other existing nodes.
1530 // To avoid invalidating 'I', back it up to the convert node.
1531 --I;
1532 CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode());
1533 break;
1534 }
1535 }
1536
1537
1538 // Now that we did that, the node is dead. Increment the iterator to the
1539 // next node to process, then delete N.
1540 ++I;
1541 MadeChange = true;
1542 }
1543
1544 // Remove any dead nodes that may have been left behind.
1545 if (MadeChange)
1546 CurDAG->RemoveDeadNodes();
1547}
1548
1549// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
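// Illustrative shape of the pattern (node names here are assumed, not taken
// from an actual DAG dump):
//   t1: i32 = MOVZX32rr8_NOREX tAH     ; extend of AH emitted for the divrem
//   t2: i8  = EXTRACT_SUBREG t1, sub_8bit
//   t3: i32 = MOVZX32rr8 t2            ; redundant second extend
// All uses of t3 can simply be rewired to t1. For MOVSX64rr8 the outer node
// is instead replaced with a MOVSX64rr32 of t1 to widen from 32 to 64 bits.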
1550bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1551 unsigned Opc = N->getMachineOpcode();
1552 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1553 Opc != X86::MOVSX64rr8)
1554 return false;
1555
1556 SDValue N0 = N->getOperand(Num: 0);
1557
1558 // We need to be extracting the low byte of an extend.
1559 if (!N0.isMachineOpcode() ||
1560 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1561 N0.getConstantOperandVal(i: 1) != X86::sub_8bit)
1562 return false;
1563
1564 // We're looking for either a movsx or movzx to match the original opcode.
1565 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1566 : X86::MOVSX32rr8_NOREX;
1567 SDValue N00 = N0.getOperand(i: 0);
1568 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1569 return false;
1570
1571 if (Opc == X86::MOVSX64rr8) {
1572 // We had a sign extend from 8 to 64 bits, but the inner extend only goes
1573 // from 8 to 32, so we still need to extend from 32 to 64.
1574 MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N),
1575 VT: MVT::i64, Op1: N00);
1576 ReplaceUses(F: N, T: Extend);
1577 } else {
1578 // Ok we can drop this extend and just use the original extend.
1579 ReplaceUses(F: N, T: N00.getNode());
1580 }
1581
1582 return true;
1583}
1584
1585void X86DAGToDAGISel::PostprocessISelDAG() {
1586 // Skip peepholes at -O0.
1587 if (TM.getOptLevel() == CodeGenOptLevel::None)
1588 return;
1589
1590 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1591
1592 bool MadeChange = false;
1593 while (Position != CurDAG->allnodes_begin()) {
1594 SDNode *N = &*--Position;
1595 // Skip dead nodes and any non-machine opcodes.
1596 if (N->use_empty() || !N->isMachineOpcode())
1597 continue;
1598
1599 if (tryOptimizeRem8Extend(N)) {
1600 MadeChange = true;
1601 continue;
1602 }
1603
1604 unsigned Opc = N->getMachineOpcode();
1605 switch (Opc) {
1606 default:
1607 continue;
1608 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
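// For instance (an illustrative sketch, not a real DAG dump): if
// (AND32rr %a, %b) is only consumed by (TEST32rr t, t) comparing the AND
// result with itself, the pair collapses to (TEST32rr %a, %b); if the AND
// instead loads one operand, e.g. AND32rm, the pair becomes a TEST32mr with
// the memory operands placed first and the register operand after them.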
1609 case X86::TEST8rr:
1610 case X86::TEST16rr:
1611 case X86::TEST32rr:
1612 case X86::TEST64rr:
1613 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1614 case X86::CTEST8rr:
1615 case X86::CTEST16rr:
1616 case X86::CTEST32rr:
1617 case X86::CTEST64rr: {
1618 auto &Op0 = N->getOperand(Num: 0);
1619 if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) ||
1620 !Op0.isMachineOpcode())
1621 continue;
1622 SDValue And = N->getOperand(Num: 0);
1623#define CASE_ND(OP) \
1624 case X86::OP: \
1625 case X86::OP##_ND:
1626 switch (And.getMachineOpcode()) {
1627 default:
1628 continue;
1629 CASE_ND(AND8rr)
1630 CASE_ND(AND16rr)
1631 CASE_ND(AND32rr)
1632 CASE_ND(AND64rr) {
1633 if (And->hasAnyUseOfValue(Value: 1))
1634 continue;
1635 SmallVector<SDValue> Ops(N->op_values());
1636 Ops[0] = And.getOperand(i: 0);
1637 Ops[1] = And.getOperand(i: 1);
1638 MachineSDNode *Test =
1639 CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops);
1640 ReplaceUses(F: N, T: Test);
1641 MadeChange = true;
1642 continue;
1643 }
1644 CASE_ND(AND8rm)
1645 CASE_ND(AND16rm)
1646 CASE_ND(AND32rm)
1647 CASE_ND(AND64rm) {
1648 if (And->hasAnyUseOfValue(Value: 1))
1649 continue;
1650 unsigned NewOpc;
1651 bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc);
1652#define FROM_TO(A, B) \
1653 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1654 break;
1655 switch (And.getMachineOpcode()) {
1656 FROM_TO(AND8rm, TEST8mr);
1657 FROM_TO(AND16rm, TEST16mr);
1658 FROM_TO(AND32rm, TEST32mr);
1659 FROM_TO(AND64rm, TEST64mr);
1660 }
1661#undef FROM_TO
1662#undef CASE_ND
1663 // Need to swap the memory and register operands.
1664 SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2),
1665 And.getOperand(i: 3), And.getOperand(i: 4),
1666 And.getOperand(i: 5), And.getOperand(i: 0)};
1667 // CC, Cflags.
1668 if (IsCTESTCC) {
1669 Ops.push_back(Elt: N->getOperand(Num: 2));
1670 Ops.push_back(Elt: N->getOperand(Num: 3));
1671 }
1672 // Chain of memory load
1673 Ops.push_back(Elt: And.getOperand(i: 6));
1674 // Glue
1675 if (IsCTESTCC)
1676 Ops.push_back(Elt: N->getOperand(Num: 4));
1677
1678 MachineSDNode *Test = CurDAG->getMachineNode(
1679 Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops);
1680 CurDAG->setNodeMemRefs(
1681 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
1682 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1683 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1684 MadeChange = true;
1685 continue;
1686 }
1687 }
1688 }
1689 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1690 // used. We're doing this late so we can prefer to fold the AND into masked
1691 // comparisons. Doing that can be better for the live range of the mask
1692 // register.
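// Sketch (register names assumed): k1 = (KANDWkk k2, k3) feeding
// (KORTESTWkk k1, k1) with only ZF consumed becomes (KTESTWkk k2, k3);
// the 16-bit form additionally requires AVX512DQ, as checked below.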
1693 case X86::KORTESTBkk:
1694 case X86::KORTESTWkk:
1695 case X86::KORTESTDkk:
1696 case X86::KORTESTQkk: {
1697 SDValue Op0 = N->getOperand(Num: 0);
1698 if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) ||
1699 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0)))
1700 continue;
1701#define CASE(A) \
1702 case X86::A: \
1703 break;
1704 switch (Op0.getMachineOpcode()) {
1705 default:
1706 continue;
1707 CASE(KANDBkk)
1708 CASE(KANDWkk)
1709 CASE(KANDDkk)
1710 CASE(KANDQkk)
1711 }
1712 unsigned NewOpc;
1713#define FROM_TO(A, B) \
1714 case X86::A: \
1715 NewOpc = X86::B; \
1716 break;
1717 switch (Opc) {
1718 FROM_TO(KORTESTBkk, KTESTBkk)
1719 FROM_TO(KORTESTWkk, KTESTWkk)
1720 FROM_TO(KORTESTDkk, KTESTDkk)
1721 FROM_TO(KORTESTQkk, KTESTQkk)
1722 }
1723 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1724 // KAND instructions and KTEST use the same ISA feature.
1725 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1726 continue;
1727#undef FROM_TO
1728 MachineSDNode *KTest = CurDAG->getMachineNode(
1729 Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1));
1730 ReplaceUses(F: N, T: KTest);
1731 MadeChange = true;
1732 continue;
1733 }
1734 // Attempt to remove vector moves that were inserted to zero the upper bits.
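// Sketch (illustrative): (SUBREG_TO_REG 0, (VMOVAPSrr %x), sub_xmm) where %x
// comes from a VEX/EVEX/XOP-encoded instruction (which already zeroed the
// upper bits) is rewritten to reference %x directly, dropping the move.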
1735 case TargetOpcode::SUBREG_TO_REG: {
1736 unsigned SubRegIdx = N->getConstantOperandVal(Num: 2);
1737 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1738 continue;
1739
1740 SDValue Move = N->getOperand(Num: 1);
1741 if (!Move.isMachineOpcode())
1742 continue;
1743
1744 // Make sure it's one of the move opcodes we recognize.
1745 switch (Move.getMachineOpcode()) {
1746 default:
1747 continue;
1748 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1749 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1750 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1751 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1752 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1753 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1754 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1755 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1756 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1757 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1758 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1759 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1760 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1761 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1762 }
1763#undef CASE
1764
1765 SDValue In = Move.getOperand(i: 0);
1766 if (!In.isMachineOpcode() ||
1767 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1768 continue;
1769
1770 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This filters
1771 // out the SHA instructions, which use a legacy encoding.
1772 uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags;
1773 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1774 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1775 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1776 continue;
1777
1778 // The producing instruction is another vector instruction, so we can drop
1779 // the move.
1780 CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2));
1781 MadeChange = true;
1782 }
1783 }
1784 }
1785
1786 if (MadeChange)
1787 CurDAG->RemoveDeadNodes();
1788}
1789
1790
1791/// Emit any code that needs to be executed only in the main function.
1792void X86DAGToDAGISel::emitSpecialCodeForMain() {
1793 if (Subtarget->isTargetCygMing()) {
1794 TargetLowering::ArgListTy Args;
1795 auto &DL = CurDAG->getDataLayout();
1796
1797 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1798 CLI.setChain(CurDAG->getRoot())
1799 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1800 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1801 ArgsList: std::move(Args));
1802 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1803 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1804 CurDAG->setRoot(Result.second);
1805 }
1806}
1807
1808void X86DAGToDAGISel::emitFunctionEntryCode() {
1809 // If this is main, emit special code for main.
1810 const Function &F = MF->getFunction();
1811 if (F.hasExternalLinkage() && F.getName() == "main")
1812 emitSpecialCodeForMain();
1813}
1814
1815static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1816 // We can run into an issue where a frame index or a register base
1817 // includes a displacement that, when added to the explicit displacement,
1818 // will overflow the displacement field. Assuming that the
1819 // displacement fits into a 31-bit integer (which is only slightly more
1820 // aggressive than the current fundamental assumption that it fits into
1821 // a 32-bit integer), a 31-bit disp should always be safe.
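 // For example (illustrative values): a combined displacement of 0x3fffffff
 // (the largest value isInt<31> accepts) is considered safe, while 0x40000000
 // is rejected to leave headroom for offsets added later by frame lowering.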
1822 return isInt<31>(x: Val);
1823}
1824
1825bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1826 X86ISelAddressMode &AM) {
1827 // We may have already matched a displacement and the caller just added the
1828 // symbolic displacement. So we still need to do the checks even if Offset
1829 // is zero.
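 // Illustrative failure case (values assumed): in 32-bit mode with a base or
 // index register already matched, an existing Disp of 0x60000000 plus an
 // Offset of 0x20000000 gives 0x80000000, which fails
 // isDispSafeForFrameIndexOrRegBase, so we return true and leave AM untouched.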
1830
1831 int64_t Val = AM.Disp + Offset;
1832
1833 // Cannot combine ExternalSymbol displacements with integer offsets.
1834 if (Val != 0 && (AM.ES || AM.MCSym))
1835 return true;
1836
1837 CodeModel::Model M = TM.getCodeModel();
1838 if (Subtarget->is64Bit()) {
1839 if (Val != 0 &&
1840 !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
1841 hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
1842 return true;
1843 // In addition to the checks required for a register base, check that
1844 // we do not try to use an unsafe Disp with a frame index.
1845 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1846 !isDispSafeForFrameIndexOrRegBase(Val))
1847 return true;
1848 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1849 // 64 bits. Instructions with 32-bit register addresses perform this zero
1850 // extension for us and we can safely ignore the high bits of Offset.
1851 // Instructions with only a 32-bit immediate address do not, though: they
1852 // sign extend instead. This means only the low 2GB of the address space is
1853 // directly addressable; we need indirect addressing for the high 2GB of
1854 // address space.
1855 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1856 // implicit zero extension of instructions would cover up any problem.
1857 // However, we have asserts elsewhere that get triggered if we do, so keep
1858 // the checks for now.
1859 // TODO: We would actually be able to accept these, as well as the same
1860 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1861 // to get an address size override to be emitted. However, this
1862 // pseudo-register is not part of any register class and therefore causes
1863 // MIR verification to fail.
1864 if (Subtarget->isTarget64BitILP32() &&
1865 !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) &&
1866 !AM.hasBaseOrIndexReg())
1867 return true;
1868 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1869 // For 32-bit X86, make sure the displacement still isn't close to the
1870 // expressible limit.
1871 return true;
1872 AM.Disp = Val;
1873 return false;
1874}
1875
1876bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1877 bool AllowSegmentRegForX32) {
1878 SDValue Address = N->getOperand(Num: 1);
1879
1880 // load gs:0 -> GS segment register.
1881 // load fs:0 -> FS segment register.
1882 //
1883 // This optimization is generally valid because the GNU TLS model defines that
1884 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1885 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1886 // zero-extended to 64 bits and then added to the base address, which gives
1887 // unwanted results when the register holds a negative value.
1888 // For more information see http://people.redhat.com/drepper/tls.pdf
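 // Illustrative IR-level shape (assumed, simplified): a load such as
 //   %self = load i64, ptr addrspace(256) null
 // (address space 256 being GS) matches with AM.Segment = GS and Disp = 0.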
1889 if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
1890 !IndirectTlsSegRefs &&
1891 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1892 Subtarget->isTargetFuchsia())) {
1893 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1894 return true;
1895 switch (N->getPointerInfo().getAddrSpace()) {
1896 case X86AS::GS:
1897 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
1898 return false;
1899 case X86AS::FS:
1900 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
1901 return false;
1902 // Address space X86AS::SS is not handled here, because it is not used to
1903 // address TLS areas.
1904 }
1905 }
1906
1907 return true;
1908}
1909
1910/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1911/// mode. These wrap things that will resolve down into a symbol reference.
1912/// If no match is possible, this returns true, otherwise it returns false.
1913bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1914 // If the addressing mode already has a symbol as the displacement, we can
1915 // never match another symbol.
1916 if (AM.hasSymbolicDisplacement())
1917 return true;
1918
1919 bool IsRIPRelTLS = false;
1920 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1921 if (IsRIPRel) {
1922 SDValue Val = N.getOperand(i: 0);
1923 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1924 IsRIPRelTLS = true;
1925 }
1926
1927 // We can't use an addressing mode in the 64-bit large code model.
1928 // Global TLS addressing is an exception. In the medium code model,
1929 // we can use such a mode when RIP wrappers are present.
1930 // That signifies access to globals that are known to be "near",
1931 // such as the GOT itself.
1932 CodeModel::Model M = TM.getCodeModel();
1933 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1934 return true;
1935
1936 // Base and index reg must be 0 in order to use %rip as base.
1937 if (IsRIPRel && AM.hasBaseOrIndexReg())
1938 return true;
1939
1940 // Make a local copy in case we can't do this fold.
1941 X86ISelAddressMode Backup = AM;
1942
1943 int64_t Offset = 0;
1944 SDValue N0 = N.getOperand(i: 0);
1945 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
1946 AM.GV = G->getGlobal();
1947 AM.SymbolFlags = G->getTargetFlags();
1948 Offset = G->getOffset();
1949 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
1950 AM.CP = CP->getConstVal();
1951 AM.Alignment = CP->getAlign();
1952 AM.SymbolFlags = CP->getTargetFlags();
1953 Offset = CP->getOffset();
1954 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
1955 AM.ES = S->getSymbol();
1956 AM.SymbolFlags = S->getTargetFlags();
1957 } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
1958 AM.MCSym = S->getMCSymbol();
1959 } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
1960 AM.JT = J->getIndex();
1961 AM.SymbolFlags = J->getTargetFlags();
1962 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
1963 AM.BlockAddr = BA->getBlockAddress();
1964 AM.SymbolFlags = BA->getTargetFlags();
1965 Offset = BA->getOffset();
1966 } else
1967 llvm_unreachable("Unhandled symbol reference node.");
1968
1969 // Can't use an addressing mode with large globals.
1970 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1971 TM.isLargeGlobalValue(GV: AM.GV)) {
1972 AM = Backup;
1973 return true;
1974 }
1975
1976 if (foldOffsetIntoAddress(Offset, AM)) {
1977 AM = Backup;
1978 return true;
1979 }
1980
1981 if (IsRIPRel)
1982 AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64));
1983
1984 // Commit the changes now that we know this fold is safe.
1985 return false;
1986}
1987
1988/// Add the specified node to the specified addressing mode, returning true if
1989/// it cannot be done. This just pattern matches for the addressing mode.
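// For example (an illustrative match, operand names assumed):
//   (add (add (shl %x, 2), %y), 400)
// can be matched as Base = %y, Index = %x, Scale = 4, Disp = 400, i.e. the
// operand form of 400(%y,%x,4).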
1990bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1991 if (matchAddressRecursively(N, AM, Depth: 0))
1992 return true;
1993
1994 // Post-processing: Make a second attempt to fold a load, if we now know
1995 // that there will not be any other register. This is only performed for
1996 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1997 // any foldable load the first time.
1998 if (Subtarget->isTarget64BitILP32() &&
1999 AM.BaseType == X86ISelAddressMode::RegBase &&
2000 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
2001 SDValue Save_Base_Reg = AM.Base_Reg;
2002 if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
2003 AM.Base_Reg = SDValue();
2004 if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
2005 AM.Base_Reg = Save_Base_Reg;
2006 }
2007 }
2008
2009 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
2010 // a smaller encoding and avoids a scaled-index.
2011 if (AM.Scale == 2 &&
2012 AM.BaseType == X86ISelAddressMode::RegBase &&
2013 AM.Base_Reg.getNode() == nullptr) {
2014 AM.Base_Reg = AM.IndexReg;
2015 AM.Scale = 1;
2016 }
2017
2018 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2019 // because it has a smaller encoding.
2020 if (TM.getCodeModel() != CodeModel::Large &&
2021 (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
2022 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2023 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2024 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2025 // However, when GV is a local function symbol and in the same section as
2026 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2027 // referencing GV+Disp generates a relocation referencing the section symbol
2028 // with an even smaller offset, which might underflow. We should bail out if
2029 // the negative offset is too close to INT32_MIN. Actually, we are more
2030 // conservative here, using a smaller magic number also used by
2031 // isOffsetSuitableForCodeModel.
2032 if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024)
2033 return true;
2034
2035 AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64);
2036 }
2037
2038 return false;
2039}
2040
2041bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2042 unsigned Depth) {
2043 // Add an artificial use to this node so that we can keep track of
2044 // it if it gets CSE'd with a different node.
2045 HandleSDNode Handle(N);
2046
2047 X86ISelAddressMode Backup = AM;
2048 if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
2049 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
2050 return false;
2051 AM = Backup;
2052
2053 // Try again after commuting the operands.
2054 if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2055 Depth: Depth + 1) &&
2056 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
2057 return false;
2058 AM = Backup;
2059
2060 // If we couldn't fold both operands into the address at the same time,
2061 // see if we can just put each operand into a register and fold at least
2062 // the add.
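 // e.g. (illustrative) a plain (add %a, %b) where neither operand matched any
 // deeper still becomes Base_Reg = %a, IndexReg = %b, Scale = 1, so at least
 // the add itself is folded into the address.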
2063 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2064 !AM.Base_Reg.getNode() &&
2065 !AM.IndexReg.getNode()) {
2066 N = Handle.getValue();
2067 AM.Base_Reg = N.getOperand(i: 0);
2068 AM.IndexReg = N.getOperand(i: 1);
2069 AM.Scale = 1;
2070 return false;
2071 }
2072 N = Handle.getValue();
2073 return true;
2074}
2075
2076// Insert a node into the DAG at least before the Pos node's position. This
2077// will reposition the node as needed, and will assign it a node ID that is <=
2078// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2079// IDs! The selection DAG must no longer depend on their uniqueness when this
2080// is used.
2081static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2082 if (N->getNodeId() == -1 ||
2083 (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
2084 SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
2085 DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
2086 // Mark N as invalid for pruning, as after this it may be a successor to a
2087 // selected node but otherwise be in the same position as Pos.
2088 // Conservatively mark it with the same -abs(Id) to ensure the node id
2089 // invariant is preserved.
2090 N->setNodeId(Pos->getNodeId());
2091 SelectionDAGISel::InvalidateNodeId(N: N.getNode());
2092 }
2093}
2094
2095// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2096// safe. This allows us to convert the shift and and into an h-register
2097// extract and a scaled index. Returns false if the simplification is
2098// performed.
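// Illustrative instance (constants assumed): with C1 = 3 the pattern
//   (and (srl X, 5), 0x7f8)
// is rewritten so that ((X >> 8) & 0xff) becomes the index register and the
// << 3 is absorbed into Scale = 8.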
2099static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2100 uint64_t Mask,
2101 SDValue Shift, SDValue X,
2102 X86ISelAddressMode &AM) {
2103 if (Shift.getOpcode() != ISD::SRL ||
2104 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2105 !Shift.hasOneUse())
2106 return true;
2107
2108 int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
2109 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2110 Mask != (0xffu << ScaleLog))
2111 return true;
2112
2113 MVT XVT = X.getSimpleValueType();
2114 MVT VT = N.getSimpleValueType();
2115 SDLoc DL(N);
2116 SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8);
2117 SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
2118 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
2119 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
2120 SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
2121 SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8);
2122 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);
2123
2124 // Insert the new nodes into the topological ordering. We must do this in
2125 // a valid topological ordering as nothing is going to go back and re-sort
2126 // these nodes. We continually insert before 'N' in sequence as this is
2127 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2128 // hierarchy left to express.
2129 insertDAGNode(DAG, Pos: N, N: Eight);
2130 insertDAGNode(DAG, Pos: N, N: NewMask);
2131 insertDAGNode(DAG, Pos: N, N: Srl);
2132 insertDAGNode(DAG, Pos: N, N: And);
2133 insertDAGNode(DAG, Pos: N, N: Ext);
2134 insertDAGNode(DAG, Pos: N, N: ShlCount);
2135 insertDAGNode(DAG, Pos: N, N: Shl);
2136 DAG.ReplaceAllUsesWith(From: N, To: Shl);
2137 DAG.RemoveDeadNode(N: N.getNode());
2138 AM.IndexReg = Ext;
2139 AM.Scale = (1 << ScaleLog);
2140 return false;
2141}
2142
2143// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2144// allows us to fold the shift into this addressing mode. Returns false if the
2145// transform succeeded.
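// Illustrative instance (constants assumed):
//   (and (shl X, 2), 0x3fc)  -->  (shl (and X, 0xff), 2)
// so (and X, 0xff) becomes the index register and the shift left by 2 is
// folded into Scale = 4.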
2146static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2147 X86ISelAddressMode &AM) {
2148 SDValue Shift = N.getOperand(i: 0);
2149
2150 // Use a signed mask so that shifting right will insert sign bits. These
2151 // bits will be removed when we shift the result left so it doesn't matter
2152 // what we use. This might allow a smaller immediate encoding.
2153 int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();
2154
2155 // If we have an any_extend feeding the AND, look through it to see if there
2156 // is a shift behind it. But only if the AND doesn't use the extended bits.
2157 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2158 bool FoundAnyExtend = false;
2159 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2160 Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
2161 isUInt<32>(x: Mask)) {
2162 FoundAnyExtend = true;
2163 Shift = Shift.getOperand(i: 0);
2164 }
2165
2166 if (Shift.getOpcode() != ISD::SHL ||
2167 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2168 return true;
2169
2170 SDValue X = Shift.getOperand(i: 0);
2171
2172 // Not likely to be profitable if either the AND or SHIFT node has more
2173 // than one use (unless all uses are for address computation). Besides, the
2174 // isel mechanism requires their node IDs to be reused.
2175 if (!N.hasOneUse() || !Shift.hasOneUse())
2176 return true;
2177
2178 // Verify that the shift amount is something we can fold.
2179 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2180 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2181 return true;
2182
2183 MVT VT = N.getSimpleValueType();
2184 SDLoc DL(N);
2185 if (FoundAnyExtend) {
2186 SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
2187 insertDAGNode(DAG, Pos: N, N: NewX);
2188 X = NewX;
2189 }
2190
2191 SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT);
2192 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
2193 SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));
2194
2195 // Insert the new nodes into the topological ordering. We must do this in
2196 // a valid topological ordering as nothing is going to go back and re-sort
2197 // these nodes. We continually insert before 'N' in sequence as this is
2198 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2199 // hierarchy left to express.
2200 insertDAGNode(DAG, Pos: N, N: NewMask);
2201 insertDAGNode(DAG, Pos: N, N: NewAnd);
2202 insertDAGNode(DAG, Pos: N, N: NewShift);
2203 DAG.ReplaceAllUsesWith(From: N, To: NewShift);
2204 DAG.RemoveDeadNode(N: N.getNode());
2205
2206 AM.Scale = 1 << ShiftAmt;
2207 AM.IndexReg = NewAnd;
2208 return false;
2209}
2210
2211// Implement some heroics to detect shifts of masked values where the mask can
2212// be replaced by extending the shift and undoing that in the addressing mode
2213// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2214// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2215// the addressing mode. This results in code such as:
2216//
2217// int f(short *y, int *lookup_table) {
2218// ...
2219// return *y + lookup_table[*y >> 11];
2220// }
2221//
2222// Turning into:
2223// movzwl (%rdi), %eax
2224// movl %eax, %ecx
2225// shrl $11, %ecx
2226// addl (%rsi,%rcx,4), %eax
2227//
2228// Instead of:
2229// movzwl (%rdi), %eax
2230// movl %eax, %ecx
2231// shrl $9, %ecx
2232 // andl $124, %ecx
2233// addl (%rsi,%rcx), %eax
2234//
2235// Note that this function assumes the mask is provided as a mask *after* the
2236// value is shifted. The input chain may or may not match that, but computing
2237// such a mask is trivial.
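// A concrete sketch of the lookup_table example above (values assumed): the
// canonicalized (and (srl x, 9), 124) has a shifted mask with MaskIdx = 2, so
// it is rewritten as x >> 11 for the index register while the << 2 is
// represented by Scale = 4, provided the masked-out high bits of x are known
// to be zero.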
2238static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2239 uint64_t Mask,
2240 SDValue Shift, SDValue X,
2241 X86ISelAddressMode &AM) {
2242 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2243 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2244 return true;
2245
2246 // We need to ensure that mask is a continuous run of bits.
2247 unsigned MaskIdx, MaskLen;
2248 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2249 return true;
2250 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2251
2252 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2253
2254 // The amount of shift we're trying to fit into the addressing mode is taken
2255 // from the shifted mask index (number of trailing zeros of the mask).
2256 unsigned AMShiftAmt = MaskIdx;
2257
2258 // There is nothing we can do here unless the mask is removing some bits.
2259 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2260 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2261
2262 // Scale the leading zero count down based on the actual size of the value.
2263 // Also scale it down based on the size of the shift.
2264 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2265 if (MaskLZ < ScaleDown)
2266 return true;
2267 MaskLZ -= ScaleDown;
2268
2269 // The final check is to ensure that any masked out high bits of X are
2270 // already known to be zero. Otherwise, the mask has a semantic impact
2271 // other than masking out a couple of low bits. Unfortunately, because of
2272 // the mask, zero extensions will be removed from operands in some cases.
2273 // This code works extra hard to look through extensions because we can
2274 // replace them with zero extensions cheaply if necessary.
2275 bool ReplacingAnyExtend = false;
2276 if (X.getOpcode() == ISD::ANY_EXTEND) {
2277 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2278 X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
2279 // Assume that we'll replace the any-extend with a zero-extend, and
2280 // narrow the search to the extended value.
2281 X = X.getOperand(i: 0);
2282 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2283 ReplacingAnyExtend = true;
2284 }
2285 APInt MaskedHighBits =
2286 APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
2287 if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
2288 return true;
2289
2290 // We've identified a pattern that can be transformed into a single shift
2291 // and an addressing mode. Make it so.
2292 MVT VT = N.getSimpleValueType();
2293 if (ReplacingAnyExtend) {
2294 assert(X.getValueType() != VT);
2295 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2296 SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
2297 insertDAGNode(DAG, Pos: N, N: NewX);
2298 X = NewX;
2299 }
2300
2301 MVT XVT = X.getSimpleValueType();
2302 SDLoc DL(N);
2303 SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
2304 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2305 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
2306 SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
2307 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2308
2309 // Insert the new nodes into the topological ordering. We must do this in
2310 // a valid topological ordering as nothing is going to go back and re-sort
2311 // these nodes. We continually insert before 'N' in sequence as this is
2312 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2313 // hierarchy left to express.
2314 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2315 insertDAGNode(DAG, Pos: N, N: NewSRL);
2316 insertDAGNode(DAG, Pos: N, N: NewExt);
2317 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2318 insertDAGNode(DAG, Pos: N, N: NewSHL);
2319 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2320 DAG.RemoveDeadNode(N: N.getNode());
2321
2322 AM.Scale = 1 << AMShiftAmt;
2323 AM.IndexReg = NewExt;
2324 return false;
2325}
2326
2327// Transform "(X >> SHIFT) & (MASK << C1)" to
2328// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2329// matched to a BEXTR later. Returns false if the simplification is performed.
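// Illustrative instance (constants assumed): with SHIFT = 4, MASK = 0xff and
// C1 = 2, (and (srl X, 4), 0x3fc) becomes (shl (and (srl X, 6), 0xff), 2);
// the srl+and half is later matched to BEXTR and the shl becomes Scale = 4.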
2330static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2331 uint64_t Mask,
2332 SDValue Shift, SDValue X,
2333 X86ISelAddressMode &AM,
2334 const X86Subtarget &Subtarget) {
2335 if (Shift.getOpcode() != ISD::SRL ||
2336 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2337 !Shift.hasOneUse() || !N.hasOneUse())
2338 return true;
2339
2340 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2341 if (!Subtarget.hasTBM() &&
2342 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2343 return true;
2344
2345 // We need to ensure that mask is a continuous run of bits.
2346 unsigned MaskIdx, MaskLen;
2347 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2348 return true;
2349
2350 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2351
2352 // The amount of shift we're trying to fit into the addressing mode is taken
2353 // from the shifted mask index (number of trailing zeros of the mask).
2354 unsigned AMShiftAmt = MaskIdx;
2355
2356 // There is nothing we can do here unless the mask is removing some bits.
2357 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2358 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2359
2360 MVT XVT = X.getSimpleValueType();
2361 MVT VT = N.getSimpleValueType();
2362 SDLoc DL(N);
2363 SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
2364 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2365 SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
2366 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
2367 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
2368 SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
2369 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2370
2371 // Insert the new nodes into the topological ordering. We must do this in
2372 // a valid topological ordering as nothing is going to go back and re-sort
2373 // these nodes. We continually insert before 'N' in sequence as this is
2374 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2375 // hierarchy left to express.
2376 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2377 insertDAGNode(DAG, Pos: N, N: NewSRL);
2378 insertDAGNode(DAG, Pos: N, N: NewMask);
2379 insertDAGNode(DAG, Pos: N, N: NewAnd);
2380 insertDAGNode(DAG, Pos: N, N: NewExt);
2381 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2382 insertDAGNode(DAG, Pos: N, N: NewSHL);
2383 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2384 DAG.RemoveDeadNode(N: N.getNode());
2385
2386 AM.Scale = 1 << AMShiftAmt;
2387 AM.IndexReg = NewExt;
2388 return false;
2389}
2390
2391// Attempt to peek further into a scaled index register, collecting additional
2392 // extensions / offsets / etc. Returns \p N if we can't peek any further.
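// For example (illustrative): with Scale = 4 already matched, an index of
// (add %x, 3) is peeled to index %x with the displacement increased by
// 3 * Scale = 12.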
2393SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2394 X86ISelAddressMode &AM,
2395 unsigned Depth) {
2396 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2397 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2398 "Illegal index scale");
2399
2400 // Limit recursion.
2401 if (Depth >= SelectionDAG::MaxRecursionDepth)
2402 return N;
2403
2404 EVT VT = N.getValueType();
2405 unsigned Opc = N.getOpcode();
2406
2407 // index: add(x,c) -> index: x, disp + c
2408 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2409 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2410 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2411 if (!foldOffsetIntoAddress(Offset, AM))
2412 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2413 }
2414
2415 // index: add(x,x) -> index: x, scale * 2
2416 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2417 if (AM.Scale <= 4) {
2418 AM.Scale *= 2;
2419 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2420 }
2421 }
2422
2423 // index: shl(x,i) -> index: x, scale * (1 << i)
2424 if (Opc == X86ISD::VSHLI) {
2425 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2426 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2427 if ((AM.Scale * ScaleAmt) <= 8) {
2428 AM.Scale *= ScaleAmt;
2429 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2430 }
2431 }
2432
2433 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2434 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2435 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2436 SDValue Src = N.getOperand(i: 0);
2437 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2438 Src.hasOneUse()) {
2439 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2440 SDValue AddSrc = Src.getOperand(i: 0);
2441 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2442 int64_t Offset = AddVal->getSExtValue();
2443 if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) {
2444 SDLoc DL(N);
2445 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2446 SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT);
2447 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2448 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2449 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2450 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2451 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2452 CurDAG->RemoveDeadNode(N: N.getNode());
2453 return ExtSrc;
2454 }
2455 }
2456 }
2457 }
2458
2459 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2460 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2461 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2462 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2463 SDValue Src = N.getOperand(i: 0);
2464 unsigned SrcOpc = Src.getOpcode();
2465 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2466 CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) &&
2467 Src.hasOneUse()) {
2468 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2469 SDValue AddSrc = Src.getOperand(i: 0);
2470 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2471 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2472 SDLoc DL(N);
2473 SDValue Res;
2474 // If we're also scaling, see if we can use that as well.
2475 if (AddSrc.getOpcode() == ISD::SHL &&
2476 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2477 SDValue ShVal = AddSrc.getOperand(i: 0);
2478 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2479 APInt HiBits =
2480 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2481 uint64_t ScaleAmt = 1ULL << ShAmt;
2482 if ((AM.Scale * ScaleAmt) <= 8 &&
2483 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2484 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2485 AM.Scale *= ScaleAmt;
2486 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2487 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2488 N2: AddSrc.getOperand(i: 1));
2489 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2490 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2491 AddSrc = ExtShift;
2492 Res = ExtShVal;
2493 }
2494 }
2495 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2496 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2497 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2498 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2499 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2500 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2501 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2502 CurDAG->RemoveDeadNode(N: N.getNode());
2503 return Res ? Res : ExtSrc;
2504 }
2505 }
2506 }
2507 }
2508
2509 // TODO: Handle extensions, shifted masks etc.
2510 return N;
2511}
2512
2513bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2514 unsigned Depth) {
2515 LLVM_DEBUG({
2516 dbgs() << "MatchAddress: ";
2517 AM.dump(CurDAG);
2518 });
2519 // Limit recursion.
2520 if (Depth >= SelectionDAG::MaxRecursionDepth)
2521 return matchAddressBase(N, AM);
2522
2523 // If this is already a %rip relative address, we can only merge immediates
2524 // into it. Instead of handling this in every case, we handle it here.
2525 // RIP relative addressing: %rip + 32-bit displacement!
2526 if (AM.isRIPRelative()) {
2527 // FIXME: JumpTable and ExternalSymbol addresses currently don't like
2528 // displacements. It isn't very important, but this should be fixed for
2529 // consistency.
2530 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2531 return true;
2532
2533 if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
2534 if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
2535 return false;
2536 return true;
2537 }
2538
2539 switch (N.getOpcode()) {
2540 default: break;
2541 case ISD::LOCAL_RECOVER: {
2542 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2543 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
2544 // Use the symbol and don't prefix it.
2545 AM.MCSym = ESNode->getMCSymbol();
2546 return false;
2547 }
2548 break;
2549 }
2550 case ISD::Constant: {
2551 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2552 if (!foldOffsetIntoAddress(Offset: Val, AM))
2553 return false;
2554 break;
2555 }
2556
2557 case X86ISD::Wrapper:
2558 case X86ISD::WrapperRIP:
2559 if (!matchWrapper(N, AM))
2560 return false;
2561 break;
2562
2563 case ISD::LOAD:
2564 if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
2565 return false;
2566 break;
2567
2568 case ISD::FrameIndex:
2569 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2570 AM.Base_Reg.getNode() == nullptr &&
2571 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) {
2572 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2573 AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
2574 return false;
2575 }
2576 break;
2577
2578 case ISD::SHL:
2579 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2580 break;
2581
2582 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
2583 unsigned Val = CN->getZExtValue();
2584 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2585 // that the base operand remains free for further matching. If
2586 // the base doesn't end up getting used, a post-processing step
2587 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2588 if (Val == 1 || Val == 2 || Val == 3) {
2589 SDValue ShVal = N.getOperand(i: 0);
2590 AM.Scale = 1 << Val;
2591 AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
2592 return false;
2593 }
2594 }
2595 break;
2596
2597 case ISD::SRL: {
2598 // Scale must not be used already.
2599 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2600
2601 // We only handle up to 64-bit values here as those are what matter for
2602 // addressing mode optimizations.
2603 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2604 "Unexpected value size!");
2605
2606 SDValue And = N.getOperand(i: 0);
2607 if (And.getOpcode() != ISD::AND) break;
2608 SDValue X = And.getOperand(i: 0);
2609
2610 // The mask used for the transform is expected to be post-shift, but we
2611 // found the shift first so just apply the shift to the mask before passing
2612 // it down.
2613 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
2614 !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
2615 break;
2616 uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);
2617
2618 // Try to fold the mask and shift into the scale, and return false if we
2619 // succeed.
2620 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
2621 return false;
2622 break;
2623 }
2624
2625 case ISD::SMUL_LOHI:
2626 case ISD::UMUL_LOHI:
2627 // A mul_lohi where we need the low part can be folded as a plain multiply.
2628 if (N.getResNo() != 0) break;
2629 [[fallthrough]];
2630 case ISD::MUL:
2631 case X86ISD::MUL_IMM:
2632 // X*[3,5,9] -> X+X*[2,4,8]
2633 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2634 AM.Base_Reg.getNode() == nullptr &&
2635 AM.IndexReg.getNode() == nullptr) {
2636 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
2637 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2638 CN->getZExtValue() == 9) {
2639 AM.Scale = unsigned(CN->getZExtValue())-1;
2640
2641 SDValue MulVal = N.getOperand(i: 0);
2642 SDValue Reg;
2643
2644 // Okay, we know that we have a scale by now. However, if the scaled
2645 // value is an add of something and a constant, we can fold the
2646 // constant into the disp field here.
2647 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2648 isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
2649 Reg = MulVal.getOperand(i: 0);
2650 auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
2651 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2652 if (foldOffsetIntoAddress(Offset: Disp, AM))
2653 Reg = N.getOperand(i: 0);
2654 } else {
2655 Reg = N.getOperand(i: 0);
2656 }
2657
2658 AM.IndexReg = AM.Base_Reg = Reg;
2659 return false;
2660 }
2661 }
2662 break;
2663
2664 case ISD::SUB: {
2665 // Given A-B, if A can be completely folded into the address with the
2666 // index field left unused, use -B as the index. This is a win if A has
2667 // multiple parts that can be folded into the address. It also saves a mov
2668 // if the base register has other uses, since it avoids a two-address sub
2669 // instruction; however, it costs an additional mov if the index register
2670 // has other uses.
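 // Illustrative case (operands assumed): for A - B where A is
 // (add Wrapper(@gv), 40), the symbol and constant fold into the displacement,
 // so B (assuming it has a single use and is not a plain CopyFromReg) is taken
 // as the index with Scale = 1 and NegateIndex set; the negation is emitted
 // when the address is finally materialized.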
2671
2672 // Add an artificial use to this node so that we can keep track of
2673 // it if it gets CSE'd with a different node.
2674 HandleSDNode Handle(N);
2675
2676 // Test if the LHS of the sub can be folded.
2677 X86ISelAddressMode Backup = AM;
2678 if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
2679 N = Handle.getValue();
2680 AM = Backup;
2681 break;
2682 }
2683 N = Handle.getValue();
2684 // Test if the index field is free for use.
2685 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2686 AM = Backup;
2687 break;
2688 }
2689
2690 int Cost = 0;
2691 SDValue RHS = N.getOperand(i: 1);
2692 // If the RHS involves a register with multiple uses, this
2693 // transformation incurs an extra mov, due to the neg instruction
2694 // clobbering its operand.
2695 if (!RHS.getNode()->hasOneUse() ||
2696 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2697 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2698 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2699 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2700 RHS.getOperand(i: 0).getValueType() == MVT::i32))
2701 ++Cost;
2702 // If the base is a register with multiple uses, this
2703 // transformation may save a mov.
2704 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2705 !AM.Base_Reg.getNode()->hasOneUse()) ||
2706 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2707 --Cost;
2708 // If the folded LHS was interesting, this transformation saves
2709 // address arithmetic.
2710 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2711 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2712 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2713 --Cost;
2714 // If it doesn't look like it may be an overall win, don't do it.
2715 if (Cost >= 0) {
2716 AM = Backup;
2717 break;
2718 }
2719
2720 // Ok, the transformation is legal and appears profitable. Go for it.
2721 // Negation will be emitted later to avoid creating dangling nodes if this
2722 // was an unprofitable LEA.
2723 AM.IndexReg = RHS;
2724 AM.NegateIndex = true;
2725 AM.Scale = 1;
2726 return false;
2727 }
2728
2729 case ISD::OR:
2730 case ISD::XOR:
2731 // See if we can treat the OR/XOR node as an ADD node.
2732 if (!CurDAG->isADDLike(Op: N))
2733 break;
2734 [[fallthrough]];
2735 case ISD::ADD:
2736 if (!matchAdd(N, AM, Depth))
2737 return false;
2738 break;
2739
2740 case ISD::AND: {
2741 // Perform some heroic transforms on an and of a constant-count shift
2742 // with a constant to enable use of the scaled offset field.
2743
2744 // Scale must not be used already.
2745 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2746
2747 // We only handle up to 64-bit values here as those are what matter for
2748 // addressing mode optimizations.
2749 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2750 "Unexpected value size!");
2751
2752 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
2753 break;
2754
2755 if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
2756 SDValue Shift = N.getOperand(i: 0);
2757 SDValue X = Shift.getOperand(i: 0);
2758
2759 uint64_t Mask = N.getConstantOperandVal(i: 1);
2760
2761 // Try to fold the mask and shift into an extract and scale.
2762 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2763 return false;
2764
2765 // Try to fold the mask and shift directly into the scale.
2766 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2767 return false;
2768
2769 // Try to fold the mask and shift into BEXTR and scale.
2770 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
2771 return false;
2772 }
2773
2774 // Try to swap the mask and shift to place shifts which can be done as
2775 // a scale on the outside of the mask.
2776 if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
2777 return false;
2778
2779 break;
2780 }
2781 case ISD::ZERO_EXTEND: {
2782 // Try to widen a zexted shift left to the same size as its use, so we can
2783 // match the shift as a scale factor.
2784 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2785 break;
2786
2787 SDValue Src = N.getOperand(i: 0);
2788
2789 // See if we can match a zext(addlike(x,c)).
2790 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2791 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2792 if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
2793 if (Index != N) {
2794 AM.IndexReg = Index;
2795 return false;
2796 }
2797
2798 // Peek through mask: zext(and(shl(x,c1),c2))
2799 APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
2800 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2801 if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
2802 Mask = MaskC->getAPIntValue();
2803 Src = Src.getOperand(i: 0);
2804 }
2805
2806 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2807 // Give up if the shift is not a valid scale factor [1,2,3].
2808 SDValue ShlSrc = Src.getOperand(i: 0);
2809 SDValue ShlAmt = Src.getOperand(i: 1);
2810 auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
2811 if (!ShAmtC)
2812 break;
2813 unsigned ShAmtV = ShAmtC->getZExtValue();
2814 if (ShAmtV > 3)
2815 break;
2816
2817 // The narrow shift must only shift out zero bits (it must be 'nuw').
2818 // That makes it safe to widen to the destination type.
2819 APInt HighZeros =
2820 APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
2821 if (!Src->getFlags().hasNoUnsignedWrap() &&
2822 !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
2823 break;
2824
2825 // zext (shl nuw i8 %x, C1) to i32
2826 // --> shl (zext i8 %x to i32), (zext C1)
2827 // zext (and (shl nuw i8 %x, C1), C2) to i32
2828 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2829 MVT SrcVT = ShlSrc.getSimpleValueType();
2830 MVT VT = N.getSimpleValueType();
2831 SDLoc DL(N);
2832
2833 SDValue Res = ShlSrc;
2834 if (!Mask.isAllOnes()) {
2835 Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
2836 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2837 Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
2838 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2839 }
2840 SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
2841 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
2842 SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
2843 insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
2844 CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
2845 CurDAG->RemoveDeadNode(N: N.getNode());
2846
2847 // Convert the shift to scale factor.
2848 AM.Scale = 1 << ShAmtV;
2849 // If matchIndexRecursively is not called here, Zext may be replaced by
2850 // other nodes but later still be used to call a builder method, so go
2851 // through it to pick up the final index value.
2852 AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
2853 return false;
2854 }
2855
2856 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2857 // Try to fold the mask and shift into an extract and scale.
2858 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2859 X: Src.getOperand(i: 0), AM))
2860 return false;
2861
2862 // Try to fold the mask and shift directly into the scale.
2863 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2864 X: Src.getOperand(i: 0), AM))
2865 return false;
2866
2867 // Try to fold the mask and shift into BEXTR and scale.
2868 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2869 X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
2870 return false;
2871 }
2872
2873 break;
2874 }
2875 }
2876
2877 return matchAddressBase(N, AM);
2878}
2879
2880/// Helper for MatchAddress. Add the specified node to the
2881/// specified addressing mode without any further recursion.
2882bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2883 // Is the base register already occupied?
2884 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2885 // If so, check to see if the scale index register is set.
2886 if (!AM.IndexReg.getNode()) {
2887 AM.IndexReg = N;
2888 AM.Scale = 1;
2889 return false;
2890 }
2891
2892 // Otherwise, we cannot select it.
2893 return true;
2894 }
2895
2896 // Default, generate it as a register.
2897 AM.BaseType = X86ISelAddressMode::RegBase;
2898 AM.Base_Reg = N;
2899 return false;
2900}
2901
2902bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2903 X86ISelAddressMode &AM,
2904 unsigned Depth) {
2905 LLVM_DEBUG({
2906 dbgs() << "MatchVectorAddress: ";
2907 AM.dump(CurDAG);
2908 });
2909 // Limit recursion.
2910 if (Depth >= SelectionDAG::MaxRecursionDepth)
2911 return matchAddressBase(N, AM);
2912
2913 // TODO: Support other operations.
2914 switch (N.getOpcode()) {
2915 case ISD::Constant: {
2916 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2917 if (!foldOffsetIntoAddress(Offset: Val, AM))
2918 return false;
2919 break;
2920 }
2921 case X86ISD::Wrapper:
2922 if (!matchWrapper(N, AM))
2923 return false;
2924 break;
2925 case ISD::ADD: {
2926 // Add an artificial use to this node so that we can keep track of
2927 // it if it gets CSE'd with a different node.
2928 HandleSDNode Handle(N);
2929
2930 X86ISelAddressMode Backup = AM;
2931 if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
2932 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2933 Depth: Depth + 1))
2934 return false;
2935 AM = Backup;
2936
2937 // Try again after commuting the operands.
2938 if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2939 Depth: Depth + 1) &&
2940 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
2941 Depth: Depth + 1))
2942 return false;
2943 AM = Backup;
2944
2945 N = Handle.getValue();
2946 break;
2947 }
2948 }
2949
2950 return matchAddressBase(N, AM);
2951}
2952
2953/// Helper for selectVectorAddr. Handles things that can be folded into a
2954/// gather/scatter address. The index register and scale should have already
2955/// been handled.
2956bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2957 return matchVectorAddressRecursively(N, AM, Depth: 0);
2958}
2959
2960bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2961 SDValue IndexOp, SDValue ScaleOp,
2962 SDValue &Base, SDValue &Scale,
2963 SDValue &Index, SDValue &Disp,
2964 SDValue &Segment) {
2965 X86ISelAddressMode AM;
2966 AM.Scale = ScaleOp->getAsZExtVal();
2967
2968 // Attempt to match index patterns, as long as we're not relying on implicit
2969 // sign-extension, which is performed BEFORE scale.
2970 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2971 AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
2972 else
2973 AM.IndexReg = IndexOp;
2974
2975 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2976 if (AddrSpace == X86AS::GS)
2977 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
2978 if (AddrSpace == X86AS::FS)
2979 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
2980 if (AddrSpace == X86AS::SS)
2981 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
2982
2983 SDLoc DL(BasePtr);
2984 MVT VT = BasePtr.getSimpleValueType();
2985
2986 // Try to match into the base and displacement fields.
2987 if (matchVectorAddress(N: BasePtr, AM))
2988 return false;
2989
2990 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2991 return true;
2992}
2993
2994/// Returns true if it is able to pattern match an addressing mode.
2995/// It returns, by reference, the operands which make up the maximal
2996/// addressing mode it can match.
2997///
2998/// Parent is the parent node of the addr operand that is being matched. It
2999/// is always a load, store, atomic node, or null. It is only null when
3000/// checking memory operands for inline asm nodes.
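///
/// E.g., for a load whose address computes p + 4*i + 40, a successful match
/// fills in roughly: Base = p, Scale = 4, Index = i, Disp = 40, and no segment.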
3001bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
3002 SDValue &Scale, SDValue &Index,
3003 SDValue &Disp, SDValue &Segment) {
3004 X86ISelAddressMode AM;
3005
3006 if (Parent &&
3007 // These opcodes all take an "addr:$ptr" operand but are not MemSDNodes, and
3008 // thus don't have proper addrspace info.
3009 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3010 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3011 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3012 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3013 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3014 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3015 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3016 unsigned AddrSpace =
3017 cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
3018 if (AddrSpace == X86AS::GS)
3019 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
3020 if (AddrSpace == X86AS::FS)
3021 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
3022 if (AddrSpace == X86AS::SS)
3023 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
3024 }
3025
3026 // Save the DL and VT before calling matchAddress, it can invalidate N.
3027 SDLoc DL(N);
3028 MVT VT = N.getSimpleValueType();
3029
3030 if (matchAddress(N, AM))
3031 return false;
3032
3033 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3034 return true;
3035}
3036
3037bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3038 // Cannot use 32 bit constants to reference objects in kernel/large code
3039 // model.
3040 if (TM.getCodeModel() == CodeModel::Kernel ||
3041 TM.getCodeModel() == CodeModel::Large)
3042 return false;
3043
3044 // In static codegen with small code model, we can get the address of a label
3045 // into a register with 'movl'.
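  // E.g., in the small code model "movl $sym, %eax" zero-extends into %rax and
  // can stand in for a 10-byte movabsq when the address is known to fit in
  // 32 bits.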
3046 if (N->getOpcode() != X86ISD::Wrapper)
3047 return false;
3048
3049 N = N.getOperand(i: 0);
3050
3051 // At least GNU as does not accept 'movl' for TPOFF relocations.
3052 // FIXME: We could use 'movl' when we know we are targeting MC.
3053 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3054 return false;
3055
3056 Imm = N;
3057 // Small/medium code model can reference non-TargetGlobalAddress objects with
3058 // 32 bit constants.
3059 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3060 return TM.getCodeModel() == CodeModel::Small ||
3061 TM.getCodeModel() == CodeModel::Medium;
3062 }
3063
3064 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
3065 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3066 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
3067
3068 return !TM.isLargeGlobalValue(GV);
3069}
3070
3071bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3072 SDValue &Index, SDValue &Disp,
3073 SDValue &Segment) {
3074 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3075 SDLoc DL(N);
3076
3077 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3078 return false;
3079
3080 EVT BaseType = Base.getValueType();
3081 unsigned SubReg;
3082 if (BaseType == MVT::i8)
3083 SubReg = X86::sub_8bit;
3084 else if (BaseType == MVT::i16)
3085 SubReg = X86::sub_16bit;
3086 else
3087 SubReg = X86::sub_32bit;
3088
3089 auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
3090 if (RN && RN->getReg() == 0)
3091 Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
3092 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3093 BaseType == MVT::i32) &&
3094 !isa<FrameIndexSDNode>(Val: Base)) {
3095 // Base could already be %rip, particularly in the x32 ABI.
3096 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
3097 VT: MVT::i64), 0);
3098 Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base);
3099 }
3100
3101 [[maybe_unused]] EVT IndexType = Index.getValueType();
3102 RN = dyn_cast<RegisterSDNode>(Val&: Index);
3103 if (RN && RN->getReg() == 0)
3104 Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
3105 else {
3106 assert((IndexType == BaseType) &&
3107 "Expect to be extending 8/16/32-bit registers for use in LEA");
3108 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
3109 VT: MVT::i64), 0);
3110 Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index);
3111 }
3112
3113 return true;
3114}
3115
3116/// Calls SelectAddr and determines if the maximal addressing
3117/// mode it matches can be cost effectively emitted as an LEA instruction.
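///
/// E.g., (add (add %reg1, (shl %reg2, 1)), 8) can be matched as a single
/// "leal 8(%reg1,%reg2,2), %dst" rather than a shift followed by two adds.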
3118bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3119 SDValue &Base, SDValue &Scale,
3120 SDValue &Index, SDValue &Disp,
3121 SDValue &Segment) {
3122 X86ISelAddressMode AM;
3123
3124 // Save the DL and VT before calling matchAddress, it can invalidate N.
3125 SDLoc DL(N);
3126 MVT VT = N.getSimpleValueType();
3127
3128 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3129 // segments.
3130 SDValue Copy = AM.Segment;
3131 SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32);
3132 AM.Segment = T;
3133 if (matchAddress(N, AM))
3134 return false;
3135 assert(T == AM.Segment);
3136 AM.Segment = Copy;
3137
3138 unsigned Complexity = 0;
3139 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3140 Complexity = 1;
3141 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3142 Complexity = 4;
3143
3144 if (AM.IndexReg.getNode())
3145 Complexity++;
3146
3147 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
3148 // use a simple shift.
3149 if (AM.Scale > 1)
3150 Complexity++;
3151
3152 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3153 // to a LEA. This is determined with some experimentation but is by no means
3154 // optimal (especially for code size consideration). LEA is nice because of
3155 // its three-address nature. Tweak the cost function again when we can run
3156 // convertToThreeAddress() at register allocation time.
3157 if (AM.hasSymbolicDisplacement()) {
3158 // For X86-64, always use LEA to materialize RIP-relative addresses.
3159 if (Subtarget->is64Bit())
3160 Complexity = 4;
3161 else
3162 Complexity += 2;
3163 }
3164
3165 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3166 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3167 // duplicating flag-producing instructions later in the pipeline.
3168 if (N.getOpcode() == ISD::ADD) {
3169 auto isMathWithFlags = [](SDValue V) {
3170 switch (V.getOpcode()) {
3171 case X86ISD::ADD:
3172 case X86ISD::SUB:
3173 case X86ISD::ADC:
3174 case X86ISD::SBB:
3175 case X86ISD::SMUL:
3176 case X86ISD::UMUL:
3177 /* TODO: These opcodes can be added safely, but we may want to justify
3178 their inclusion for different reasons (better for reg-alloc).
3179 case X86ISD::OR:
3180 case X86ISD::XOR:
3181 case X86ISD::AND:
3182 */
3183 // Value 1 is the flag output of the node - verify it's not dead.
3184 return !SDValue(V.getNode(), 1).use_empty();
3185 default:
3186 return false;
3187 }
3188 };
3189 // TODO: We might want to factor in whether there's a load folding
3190 // opportunity for the math op that disappears with LEA.
3191 if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
3192 Complexity++;
3193 }
3194
3195 if (AM.Disp)
3196 Complexity++;
3197
3198 // If it isn't worth using an LEA, reject it.
3199 if (Complexity <= 2)
3200 return false;
3201
3202 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3203 return true;
3204}
3205
3206/// This is only run on TargetGlobalTLSAddress nodes.
3207bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3208 SDValue &Scale, SDValue &Index,
3209 SDValue &Disp, SDValue &Segment) {
3210 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3211 N.getOpcode() == ISD::TargetExternalSymbol);
3212
3213 X86ISelAddressMode AM;
3214 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3215 AM.GV = GA->getGlobal();
3216 AM.Disp += GA->getOffset();
3217 AM.SymbolFlags = GA->getTargetFlags();
3218 } else {
3219 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3220 AM.ES = SA->getSymbol();
3221 AM.SymbolFlags = SA->getTargetFlags();
3222 }
3223
3224 if (Subtarget->is32Bit()) {
3225 AM.Scale = 1;
3226 AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32);
3227 }
3228
3229 MVT VT = N.getSimpleValueType();
3230 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3231 return true;
3232}
3233
3234bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3235 // Keep track of the original value type and whether this value was
3236 // truncated. If we see a truncation from pointer type to VT that truncates
3237 // bits that are known to be zero, we can use a narrow reference.
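  // E.g., a (truncate (X86Wrapper tglobaladdr:@g) to i16) -- @g being some
  // global -- can use @g directly as a 16-bit immediate when !absolute_symbol
  // metadata bounds its address below 2^16.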
3238 EVT VT = N.getValueType();
3239 bool WasTruncated = false;
3240 if (N.getOpcode() == ISD::TRUNCATE) {
3241 WasTruncated = true;
3242 N = N.getOperand(i: 0);
3243 }
3244
3245 if (N.getOpcode() != X86ISD::Wrapper)
3246 return false;
3247
3248 // We can only use non-GlobalValues as immediates if they were not truncated,
3249 // as we do not have any range information. If we have a GlobalValue and the
3250 // address was not truncated, we can select it as an operand directly.
3251 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3252 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3253 Op = N.getOperand(i: 0);
3254 // We can only select the operand directly if we didn't have to look past a
3255 // truncate.
3256 return !WasTruncated;
3257 }
3258
3259 // Check that the global's range fits into VT.
3260 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3261 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3262 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3263 return false;
3264
3265 // Okay, we can use a narrow reference.
3266 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3267 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3268 return true;
3269}
3270
3271bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3272 SDValue &Base, SDValue &Scale,
3273 SDValue &Index, SDValue &Disp,
3274 SDValue &Segment) {
3275 assert(Root && P && "Unknown root/parent nodes");
3276 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3277 !IsProfitableToFold(N, U: P, Root) ||
3278 !IsLegalToFold(N, U: P, Root, OptLevel))
3279 return false;
3280
3281 return selectAddr(Parent: N.getNode(),
3282 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3283}
3284
3285bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3286 SDValue &Base, SDValue &Scale,
3287 SDValue &Index, SDValue &Disp,
3288 SDValue &Segment) {
3289 assert(Root && P && "Unknown root/parent nodes");
3290 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3291 !IsProfitableToFold(N, U: P, Root) ||
3292 !IsLegalToFold(N, U: P, Root, OptLevel))
3293 return false;
3294
3295 return selectAddr(Parent: N.getNode(),
3296 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3297}
3298
3299/// Return an SDNode that returns the value of the global base register.
3300/// Output instructions required to initialize the global base register,
3301/// if necessary.
3302SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3303 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3304 auto &DL = MF->getDataLayout();
3305 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3306}
3307
3308bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3309 if (N->getOpcode() == ISD::TRUNCATE)
3310 N = N->getOperand(Num: 0).getNode();
3311 if (N->getOpcode() != X86ISD::Wrapper)
3312 return false;
3313
3314 auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
3315 if (!GA)
3316 return false;
3317
3318 auto *GV = GA->getGlobal();
3319 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3320 if (CR)
3321 return CR->getSignedMin().sge(RHS: -1ull << Width) &&
3322 CR->getSignedMax().slt(RHS: 1ull << Width);
3323 // In the kernel code model, globals are in the negative 2GB of the address
3324 // space, so globals can be a sign extended 32-bit immediate.
3325 // In other code models, small globals are in the low 2GB of the address
3326 // space, so sign extending them is equivalent to zero extending them.
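  // E.g., with Width == 32, a kernel-model global at 0xFFFFFFFF80001000 is
  // reachable as the sign-extended 32-bit immediate 0x80001000.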
3327 return Width == 32 && !TM.isLargeGlobalValue(GV);
3328}
3329
3330X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3331 assert(N->isMachineOpcode() && "Unexpected node");
3332 unsigned Opc = N->getMachineOpcode();
3333 const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc);
3334 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3335 if (CondNo < 0)
3336 return X86::COND_INVALID;
3337
3338 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3339}
3340
3341/// Test whether the given X86ISD::CMP node has any users that use a flag
3342/// other than ZF.
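///
/// E.g., flags feeding only SETE/SETNE or JE/JNE qualify; a user such as JA,
/// which also reads CF, does not.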
3343bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3344 // Examine each user of the node.
3345 for (SDUse &Use : Flags->uses()) {
3346 // Only check things that use the flags.
3347 if (Use.getResNo() != Flags.getResNo())
3348 continue;
3349 SDNode *User = Use.getUser();
3350 // Only examine CopyToReg uses that copy to EFLAGS.
3351 if (User->getOpcode() != ISD::CopyToReg ||
3352 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3353 return false;
3354 // Examine each user of the CopyToReg use.
3355 for (SDUse &FlagUse : User->uses()) {
3356 // Only examine the Flag result.
3357 if (FlagUse.getResNo() != 1)
3358 continue;
3359 // Anything unusual: assume conservatively.
3360 if (!FlagUse.getUser()->isMachineOpcode())
3361 return false;
3362 // Examine the condition code of the user.
3363 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3364
3365 switch (CC) {
3366 // Comparisons which only use the zero flag.
3367 case X86::COND_E: case X86::COND_NE:
3368 continue;
3369 // Anything else: assume conservatively.
3370 default:
3371 return false;
3372 }
3373 }
3374 }
3375 return true;
3376}
3377
3378/// Test whether the given X86ISD::CMP node has any uses which require the SF
3379/// flag to be accurate.
3380bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3381 // Examine each user of the node.
3382 for (SDUse &Use : Flags->uses()) {
3383 // Only check things that use the flags.
3384 if (Use.getResNo() != Flags.getResNo())
3385 continue;
3386 SDNode *User = Use.getUser();
3387 // Only examine CopyToReg uses that copy to EFLAGS.
3388 if (User->getOpcode() != ISD::CopyToReg ||
3389 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3390 return false;
3391 // Examine each user of the CopyToReg use.
3392 for (SDUse &FlagUse : User->uses()) {
3393 // Only examine the Flag result.
3394 if (FlagUse.getResNo() != 1)
3395 continue;
3396 // Anything unusual: assume conservatively.
3397 if (!FlagUse.getUser()->isMachineOpcode())
3398 return false;
3399 // Examine the condition code of the user.
3400 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3401
3402 switch (CC) {
3403 // Comparisons which don't examine the SF flag.
3404 case X86::COND_A: case X86::COND_AE:
3405 case X86::COND_B: case X86::COND_BE:
3406 case X86::COND_E: case X86::COND_NE:
3407 case X86::COND_O: case X86::COND_NO:
3408 case X86::COND_P: case X86::COND_NP:
3409 continue;
3410 // Anything else: assume conservatively.
3411 default:
3412 return false;
3413 }
3414 }
3415 }
3416 return true;
3417}
3418
3419static bool mayUseCarryFlag(X86::CondCode CC) {
3420 switch (CC) {
3421 // Comparisons which don't examine the CF flag.
3422 case X86::COND_O: case X86::COND_NO:
3423 case X86::COND_E: case X86::COND_NE:
3424 case X86::COND_S: case X86::COND_NS:
3425 case X86::COND_P: case X86::COND_NP:
3426 case X86::COND_L: case X86::COND_GE:
3427 case X86::COND_G: case X86::COND_LE:
3428 return false;
3429 // Anything else: assume conservatively.
3430 default:
3431 return true;
3432 }
3433}
3434
3435/// Test whether the given node which sets flags has any uses which require the
3436/// CF flag to be accurate.
3437bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3438 // Examine each user of the node.
3439 for (SDUse &Use : Flags->uses()) {
3440 // Only check things that use the flags.
3441 if (Use.getResNo() != Flags.getResNo())
3442 continue;
3443
3444 SDNode *User = Use.getUser();
3445 unsigned UserOpc = User->getOpcode();
3446
3447 if (UserOpc == ISD::CopyToReg) {
3448 // Only examine CopyToReg uses that copy to EFLAGS.
3449 if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3450 return false;
3451 // Examine each user of the CopyToReg use.
3452 for (SDUse &FlagUse : User->uses()) {
3453 // Only examine the Flag result.
3454 if (FlagUse.getResNo() != 1)
3455 continue;
3456 // Anything unusual: assume conservatively.
3457 if (!FlagUse.getUser()->isMachineOpcode())
3458 return false;
3459 // Examine the condition code of the user.
3460 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3461
3462 if (mayUseCarryFlag(CC))
3463 return false;
3464 }
3465
3466 // This CopyToReg is ok. Move on to the next user.
3467 continue;
3468 }
3469
3470 // This might be an unselected node. So look for the pre-isel opcodes that
3471 // use flags.
3472 unsigned CCOpNo;
3473 switch (UserOpc) {
3474 default:
3475 // Something unusual. Be conservative.
3476 return false;
3477 case X86ISD::SETCC: CCOpNo = 0; break;
3478 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3479 case X86ISD::CMOV: CCOpNo = 2; break;
3480 case X86ISD::BRCOND: CCOpNo = 2; break;
3481 }
3482
3483 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo);
3484 if (mayUseCarryFlag(CC))
3485 return false;
3486 }
3487 return true;
3488}
3489
3490/// Check whether or not the chain ending in StoreNode is suitable for doing
3491/// the {load; op; store} to modify transformation.
3492static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3493 SDValue StoredVal, SelectionDAG *CurDAG,
3494 unsigned LoadOpNo,
3495 LoadSDNode *&LoadNode,
3496 SDValue &InputChain) {
3497 // Is the stored value result 0 of the operation?
3498 if (StoredVal.getResNo() != 0) return false;
3499
3500 // Does the operation have any uses other than the store?
3501 if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;
3502
3503 // Is the store non-extending and non-indexed?
3504 if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
3505 return false;
3506
3507 SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
3508 // Is the stored value a non-extending and non-indexed load?
3509 if (!ISD::isNormalLoad(N: Load.getNode())) return false;
3510
3511 // Return LoadNode by reference.
3512 LoadNode = cast<LoadSDNode>(Val&: Load);
3513
3514 // Is the store the only read of the loaded value?
3515 if (!Load.hasOneUse())
3516 return false;
3517
3518 // Is the address of the store the same as the load?
3519 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3520 LoadNode->getOffset() != StoreNode->getOffset())
3521 return false;
3522
3523 bool FoundLoad = false;
3524 SmallVector<SDValue, 4> ChainOps;
3525 SmallVector<const SDNode *, 4> LoopWorklist;
3526 SmallPtrSet<const SDNode *, 16> Visited;
3527 const unsigned int Max = 1024;
3528
3529 // Visualization of Load-Op-Store fusion:
3530 // -------------------------
3531 // Legend:
3532 // *-lines = Chain operand dependencies.
3533 // |-lines = Normal operand dependencies.
3534 // Dependencies flow down and right. n-suffix references multiple nodes.
3535 //
3536 // C Xn C
3537 // * * *
3538 // * * *
3539 // Xn A-LD Yn TF Yn
3540 // * * \ | * |
3541 // * * \ | * |
3542 // * * \ | => A--LD_OP_ST
3543 // * * \| \
3544 // TF OP \
3545 // * | \ Zn
3546 // * | \
3547 // A-ST Zn
3548 //
3549
3550 // This merge induces dependences from: #1: Xn -> LD, OP, Zn
3551 // #2: Yn -> LD
3552 // #3: ST -> Zn
3553
3554 // Ensure the transform is safe by checking for the dual
3555 // dependencies to make sure we do not induce a loop.
3556
3557 // As LD is a predecessor to both OP and ST we can do this by checking:
3558 // a). if LD is a predecessor to a member of Xn or Yn.
3559 // b). if a Zn is a predecessor to ST.
3560
3561 // However, (b) can only occur through being a chain predecessor to
3562 // ST, which is the same as Zn being a member or predecessor of Xn,
3563 // which is a subset of LD being a predecessor of Xn. So it's
3564 // subsumed by check (a).
3565
3566 SDValue Chain = StoreNode->getChain();
3567
3568 // Gather X elements in ChainOps.
3569 if (Chain == Load.getValue(R: 1)) {
3570 FoundLoad = true;
3571 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3572 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3573 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3574 SDValue Op = Chain.getOperand(i);
3575 if (Op == Load.getValue(R: 1)) {
3576 FoundLoad = true;
3577 // Drop Load, but keep its chain. No cycle check necessary.
3578 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3579 continue;
3580 }
3581 LoopWorklist.push_back(Elt: Op.getNode());
3582 ChainOps.push_back(Elt: Op);
3583 }
3584 }
3585
3586 if (!FoundLoad)
3587 return false;
3588
3589 // Worklist is currently Xn. Add Yn to worklist.
3590 for (SDValue Op : StoredVal->ops())
3591 if (Op.getNode() != LoadNode)
3592 LoopWorklist.push_back(Elt: Op.getNode());
3593
3594 // Check (a) if Load is a predecessor to Xn + Yn
3595 if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
3596 TopologicalPrune: true))
3597 return false;
3598
3599 InputChain =
3600 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps);
3601 return true;
3602}
3603
3604// Change a chain of {load; op; store} of the same value into a simple op
3605// through memory of that value, if the uses of the modified value and its
3606// address are suitable.
3607//
3608// The tablegen memory operand pattern is currently not able to match the
3609// case where the EFLAGS of the original operation are used.
3610//
3611// To move this to tablegen, we'll need to improve tablegen to allow flags to
3612// be transferred from a node in the pattern to the result node, probably with
3613// a new keyword. For example, we have this
3614// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3615// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3616// but maybe need something like this
3617// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3618// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3619// (transferrable EFLAGS)]>;
3620//
3621// Until then, we manually fold these and instruction select the operation
3622// here.
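//
// E.g., a DAG of the form "store (add (load [addr]), 5), [addr]" can be
// selected as a single "addl $5, [addr]" (ADD32mi) when the load and store
// share the same address and the chain check below succeeds.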
3623bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3624 auto *StoreNode = cast<StoreSDNode>(Val: Node);
3625 SDValue StoredVal = StoreNode->getOperand(Num: 1);
3626 unsigned Opc = StoredVal->getOpcode();
3627
3628 // Before we try to select anything, make sure this is a memory operand size
3629 // and opcode we can handle. Note that this must match the code below that
3630 // actually lowers the opcodes.
3631 EVT MemVT = StoreNode->getMemoryVT();
3632 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3633 MemVT != MVT::i8)
3634 return false;
3635
3636 bool IsCommutable = false;
3637 bool IsNegate = false;
3638 switch (Opc) {
3639 default:
3640 return false;
3641 case X86ISD::SUB:
3642 IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
3643 break;
3644 case X86ISD::SBB:
3645 break;
3646 case X86ISD::ADD:
3647 case X86ISD::ADC:
3648 case X86ISD::AND:
3649 case X86ISD::OR:
3650 case X86ISD::XOR:
3651 IsCommutable = true;
3652 break;
3653 }
3654
3655 unsigned LoadOpNo = IsNegate ? 1 : 0;
3656 LoadSDNode *LoadNode = nullptr;
3657 SDValue InputChain;
3658 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3659 LoadNode, InputChain)) {
3660 if (!IsCommutable)
3661 return false;
3662
3663 // This operation is commutable, try the other operand.
3664 LoadOpNo = 1;
3665 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3666 LoadNode, InputChain))
3667 return false;
3668 }
3669
3670 SDValue Base, Scale, Index, Disp, Segment;
3671 if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3672 Segment))
3673 return false;
3674
3675 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3676 unsigned Opc8) {
3677 switch (MemVT.getSimpleVT().SimpleTy) {
3678 case MVT::i64:
3679 return Opc64;
3680 case MVT::i32:
3681 return Opc32;
3682 case MVT::i16:
3683 return Opc16;
3684 case MVT::i8:
3685 return Opc8;
3686 default:
3687 llvm_unreachable("Invalid size!");
3688 }
3689 };
3690
3691 MachineSDNode *Result;
3692 switch (Opc) {
3693 case X86ISD::SUB:
3694 // Handle negate.
3695 if (IsNegate) {
3696 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3697 X86::NEG8m);
3698 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3699 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
3700 VT2: MVT::Other, Ops);
3701 break;
3702 }
3703 [[fallthrough]];
3704 case X86ISD::ADD:
3705 // Try to match inc/dec.
3706 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3707 bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
3708 bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
3709 // ADD/SUB with 1/-1 where the carry flag isn't used can use inc/dec.
3710 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3711 unsigned NewOpc =
3712 ((Opc == X86ISD::ADD) == IsOne)
3713 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3714 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3715 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3716 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
3717 VT2: MVT::Other, Ops);
3718 break;
3719 }
3720 }
3721 [[fallthrough]];
3722 case X86ISD::ADC:
3723 case X86ISD::SBB:
3724 case X86ISD::AND:
3725 case X86ISD::OR:
3726 case X86ISD::XOR: {
3727 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3728 switch (Opc) {
3729 case X86ISD::ADD:
3730 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3731 X86::ADD8mr);
3732 case X86ISD::ADC:
3733 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3734 X86::ADC8mr);
3735 case X86ISD::SUB:
3736 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3737 X86::SUB8mr);
3738 case X86ISD::SBB:
3739 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3740 X86::SBB8mr);
3741 case X86ISD::AND:
3742 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3743 X86::AND8mr);
3744 case X86ISD::OR:
3745 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3746 case X86ISD::XOR:
3747 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3748 X86::XOR8mr);
3749 default:
3750 llvm_unreachable("Invalid opcode!");
3751 }
3752 };
3753 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3754 switch (Opc) {
3755 case X86ISD::ADD:
3756 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3757 X86::ADD8mi);
3758 case X86ISD::ADC:
3759 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3760 X86::ADC8mi);
3761 case X86ISD::SUB:
3762 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3763 X86::SUB8mi);
3764 case X86ISD::SBB:
3765 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3766 X86::SBB8mi);
3767 case X86ISD::AND:
3768 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3769 X86::AND8mi);
3770 case X86ISD::OR:
3771 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3772 X86::OR8mi);
3773 case X86ISD::XOR:
3774 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3775 X86::XOR8mi);
3776 default:
3777 llvm_unreachable("Invalid opcode!");
3778 }
3779 };
3780
3781 unsigned NewOpc = SelectRegOpcode(Opc);
3782 SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);
3783
3784 // See if the operand is a constant that we can fold into an immediate
3785 // operand.
3786 if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
3787 int64_t OperandV = OperandC->getSExtValue();
3788
3789 // Check if we can shrink the operand enough to fit in an immediate (or
3790 // fit into a smaller immediate) by negating it and switching the
3791 // operation.
3792 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3793 ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) ||
3794 (MemVT == MVT::i64 && !isInt<32>(x: OperandV) &&
3795 isInt<32>(x: -OperandV))) &&
3796 hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3797 OperandV = -OperandV;
3798 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3799 }
3800
3801 if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) {
3802 Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
3803 NewOpc = SelectImmOpcode(Opc);
3804 }
3805 }
3806
3807 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3808 SDValue CopyTo =
3809 CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS,
3810 N: StoredVal.getOperand(i: 2), Glue: SDValue());
3811
3812 const SDValue Ops[] = {Base, Scale, Index, Disp,
3813 Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
3814 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
3815 Ops);
3816 } else {
3817 const SDValue Ops[] = {Base, Scale, Index, Disp,
3818 Segment, Operand, InputChain};
3819 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
3820 Ops);
3821 }
3822 break;
3823 }
3824 default:
3825 llvm_unreachable("Invalid opcode!");
3826 }
3827
3828 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3829 LoadNode->getMemOperand()};
3830 CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);
3831
3832 // Update Load Chain uses as well.
3833 ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
3834 ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
3835 ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
3836 CurDAG->RemoveDeadNode(N: Node);
3837 return true;
3838}
3839
3840// See if this is an X & Mask that we can match to BEXTR/BZHI.
3841// Where Mask is one of the following patterns:
3842// a) x & (1 << nbits) - 1
3843// b) x & ~(-1 << nbits)
3844// c) x & (-1 >> (32 - y))
3845// d) x << (32 - y) >> (32 - y)
3846// e) (1 << nbits) - 1
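//
// E.g., with BMI2, pattern (a) "x & ((1 << n) - 1)" selects to BZHI with n as
// the bit count; with only BMI1 it becomes BEXTR with control (n << 8) | 0.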
3847bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3848 assert(
3849 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3850 Node->getOpcode() == ISD::SRL) &&
3851 "Should be either an and-mask, or right-shift after clearing high bits.");
3852
3853 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3854 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3855 return false;
3856
3857 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3858
3859 // Only supported for 32 and 64 bits.
3860 if (NVT != MVT::i32 && NVT != MVT::i64)
3861 return false;
3862
3863 SDValue NBits;
3864 bool NegateNBits;
3865
3866 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3867 // Else, if we only have BMI1's BEXTR, we require one-use.
3868 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3869 auto checkUses = [AllowExtraUsesByDefault](
3870 SDValue Op, unsigned NUses,
3871 std::optional<bool> AllowExtraUses) {
3872 return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) ||
3873 Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo());
3874 };
3875 auto checkOneUse = [checkUses](SDValue Op,
3876 std::optional<bool> AllowExtraUses =
3877 std::nullopt) {
3878 return checkUses(Op, 1, AllowExtraUses);
3879 };
3880 auto checkTwoUse = [checkUses](SDValue Op,
3881 std::optional<bool> AllowExtraUses =
3882 std::nullopt) {
3883 return checkUses(Op, 2, AllowExtraUses);
3884 };
3885
3886 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3887 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3888 assert(V.getSimpleValueType() == MVT::i32 &&
3889 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3890 "Expected i64 -> i32 truncation");
3891 V = V.getOperand(i: 0);
3892 }
3893 return V;
3894 };
3895
3896 // a) x & ((1 << nbits) + (-1))
3897 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3898 &NegateNBits](SDValue Mask) -> bool {
3899 // Match `add`. Must only have one use!
3900 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3901 return false;
3902 // We should be adding an all-ones constant (i.e. subtracting one).
3903 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3904 return false;
3905 // Match `1 << nbits`. Might be truncated. Must only have one use!
3906 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3907 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3908 return false;
3909 if (!isOneConstant(V: M0->getOperand(Num: 0)))
3910 return false;
3911 NBits = M0->getOperand(Num: 1);
3912 NegateNBits = false;
3913 return true;
3914 };
3915
3916 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3917 V = peekThroughOneUseTruncation(V);
3918 return CurDAG->MaskedValueIsAllOnes(
3919 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
3920 loBitsSet: NVT.getSizeInBits()));
3921 };
3922
3923 // b) x & ~(-1 << nbits)
3924 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3925 &NBits, &NegateNBits](SDValue Mask) -> bool {
3926 // Match `~()`. Must only have one use!
3927 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3928 return false;
3929 // The -1 only has to be all-ones for the final Node's NVT.
3930 if (!isAllOnes(Mask->getOperand(Num: 1)))
3931 return false;
3932 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3933 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3934 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3935 return false;
3936 // The -1 only has to be all-ones for the final Node's NVT.
3937 if (!isAllOnes(M0->getOperand(Num: 0)))
3938 return false;
3939 NBits = M0->getOperand(Num: 1);
3940 NegateNBits = false;
3941 return true;
3942 };
3943
3944 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3945 // or leave the shift amount as-is, but then we'll have to negate it.
3946 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3947 unsigned Bitwidth) {
3948 NBits = ShiftAmt;
3949 NegateNBits = true;
3950 // Skip over a truncate of the shift amount, if any.
3951 if (NBits.getOpcode() == ISD::TRUNCATE)
3952 NBits = NBits.getOperand(i: 0);
3953 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3954 // If it doesn't match, that's fine, we'll just negate it ourselves.
3955 if (NBits.getOpcode() != ISD::SUB)
3956 return;
3957 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
3958 if (!V0 || V0->getZExtValue() != Bitwidth)
3959 return;
3960 NBits = NBits.getOperand(i: 1);
3961 NegateNBits = false;
3962 };
3963
3964 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3965 // or
3966 // c) x & (-1 >> (32 - y))
3967 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3968 canonicalizeShiftAmt](SDValue Mask) -> bool {
3969 // The mask itself may be truncated.
3970 Mask = peekThroughOneUseTruncation(Mask);
3971 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3972 // Match `l>>`. Must only have one use!
3973 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3974 return false;
3975 // We should be shifting a truly all-ones constant.
3976 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
3977 return false;
3978 SDValue M1 = Mask.getOperand(i: 1);
3979 // The shift amount should not be used externally.
3980 if (!checkOneUse(M1))
3981 return false;
3982 canonicalizeShiftAmt(M1, Bitwidth);
3983 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3984 // is no extra use of the mask. Clearly, there was one since we are here.
3985 // But at the same time, if we need to negate the shift amount,
3986 // then we don't want the mask to stick around, else it's unprofitable.
3987 return !NegateNBits;
3988 };
3989
3990 SDValue X;
3991
3992 // d) x << z >> z but then we'll have to subtract z from bitwidth
3993 // or
3994 // d) x << (32 - y) >> (32 - y)
3995 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3996 AllowExtraUsesByDefault, &NegateNBits,
3997 &X](SDNode *Node) -> bool {
3998 if (Node->getOpcode() != ISD::SRL)
3999 return false;
4000 SDValue N0 = Node->getOperand(Num: 0);
4001 if (N0->getOpcode() != ISD::SHL)
4002 return false;
4003 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
4004 SDValue N1 = Node->getOperand(Num: 1);
4005 SDValue N01 = N0->getOperand(Num: 1);
4006 // Both of the shifts must be by the exact same value.
4007 if (N1 != N01)
4008 return false;
4009 canonicalizeShiftAmt(N1, Bitwidth);
4010 // There should not be any external uses of the inner shift / shift amount.
4011 // Note that while we are generally okay with external uses given BMI2,
4012 // if we need to negate the shift amount, we are not okay with extra uses.
4013 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4014 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4015 return false;
4016 X = N0->getOperand(Num: 0);
4017 return true;
4018 };
4019
4020 auto matchLowBitMask = [matchPatternA, matchPatternB,
4021 matchPatternC](SDValue Mask) -> bool {
4022 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4023 };
4024
4025 if (Node->getOpcode() == ISD::AND) {
4026 X = Node->getOperand(Num: 0);
4027 SDValue Mask = Node->getOperand(Num: 1);
4028
4029 if (matchLowBitMask(Mask)) {
4030 // Great.
4031 } else {
4032 std::swap(a&: X, b&: Mask);
4033 if (!matchLowBitMask(Mask))
4034 return false;
4035 }
4036 } else if (matchLowBitMask(SDValue(Node, 0))) {
4037 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
4038 } else if (!matchPatternD(Node))
4039 return false;
4040
4041 // If we need to negate the shift amount, require BMI2 BZHI support.
4042 // It's just too unprofitable for BMI1 BEXTR.
4043 if (NegateNBits && !Subtarget->hasBMI2())
4044 return false;
4045
4046 SDLoc DL(Node);
4047
4048 // Truncate the shift amount.
4049 NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits);
4050 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4051
4052 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4053 // All the other bits are undefined, we do not care about them.
4054 SDValue ImplDef = SDValue(
4055 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0);
4056 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
4057
4058 SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32);
4059 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
4060 NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL,
4061 VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal),
4062 0);
4063 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4064
4065 // We might have matched the amount of high bits to be cleared,
4066 // but we want the amount of low bits to be kept, so negate it then.
4067 if (NegateNBits) {
4068 SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32);
4069 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
4070
4071 NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits);
4072 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4073 }
4074
4075 if (Subtarget->hasBMI2()) {
4076 // Great, just emit the BZHI.
4077 if (NVT != MVT::i32) {
4078 // But have to place the bit count into the wide-enough register first.
4079 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
4080 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4081 }
4082
4083 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4084 ReplaceNode(F: Node, T: Extract.getNode());
4085 SelectCode(N: Extract.getNode());
4086 return true;
4087 }
4088
4089 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
4090 // *logically* shifted (potentially with a one-use trunc in between),
4091 // and the truncation was the only use of the shift,
4092 // and if so look past the one-use truncation.
4093 {
4094 SDValue RealX = peekThroughOneUseTruncation(X);
4095 // FIXME: only if the shift is one-use?
4096 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4097 X = RealX;
4098 }
4099
4100 MVT XVT = X.getSimpleValueType();
4101
4102 // Else, emitting BEXTR requires one more step.
4103 // The 'control' of BEXTR has the pattern of:
4104 // [15...8 bit][ 7...0 bit] location
4105 // [ bit count][ shift] name
4106 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
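  // E.g., to keep the low 5 bits of (x >> 3), the control would be
  // (5 << 8) | 3 == 0x0503.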
4107
4108 // Shift NBits left by 8 bits, thus producing 'control'.
4109 // This leaves the low 8 bits zero.
4110 SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8);
4111 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
4112 SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8);
4113 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4114
4115 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4116 // FIXME: only if the shift is one-use?
4117 if (X.getOpcode() == ISD::SRL) {
4118 SDValue ShiftAmt = X.getOperand(i: 1);
4119 X = X.getOperand(i: 0);
4120
4121 assert(ShiftAmt.getValueType() == MVT::i8 &&
4122 "Expected shift amount to be i8");
4123
4124 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4125 // We could zext to i16 in some form, but we intentionally don't do that.
4126 SDValue OrigShiftAmt = ShiftAmt;
4127 ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt);
4128 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4129
4130 // And now 'or' these low 8 bits of shift amount into the 'control'.
4131 Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt);
4132 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4133 }
4134
4135 // But have to place the 'control' into the wide-enough register first.
4136 if (XVT != MVT::i32) {
4137 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4138 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4139 }
4140
4141 // And finally, form the BEXTR itself.
4142 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4143
4144 // The 'X' was originally truncated. Do that now.
4145 if (XVT != NVT) {
4146 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4147 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4148 }
4149
4150 ReplaceNode(F: Node, T: Extract.getNode());
4151 SelectCode(N: Extract.getNode());
4152
4153 return true;
4154}
4155
4156// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
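//
// E.g., "(x >> 4) & 0xFFF" has Shift == 4 and MaskSize == 12, so when BEXTR is
// preferred its control becomes (12 << 8) | 4 == 0xC04.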
4157MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4158 MVT NVT = Node->getSimpleValueType(ResNo: 0);
4159 SDLoc dl(Node);
4160
4161 SDValue N0 = Node->getOperand(Num: 0);
4162 SDValue N1 = Node->getOperand(Num: 1);
4163
4164 // If we have TBM we can use an immediate for the control. If we have BMI
4165 // we should only do this if the BEXTR instruction is implemented well.
4166 // Otherwise moving the control into a register makes this more costly.
4167 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4168 // hoisting the move immediate would make it worthwhile with a less optimal
4169 // BEXTR?
4170 bool PreferBEXTR =
4171 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4172 if (!PreferBEXTR && !Subtarget->hasBMI2())
4173 return nullptr;
4174
4175 // Must have a shift right.
4176 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4177 return nullptr;
4178
4179 // Shift can't have additional users.
4180 if (!N0->hasOneUse())
4181 return nullptr;
4182
4183 // Only supported for 32 and 64 bits.
4184 if (NVT != MVT::i32 && NVT != MVT::i64)
4185 return nullptr;
4186
4187 // Shift amount and RHS of and must be constant.
4188 auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
4189 auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
4190 if (!MaskCst || !ShiftCst)
4191 return nullptr;
4192
4193 // And RHS must be a mask.
4194 uint64_t Mask = MaskCst->getZExtValue();
4195 if (!isMask_64(Value: Mask))
4196 return nullptr;
4197
4198 uint64_t Shift = ShiftCst->getZExtValue();
4199 uint64_t MaskSize = llvm::popcount(Value: Mask);
4200
4201 // Don't interfere with something that can be handled by extracting AH.
4202 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4203 if (Shift == 8 && MaskSize == 8)
4204 return nullptr;
4205
4206 // Make sure we are only using bits that were in the original value, not
4207 // shifted in.
4208 if (Shift + MaskSize > NVT.getSizeInBits())
4209 return nullptr;
4210
4211 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4212 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4213 // does not fit into 32 bits. Load folding is not a sufficient reason.
4214 if (!PreferBEXTR && MaskSize <= 32)
4215 return nullptr;
4216
4217 SDValue Control;
4218 unsigned ROpc, MOpc;
4219
4220#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4221 if (!PreferBEXTR) {
4222 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4223 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4224 // Let's perform the mask first and apply the shift later. Note that we need
4225 // to widen the mask to account for the shift that we'll apply afterwards!
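    // E.g., for a 64-bit (x >> 4) & 0xFFFFFFFFF (a 36-bit mask), BZHI keeps the
    // low Shift + MaskSize = 40 bits first, and the SHR by 4 is emitted below.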
4226 Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
4227 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4228 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4229 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4230 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4231 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4232 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4233 } else {
4234 // The 'control' of BEXTR has the pattern of:
4235 // [15...8 bit][ 7...0 bit] location
4236 // [ bit count][ shift] name
4237 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4238 Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
4239 if (Subtarget->hasTBM()) {
4240 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4241 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4242 } else {
4243 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4244 // BMI requires the immediate to be placed in a register.
4245 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4246 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4247 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4248 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4249 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4250 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4251 }
4252 }
4253
4254 MachineSDNode *NewNode;
4255 SDValue Input = N0->getOperand(Num: 0);
4256 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4257 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4258 SDValue Ops[] = {
4259 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
4260 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
4261 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4262 // Update the chain.
4263 ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
4264 // Record the mem-refs
4265 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
4266 } else {
4267 NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control);
4268 }
4269
4270 if (!PreferBEXTR) {
4271 // We still need to apply the shift.
4272 SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
4273 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4274 : GET_ND_IF_ENABLED(X86::SHR32ri);
4275 NewNode =
4276 CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
4277 }
4278
4279 return NewNode;
4280}
4281
4282// Emit a PCMPISTR(I/M) instruction.
4283MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4284 bool MayFoldLoad, const SDLoc &dl,
4285 MVT VT, SDNode *Node) {
4286 SDValue N0 = Node->getOperand(Num: 0);
4287 SDValue N1 = Node->getOperand(Num: 1);
4288 SDValue Imm = Node->getOperand(Num: 2);
4289 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4290 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4291
4292 // Try to fold a load. No need to check alignment.
4293 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4294 if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4295 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4296 N1.getOperand(i: 0) };
4297 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other);
4298 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4299 // Update the chain.
4300 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
4301 // Record the mem-refs
4302 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
4303 return CNode;
4304 }
4305
4306 SDValue Ops[] = { N0, N1, Imm };
4307 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32);
4308 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4309 return CNode;
4310}
4311
4312// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4313// to emit a second instruction after this one. This is needed since we have two
4314// copyToReg nodes glued before this and we need to continue that glue through.
4315MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4316 bool MayFoldLoad, const SDLoc &dl,
4317 MVT VT, SDNode *Node,
4318 SDValue &InGlue) {
4319 SDValue N0 = Node->getOperand(Num: 0);
4320 SDValue N2 = Node->getOperand(Num: 2);
4321 SDValue Imm = Node->getOperand(Num: 4);
4322 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4323 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4324
4325 // Try to fold a load. No need to check alignment.
4326 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4327 if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4328 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4329 N2.getOperand(i: 0), InGlue };
4330 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
4331 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4332 InGlue = SDValue(CNode, 3);
4333 // Update the chain.
4334 ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
4335 // Record the mem-refs
4336 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
4337 return CNode;
4338 }
4339
4340 SDValue Ops[] = { N0, N2, Imm, InGlue };
4341 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue);
4342 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4343 InGlue = SDValue(CNode, 2);
4344 return CNode;
4345}
4346
4347bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4348 EVT VT = N->getValueType(ResNo: 0);
4349
4350 // Only handle scalar shifts.
4351 if (VT.isVector())
4352 return false;
4353
4354 // Narrower shifts only mask to 5 bits in hardware.
4355 unsigned Size = VT == MVT::i64 ? 64 : 32;
4356
4357 SDValue OrigShiftAmt = N->getOperand(Num: 1);
4358 SDValue ShiftAmt = OrigShiftAmt;
4359 SDLoc DL(N);
4360
4361 // Skip over a truncate of the shift amount.
4362 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4363 ShiftAmt = ShiftAmt->getOperand(Num: 0);
4364
4365 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4366 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4367
4368 SDValue NewShiftAmt;
4369 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4370 ShiftAmt->getOpcode() == ISD::XOR) {
4371 SDValue Add0 = ShiftAmt->getOperand(Num: 0);
4372 SDValue Add1 = ShiftAmt->getOperand(Num: 1);
4373 auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
4374 auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
4375 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4376 // to avoid the ADD/SUB/XOR.
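    // E.g., for a 32-bit shift, "x << (y + 32)" behaves like "x << y" because
    // the hardware masks the count to 5 bits, so the add can be dropped.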
4377 if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
4378 NewShiftAmt = Add0;
4379
4380 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4381 ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
4382 (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
4383 // If the shift amount is (Size*N-1) -/^ X, only the low bits matter, so we
4384 // can replace it with a NOT. In the XOR case it may save some code size;
4385 // in the SUB case it may also save a move.
4386 assert(Add0C == nullptr || Add1C == nullptr);
4387
4388 // We can only do N-X, not X-N
4389 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4390 return false;
4391
4392 EVT OpVT = ShiftAmt.getValueType();
4393
4394 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
4395 NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
4396 N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
4397 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
4398 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4399 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4400 // -X to generate a NEG instead of a SUB of a constant.
4401 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4402 Add0C->getZExtValue() != 0) {
4403 EVT SubVT = ShiftAmt.getValueType();
4404 SDValue X;
4405 if (Add0C->getZExtValue() % Size == 0)
4406 X = Add1;
4407 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4408 Add0C->getZExtValue() % 32 == 0) {
4409 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4410 // This is mainly beneficial if we already compute (x+n*32).
4411 if (Add1.getOpcode() == ISD::TRUNCATE) {
4412 Add1 = Add1.getOperand(i: 0);
4413 SubVT = Add1.getValueType();
4414 }
4415 if (Add0.getValueType() != SubVT) {
4416 Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
4417 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
4418 }
4419
4420 X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
4421 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
4422 } else
4423 return false;
4424 // Insert a negate op.
4425 // TODO: This isn't guaranteed to replace the sub if the sub has another
4426 // user in a logic cone that is not a shift.
4427 SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
4428 SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
4429 NewShiftAmt = Neg;
4430
4431 // Insert these operands into a valid topological order so they can
4432 // get selected independently.
4433 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
4434 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
4435 } else
4436 return false;
4437 } else
4438 return false;
4439
4440 if (NewShiftAmt.getValueType() != MVT::i8) {
4441 // Need to truncate the shift amount.
4442 NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt);
4443 // Add to a correct topological ordering.
4444 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4445 }
4446
4447 // Insert a new mask to keep the shift amount legal. This should be removed
4448 // by isel patterns.
4449 NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt,
4450 N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8));
4451 // Place in a correct topological ordering.
4452 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4453
4454 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
4455 Op2: NewShiftAmt);
4456 if (UpdatedNode != N) {
4457 // If we found an existing node, we should replace ourselves with that node
4458 // and wait for it to be selected after its other users.
4459 ReplaceNode(F: N, T: UpdatedNode);
4460 return true;
4461 }
4462
4463 // If the original shift amount is now dead, delete it so that we don't run
4464 // it through isel.
4465 if (OrigShiftAmt.getNode()->use_empty())
4466 CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());
4467
4468 // Now that we've optimized the shift amount, defer to normal isel to get
4469 // load folding and legacy vs BMI2 selection without repeating it here.
4470 SelectCode(N);
4471 return true;
4472}
4473
4474bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4475 MVT NVT = N->getSimpleValueType(ResNo: 0);
4476 unsigned Opcode = N->getOpcode();
4477 SDLoc dl(N);
4478
4479 // For operations of the form (x << C1) op C2, check if we can use a smaller
4480 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
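  // For example, (x << 8) | 0x1F00 becomes ((x | 0x1F) << 8): 0x1F00 needs a
  // 4-byte immediate, while 0x1F fits in a sign-extended 8-bit immediate.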
4481 SDValue Shift = N->getOperand(Num: 0);
4482 SDValue N1 = N->getOperand(Num: 1);
4483
4484 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
4485 if (!Cst)
4486 return false;
4487
4488 int64_t Val = Cst->getSExtValue();
4489
4490 // If we have an any_extend feeding the logic op, look through it to see if
4491 // there is a shift behind it. But only if the op doesn't use the extended bits.
4492 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4493 bool FoundAnyExtend = false;
4494 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4495 Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
4496 isUInt<32>(x: Val)) {
4497 FoundAnyExtend = true;
4498 Shift = Shift.getOperand(i: 0);
4499 }
4500
4501 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4502 return false;
4503
4504 // i8 is unshrinkable, i16 should be promoted to i32.
4505 if (NVT != MVT::i32 && NVT != MVT::i64)
4506 return false;
4507
4508 auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
4509 if (!ShlCst)
4510 return false;
4511
4512 uint64_t ShAmt = ShlCst->getZExtValue();
4513
4514 // Make sure that we don't change the operation by removing bits.
4515 // This only matters for OR and XOR, AND is unaffected.
4516 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4517 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4518 return false;
4519
4520 // Check the minimum bitwidth for the new constant.
4521 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4522 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4523 if (Opcode == ISD::AND) {
4524 // AND32ri is the same as AND64ri32 with zext imm.
4525 // Try this before sign extended immediates below.
4526 ShiftedVal = (uint64_t)Val >> ShAmt;
4527 if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
4528 return true;
4529 // Also swap order when the AND can become MOVZX.
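      // (an AND with 0xff or 0xffff selects to MOVZX, which needs no
      // immediate at all, so keeping that form on the inner op is a win).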
4530 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4531 return true;
4532 }
4533 ShiftedVal = Val >> ShAmt;
4534 if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
4535 (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
4536 return true;
4537 if (Opcode != ISD::AND) {
4538 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4539 ShiftedVal = (uint64_t)Val >> ShAmt;
4540 if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
4541 return true;
4542 }
4543 return false;
4544 };
4545
4546 int64_t ShiftedVal;
4547 if (!CanShrinkImmediate(ShiftedVal))
4548 return false;
4549
4550 // Ok, we can reorder to get a smaller immediate.
4551
4552 // But it's possible the original immediate allowed an AND to become MOVZX.
4553 // We do this check late to keep the MaskedValueIsZero call as late as
4554 // possible.
4555 if (Opcode == ISD::AND) {
4556 // Find the smallest zext this could possibly be.
4557 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4558 ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));
4559
4560 // Figure out which bits need to be zero to achieve that mask.
4561 APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
4562 loBitsSet: ZExtWidth);
4563 NeededMask &= ~Cst->getAPIntValue();
4564
4565 if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
4566 return false;
4567 }
4568
4569 SDValue X = Shift.getOperand(i: 0);
4570 if (FoundAnyExtend) {
4571 SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
4572 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
4573 X = NewX;
4574 }
4575
4576 SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT);
4577 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
4578 SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
4579 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
4580 SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
4581 N2: Shift.getOperand(i: 1));
4582 ReplaceNode(F: N, T: NewSHL.getNode());
4583 SelectCode(N: NewSHL.getNode());
4584 return true;
4585}
4586
4587bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4588 SDNode *ParentB, SDNode *ParentC,
4589 SDValue A, SDValue B, SDValue C,
4590 uint8_t Imm) {
4591 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4592 C.isOperandOf(ParentC) && "Incorrect parent node");
4593
4594 auto tryFoldLoadOrBCast =
4595 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4596 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4597 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4598 return true;
4599
4600 // Not a load, check for broadcast which may be behind a bitcast.
4601 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4602 P = L.getNode();
4603 L = L.getOperand(i: 0);
4604 }
4605
4606 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4607 return false;
4608
4609 // Only 32 and 64 bit broadcasts are supported.
4610 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4611 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4612 if (Size != 32 && Size != 64)
4613 return false;
4614
4615 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
4616 };
4617
4618 bool FoldedLoad = false;
4619 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4620 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4621 FoldedLoad = true;
4622 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4623 Tmp4)) {
4624 FoldedLoad = true;
4625 std::swap(a&: A, b&: C);
4626 // Swap bits 1/4 and 3/6.
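    // The immediate bit for inputs (a,b,c) lives at index a*4+b*2+c, so
    // exchanging A and C swaps the bits whose indices differ in a and c
    // (1<->4 and 3<->6) and leaves indices 0, 2, 5 and 7 (mask 0xa5) alone.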
4627 uint8_t OldImm = Imm;
4628 Imm = OldImm & 0xa5;
4629 if (OldImm & 0x02) Imm |= 0x10;
4630 if (OldImm & 0x10) Imm |= 0x02;
4631 if (OldImm & 0x08) Imm |= 0x40;
4632 if (OldImm & 0x40) Imm |= 0x08;
4633 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4634 Tmp4)) {
4635 FoldedLoad = true;
4636 std::swap(a&: B, b&: C);
4637 // Swap bits 1/2 and 5/6.
4638 uint8_t OldImm = Imm;
4639 Imm = OldImm & 0x99;
4640 if (OldImm & 0x02) Imm |= 0x04;
4641 if (OldImm & 0x04) Imm |= 0x02;
4642 if (OldImm & 0x20) Imm |= 0x40;
4643 if (OldImm & 0x40) Imm |= 0x20;
4644 }
4645
4646 SDLoc DL(Root);
4647
4648 SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
4649
4650 MVT NVT = Root->getSimpleValueType(ResNo: 0);
4651
4652 MachineSDNode *MNode;
4653 if (FoldedLoad) {
4654 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
4655
4656 unsigned Opc;
4657 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4658 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
4659 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4660 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4661
4662 bool UseD = EltSize == 32;
4663 if (NVT.is128BitVector())
4664 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4665 else if (NVT.is256BitVector())
4666 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4667 else if (NVT.is512BitVector())
4668 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4669 else
4670 llvm_unreachable("Unexpected vector size!");
4671 } else {
4672 bool UseD = NVT.getVectorElementType() == MVT::i32;
4673 if (NVT.is128BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4675 else if (NVT.is256BitVector())
4676 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4677 else if (NVT.is512BitVector())
4678 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4679 else
4680 llvm_unreachable("Unexpected vector size!");
4681 }
4682
4683 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
4684 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);
4685
4686 // Update the chain.
4687 ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
4688 // Record the mem-refs
4689 CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
4690 } else {
4691 bool UseD = NVT.getVectorElementType() == MVT::i32;
4692 unsigned Opc;
4693 if (NVT.is128BitVector())
4694 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4695 else if (NVT.is256BitVector())
4696 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4697 else if (NVT.is512BitVector())
4698 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4699 else
4700 llvm_unreachable("Unexpected vector size!");
4701
4702 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
4703 }
4704
4705 ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
4706 CurDAG->RemoveDeadNode(N: Root);
4707 return true;
4708}
4709
4710// Try to match two logic ops to a VPTERNLOG.
4711// FIXME: Handle more complex patterns that use an operand more than once?
4712bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4713 MVT NVT = N->getSimpleValueType(ResNo: 0);
4714
4715 // Make sure we support VPTERNLOG.
4716 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4717 NVT.getVectorElementType() == MVT::i1)
4718 return false;
4719
4720 // We need VLX for 128/256-bit.
4721 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4722 return false;
4723
4724 SDValue N0 = N->getOperand(Num: 0);
4725 SDValue N1 = N->getOperand(Num: 1);
4726
4727 auto getFoldableLogicOp = [](SDValue Op) {
4728 // Peek through single use bitcast.
4729 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4730 Op = Op.getOperand(i: 0);
4731
4732 if (!Op.hasOneUse())
4733 return SDValue();
4734
4735 unsigned Opc = Op.getOpcode();
4736 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4737 Opc == X86ISD::ANDNP)
4738 return Op;
4739
4740 return SDValue();
4741 };
4742
4743 SDValue A, FoldableOp;
4744 if ((FoldableOp = getFoldableLogicOp(N1))) {
4745 A = N0;
4746 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4747 A = N1;
4748 } else
4749 return false;
4750
4751 SDValue B = FoldableOp.getOperand(i: 0);
4752 SDValue C = FoldableOp.getOperand(i: 1);
4753 SDNode *ParentA = N;
4754 SDNode *ParentB = FoldableOp.getNode();
4755 SDNode *ParentC = FoldableOp.getNode();
4756
4757 // We can build the appropriate control immediate by performing the logic
4758 // operation we're matching using these constants for A, B, and C.
4759 uint8_t TernlogMagicA = 0xf0;
4760 uint8_t TernlogMagicB = 0xcc;
4761 uint8_t TernlogMagicC = 0xaa;
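  // For example, matching A | (B & C) gives 0xf0 | (0xcc & 0xaa) = 0xf8,
  // which is exactly the VPTERNLOG truth table for that expression.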
4762
4763 // Some of the inputs may be inverted, peek through them and invert the
4764 // magic values accordingly.
4765 // TODO: There may be a bitcast before the xor that we should peek through.
4766 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4767 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4768 ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
4769 Magic = ~Magic;
4770 Parent = Op.getNode();
4771 Op = Op.getOperand(i: 0);
4772 }
4773 };
4774
4775 PeekThroughNot(A, ParentA, TernlogMagicA);
4776 PeekThroughNot(B, ParentB, TernlogMagicB);
4777 PeekThroughNot(C, ParentC, TernlogMagicC);
4778
4779 uint8_t Imm;
4780 switch (FoldableOp.getOpcode()) {
4781 default: llvm_unreachable("Unexpected opcode!");
4782 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4783 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4784 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4785 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4786 }
4787
4788 switch (N->getOpcode()) {
4789 default: llvm_unreachable("Unexpected opcode!");
4790 case X86ISD::ANDNP:
4791 if (A == N0)
4792 Imm &= ~TernlogMagicA;
4793 else
4794 Imm = ~(Imm) & TernlogMagicA;
4795 break;
4796 case ISD::AND: Imm &= TernlogMagicA; break;
4797 case ISD::OR: Imm |= TernlogMagicA; break;
4798 case ISD::XOR: Imm ^= TernlogMagicA; break;
4799 }
4800
4801 return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm);
4802}
4803
4804/// If the high bits of an 'and' operand are known zero, try setting the
4805/// high bits of an 'and' constant operand to produce a smaller encoding by
4806/// creating a small, sign-extended negative immediate rather than a large
4807/// positive one. This reverses a transform in SimplifyDemandedBits that
4808/// shrinks mask constants by clearing bits. There is also a possibility that
4809/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4810/// case, just replace the 'and'. Return 'true' if the node is replaced.
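/// For example, if the sign bit of the other operand is known zero, a 32-bit
/// mask of 0x7ffffff0 (4-byte immediate) can be widened to 0xfffffff0 (-16),
/// which sign-extends from a 1-byte immediate.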
4811bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4812 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4813 // have immediate operands.
4814 MVT VT = And->getSimpleValueType(ResNo: 0);
4815 if (VT != MVT::i32 && VT != MVT::i64)
4816 return false;
4817
4818 auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
4819 if (!And1C)
4820 return false;
4821
4822 // Bail out if the mask constant is already negative. It can't shrink more.
4823 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4824 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4825 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4826 // are negative too.
4827 APInt MaskVal = And1C->getAPIntValue();
4828 unsigned MaskLZ = MaskVal.countl_zero();
4829 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4830 return false;
4831
4832 // Don't extend into the upper 32 bits of a 64 bit mask.
4833 if (VT == MVT::i64 && MaskLZ >= 32) {
4834 MaskLZ -= 32;
4835 MaskVal = MaskVal.trunc(width: 32);
4836 }
4837
4838 SDValue And0 = And->getOperand(Num: 0);
4839 APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
4840 APInt NegMaskVal = MaskVal | HighZeros;
4841
4842 // If a negative constant would not allow a smaller encoding, there's no need
4843 // to continue. Only change the constant when we know it's a win.
4844 unsigned MinWidth = NegMaskVal.getSignificantBits();
4845 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4846 return false;
4847
4848 // Extend masks if we truncated above.
4849 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4850 NegMaskVal = NegMaskVal.zext(width: 64);
4851 HighZeros = HighZeros.zext(width: 64);
4852 }
4853
4854 // The variable operand must be all zeros in the top bits to allow using the
4855 // new, negative constant as the mask.
4856 // TODO: Handle constant folding?
4857 KnownBits Known0 = CurDAG->computeKnownBits(Op: And0);
4858 if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero))
4859 return false;
4860
4861 // Check if the mask is -1. In that case, this is an unnecessary instruction
4862 // that escaped earlier analysis.
4863 if (NegMaskVal.isAllOnes()) {
4864 ReplaceNode(F: And, T: And0.getNode());
4865 return true;
4866 }
4867
4868 // A negative mask allows a smaller encoding. Create a new 'and' node.
4869 SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
4870 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
4871 SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
4872 ReplaceNode(F: And, T: NewAnd.getNode());
4873 SelectCode(N: NewAnd.getNode());
4874 return true;
4875}
4876
4877static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4878 bool FoldedBCast, bool Masked) {
4879#define VPTESTM_CASE(VT, SUFFIX) \
4880case MVT::VT: \
4881 if (Masked) \
4882 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4883 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4884
4885
4886#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4887default: llvm_unreachable("Unexpected VT!"); \
4888VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4889VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4890VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4891VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4892VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4893VPTESTM_CASE(v8i64, QZ##SUFFIX)
4894
4895#define VPTESTM_FULL_CASES(SUFFIX) \
4896VPTESTM_BROADCAST_CASES(SUFFIX) \
4897VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4898VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4899VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4900VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4901VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4902VPTESTM_CASE(v32i16, WZ##SUFFIX)
4903
4904 if (FoldedBCast) {
4905 switch (TestVT.SimpleTy) {
4906 VPTESTM_BROADCAST_CASES(rmb)
4907 }
4908 }
4909
4910 if (FoldedLoad) {
4911 switch (TestVT.SimpleTy) {
4912 VPTESTM_FULL_CASES(rm)
4913 }
4914 }
4915
4916 switch (TestVT.SimpleTy) {
4917 VPTESTM_FULL_CASES(rr)
4918 }
4919
4920#undef VPTESTM_FULL_CASES
4921#undef VPTESTM_BROADCAST_CASES
4922#undef VPTESTM_CASE
4923}
4924
4925 // Try to create a VPTESTM instruction. If InMask is not null, it will be
4926 // used to form a masked operation.
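// VPTESTM sets mask bit i when (Src0[i] & Src1[i]) != 0 and VPTESTNM sets it
// when the AND is zero, so a SETNE/SETEQ compare of an AND against zero maps
// directly onto these instructions.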
4927bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4928 SDValue InMask) {
4929 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4930 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4931 "Unexpected VT!");
4932
4933 // Look for equal and not equal compares.
4934 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
4935 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4936 return false;
4937
4938 SDValue SetccOp0 = Setcc.getOperand(i: 0);
4939 SDValue SetccOp1 = Setcc.getOperand(i: 1);
4940
4941 // Canonicalize the all zero vector to the RHS.
4942 if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
4943 std::swap(a&: SetccOp0, b&: SetccOp1);
4944
4945 // See if we're comparing against zero.
4946 if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
4947 return false;
4948
4949 SDValue N0 = SetccOp0;
4950
4951 MVT CmpVT = N0.getSimpleValueType();
4952 MVT CmpSVT = CmpVT.getVectorElementType();
4953
4954 // Start with both operands the same. We'll try to refine this.
4955 SDValue Src0 = N0;
4956 SDValue Src1 = N0;
4957
4958 {
4959 // Look through single use bitcasts.
4960 SDValue N0Temp = N0;
4961 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4962 N0Temp = N0.getOperand(i: 0);
4963
4964 // Look for single use AND.
4965 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4966 Src0 = N0Temp.getOperand(i: 0);
4967 Src1 = N0Temp.getOperand(i: 1);
4968 }
4969 }
4970
4971 // Without VLX we need to widen the operation.
4972 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4973
4974 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4975 SDValue &Base, SDValue &Scale, SDValue &Index,
4976 SDValue &Disp, SDValue &Segment) {
4977 // If we need to widen, we can't fold the load.
4978 if (!Widen)
4979 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4980 return true;
4981
4982 // If we didn't fold a load, try to match a broadcast. Widening is not a
4983 // limitation here, but only 32 and 64 bit element types are supported.
4984 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4985 return false;
4986
4987 // Look through single use bitcasts.
4988 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4989 P = L.getNode();
4990 L = L.getOperand(i: 0);
4991 }
4992
4993 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4994 return false;
4995
4996 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4997 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4998 return false;
4999
5000 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
5001 };
5002
5003 // We can only fold loads if the sources are unique.
5004 bool CanFoldLoads = Src0 != Src1;
5005
5006 bool FoldedLoad = false;
5007 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5008 if (CanFoldLoads) {
5009 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5010 Tmp3, Tmp4);
5011 if (!FoldedLoad) {
5012 // And is commutative.
5013 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5014 Tmp2, Tmp3, Tmp4);
5015 if (FoldedLoad)
5016 std::swap(a&: Src0, b&: Src1);
5017 }
5018 }
5019
5020 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5021
5022 bool IsMasked = InMask.getNode() != nullptr;
5023
5024 SDLoc dl(Root);
5025
5026 MVT ResVT = Setcc.getSimpleValueType();
5027 MVT MaskVT = ResVT;
5028 if (Widen) {
5029 // Widen the inputs using insert_subreg or copy_to_regclass.
5030 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5031 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5032 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5033 CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
5034 MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts);
5035 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl,
5036 VT: CmpVT), 0);
5037 Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);
5038
5039 if (!FoldedBCast)
5040 Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);
5041
5042 if (IsMasked) {
5043 // Widen the mask.
5044 unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
5045 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5046 InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5047 dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
5048 }
5049 }
5050
5051 bool IsTestN = CC == ISD::SETEQ;
5052 unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5053 Masked: IsMasked);
5054
5055 MachineSDNode *CNode;
5056 if (FoldedLoad) {
5057 SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other);
5058
5059 if (IsMasked) {
5060 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5061 Src1.getOperand(i: 0) };
5062 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5063 } else {
5064 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5065 Src1.getOperand(i: 0) };
5066 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5067 }
5068
5069 // Update the chain.
5070 ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
5071 // Record the mem-refs
5072 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
5073 } else {
5074 if (IsMasked)
5075 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
5076 else
5077 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
5078 }
5079
5080 // If we widened, we need to shrink the mask VT.
5081 if (Widen) {
5082 unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
5083 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5084 CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5085 dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
5086 }
5087
5088 ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
5089 CurDAG->RemoveDeadNode(N: Root);
5090 return true;
5091}
5092
5093// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5094// into vpternlog.
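// Evaluating (A & B) | (~A & C) on the canonical truth tables A=0xf0, B=0xcc,
// C=0xaa gives (0xf0 & 0xcc) | (0x0f & 0xaa) = 0xca, the immediate used below.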
5095bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5096 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5097
5098 MVT NVT = N->getSimpleValueType(ResNo: 0);
5099
5100 // Make sure we support VPTERNLOG.
5101 if (!NVT.isVector() || !Subtarget->hasAVX512())
5102 return false;
5103
5104 // We need VLX for 128/256-bit.
5105 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5106 return false;
5107
5108 SDValue N0 = N->getOperand(Num: 0);
5109 SDValue N1 = N->getOperand(Num: 1);
5110
5111 // Canonicalize AND to LHS.
5112 if (N1.getOpcode() == ISD::AND)
5113 std::swap(a&: N0, b&: N1);
5114
5115 if (N0.getOpcode() != ISD::AND ||
5116 N1.getOpcode() != X86ISD::ANDNP ||
5117 !N0.hasOneUse() || !N1.hasOneUse())
5118 return false;
5119
5120 // ANDN is not commutative, so use it to pin down A and C.
5121 SDValue A = N1.getOperand(i: 0);
5122 SDValue C = N1.getOperand(i: 1);
5123
5124 // AND is commutative; if one operand matches A, the other operand is B.
5125 // Otherwise this isn't a match.
5126 SDValue B;
5127 if (N0.getOperand(i: 0) == A)
5128 B = N0.getOperand(i: 1);
5129 else if (N0.getOperand(i: 1) == A)
5130 B = N0.getOperand(i: 0);
5131 else
5132 return false;
5133
5134 SDLoc dl(N);
5135 SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8);
5136 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5137 ReplaceNode(F: N, T: Ternlog.getNode());
5138
5139 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5140 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5141}
5142
5143void X86DAGToDAGISel::Select(SDNode *Node) {
5144 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5145 unsigned Opcode = Node->getOpcode();
5146 SDLoc dl(Node);
5147
5148 if (Node->isMachineOpcode()) {
5149 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5150 Node->setNodeId(-1);
5151 return; // Already selected.
5152 }
5153
5154 switch (Opcode) {
5155 default: break;
5156 case ISD::INTRINSIC_W_CHAIN: {
5157 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5158 switch (IntNo) {
5159 default: break;
5160 case Intrinsic::x86_encodekey128:
5161 case Intrinsic::x86_encodekey256: {
5162 if (!Subtarget->hasKL())
5163 break;
5164
5165 unsigned Opcode;
5166 switch (IntNo) {
5167 default: llvm_unreachable("Impossible intrinsic");
5168 case Intrinsic::x86_encodekey128:
5169 Opcode = X86::ENCODEKEY128;
5170 break;
5171 case Intrinsic::x86_encodekey256:
5172 Opcode = X86::ENCODEKEY256;
5173 break;
5174 }
5175
5176 SDValue Chain = Node->getOperand(Num: 0);
5177 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3),
5178 Glue: SDValue());
5179 if (Opcode == X86::ENCODEKEY256)
5180 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4),
5181 Glue: Chain.getValue(R: 1));
5182
5183 MachineSDNode *Res = CurDAG->getMachineNode(
5184 Opcode, dl, VTs: Node->getVTList(),
5185 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5186 ReplaceNode(F: Node, T: Res);
5187 return;
5188 }
5189 case Intrinsic::x86_tileloaddrs64_internal:
5190 case Intrinsic::x86_tileloaddrst164_internal:
5191 if (!Subtarget->hasAMXMOVRS())
5192 break;
5193 [[fallthrough]];
5194 case Intrinsic::x86_tileloadd64_internal:
5195 case Intrinsic::x86_tileloaddt164_internal: {
5196 if (!Subtarget->hasAMXTILE())
5197 break;
5198 auto *MFI =
5199 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5200 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5201 unsigned Opc;
5202 switch (IntNo) {
5203 default:
5204 llvm_unreachable("Unexpected intrinsic!");
5205 case Intrinsic::x86_tileloaddrs64_internal:
5206 Opc = X86::PTILELOADDRSV;
5207 break;
5208 case Intrinsic::x86_tileloaddrst164_internal:
5209 Opc = X86::PTILELOADDRST1V;
5210 break;
5211 case Intrinsic::x86_tileloadd64_internal:
5212 Opc = X86::PTILELOADDV;
5213 break;
5214 case Intrinsic::x86_tileloaddt164_internal:
5215 Opc = X86::PTILELOADDT1V;
5216 break;
5217 }
5218 // _tile_loadd_internal(row, col, buf, STRIDE)
5219 SDValue Base = Node->getOperand(Num: 4);
5220 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5221 SDValue Index = Node->getOperand(Num: 5);
5222 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5223 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5224 SDValue Chain = Node->getOperand(Num: 0);
5225 MachineSDNode *CNode;
5226 SDValue Ops[] = {Node->getOperand(Num: 2),
5227 Node->getOperand(Num: 3),
5228 Base,
5229 Scale,
5230 Index,
5231 Disp,
5232 Segment,
5233 Chain};
5234 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops);
5235 ReplaceNode(F: Node, T: CNode);
5236 return;
5237 }
5238 }
5239 break;
5240 }
5241 case ISD::INTRINSIC_VOID: {
5242 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5243 switch (IntNo) {
5244 default: break;
5245 case Intrinsic::x86_sse3_monitor:
5246 case Intrinsic::x86_monitorx:
5247 case Intrinsic::x86_clzero: {
5248 bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64;
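      // MONITOR/MONITORX take the address implicitly in EAX/RAX and their
      // hint operands in ECX and EDX; CLZERO takes its address in EAX/RAX.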
5249
5250 unsigned Opc = 0;
5251 switch (IntNo) {
5252 default: llvm_unreachable("Unexpected intrinsic!");
5253 case Intrinsic::x86_sse3_monitor:
5254 if (!Subtarget->hasSSE3())
5255 break;
5256 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5257 break;
5258 case Intrinsic::x86_monitorx:
5259 if (!Subtarget->hasMWAITX())
5260 break;
5261 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5262 break;
5263 case Intrinsic::x86_clzero:
5264 if (!Subtarget->hasCLZERO())
5265 break;
5266 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5267 break;
5268 }
5269
5270 if (Opc) {
5271 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5272 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5273 N: Node->getOperand(Num: 2), Glue: SDValue());
5274 SDValue InGlue = Chain.getValue(R: 1);
5275
5276 if (IntNo == Intrinsic::x86_sse3_monitor ||
5277 IntNo == Intrinsic::x86_monitorx) {
5278 // Copy the other two operands to ECX and EDX.
5279 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3),
5280 Glue: InGlue);
5281 InGlue = Chain.getValue(R: 1);
5282 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4),
5283 Glue: InGlue);
5284 InGlue = Chain.getValue(R: 1);
5285 }
5286
5287 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other,
5288 Ops: { Chain, InGlue});
5289 ReplaceNode(F: Node, T: CNode);
5290 return;
5291 }
5292
5293 break;
5294 }
5295 case Intrinsic::x86_tilestored64_internal: {
5296 auto *MFI =
5297 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5298 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5299 unsigned Opc = X86::PTILESTOREDV;
5300 // _tile_stored_internal(row, col, buf, STRIDE, c)
5301 SDValue Base = Node->getOperand(Num: 4);
5302 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5303 SDValue Index = Node->getOperand(Num: 5);
5304 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5305 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5306 SDValue Chain = Node->getOperand(Num: 0);
5307 MachineSDNode *CNode;
5308 SDValue Ops[] = {Node->getOperand(Num: 2),
5309 Node->getOperand(Num: 3),
5310 Base,
5311 Scale,
5312 Index,
5313 Disp,
5314 Segment,
5315 Node->getOperand(Num: 6),
5316 Chain};
5317 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5318 ReplaceNode(F: Node, T: CNode);
5319 return;
5320 }
5321 case Intrinsic::x86_tileloaddrs64:
5322 case Intrinsic::x86_tileloaddrst164:
5323 if (!Subtarget->hasAMXMOVRS())
5324 break;
5325 [[fallthrough]];
5326 case Intrinsic::x86_tileloadd64:
5327 case Intrinsic::x86_tileloaddt164:
5328 case Intrinsic::x86_tilestored64: {
5329 if (!Subtarget->hasAMXTILE())
5330 break;
5331 auto *MFI =
5332 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5333 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5334 unsigned Opc;
5335 switch (IntNo) {
5336 default: llvm_unreachable("Unexpected intrinsic!");
5337 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5338 case Intrinsic::x86_tileloaddrs64:
5339 Opc = X86::PTILELOADDRS;
5340 break;
5341 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5342 case Intrinsic::x86_tileloaddrst164:
5343 Opc = X86::PTILELOADDRST1;
5344 break;
5345 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5346 }
5347 // FIXME: Match displacement and scale.
5348 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5349 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5350 SDValue Base = Node->getOperand(Num: 3);
5351 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5352 SDValue Index = Node->getOperand(Num: 4);
5353 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5354 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5355 SDValue Chain = Node->getOperand(Num: 0);
5356 MachineSDNode *CNode;
5357 if (Opc == X86::PTILESTORED) {
5358 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5359 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5360 } else {
5361 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5362 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5363 }
5364 ReplaceNode(F: Node, T: CNode);
5365 return;
5366 }
5367 case Intrinsic::x86_t2rpntlvwz0rs:
5368 case Intrinsic::x86_t2rpntlvwz0rst1:
5369 case Intrinsic::x86_t2rpntlvwz1rs:
5370 case Intrinsic::x86_t2rpntlvwz1rst1:
5371 if (!Subtarget->hasAMXMOVRS())
5372 break;
5373 [[fallthrough]];
5374 case Intrinsic::x86_t2rpntlvwz0:
5375 case Intrinsic::x86_t2rpntlvwz0t1:
5376 case Intrinsic::x86_t2rpntlvwz1:
5377 case Intrinsic::x86_t2rpntlvwz1t1: {
5378 if (!Subtarget->hasAMXTRANSPOSE())
5379 break;
5380 auto *MFI =
5381 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5382 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5383 unsigned Opc;
5384 switch (IntNo) {
5385 default:
5386 llvm_unreachable("Unexpected intrinsic!");
5387 case Intrinsic::x86_t2rpntlvwz0:
5388 Opc = X86::PT2RPNTLVWZ0;
5389 break;
5390 case Intrinsic::x86_t2rpntlvwz0t1:
5391 Opc = X86::PT2RPNTLVWZ0T1;
5392 break;
5393 case Intrinsic::x86_t2rpntlvwz1:
5394 Opc = X86::PT2RPNTLVWZ1;
5395 break;
5396 case Intrinsic::x86_t2rpntlvwz1t1:
5397 Opc = X86::PT2RPNTLVWZ1T1;
5398 break;
5399 case Intrinsic::x86_t2rpntlvwz0rs:
5400 Opc = X86::PT2RPNTLVWZ0RS;
5401 break;
5402 case Intrinsic::x86_t2rpntlvwz0rst1:
5403 Opc = X86::PT2RPNTLVWZ0RST1;
5404 break;
5405 case Intrinsic::x86_t2rpntlvwz1rs:
5406 Opc = X86::PT2RPNTLVWZ1RS;
5407 break;
5408 case Intrinsic::x86_t2rpntlvwz1rst1:
5409 Opc = X86::PT2RPNTLVWZ1RST1;
5410 break;
5411 }
5412 // FIXME: Match displacement and scale.
5413 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5414 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5415 SDValue Base = Node->getOperand(Num: 3);
5416 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5417 SDValue Index = Node->getOperand(Num: 4);
5418 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5419 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5420 SDValue Chain = Node->getOperand(Num: 0);
5421 SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5422 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5423 ReplaceNode(F: Node, T: CNode);
5424 return;
5425 }
5426 }
5427 break;
5428 }
5429 case ISD::BRIND:
5430 case X86ISD::NT_BRIND: {
5431 if (Subtarget->isTargetNaCl())
5432 // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
5433 // leave the instruction alone.
5434 break;
5435 if (Subtarget->isTarget64BitILP32()) {
5436 // Converts a 32-bit register to a 64-bit, zero-extended version of
5437 // it. This is needed because x86-64 can do many things, but jmp %r32
5438 // ain't one of them.
5439 SDValue Target = Node->getOperand(Num: 1);
5440 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5441 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64);
5442 SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other,
5443 N1: Node->getOperand(Num: 0), N2: ZextTarget);
5444 ReplaceNode(F: Node, T: Brind.getNode());
5445 SelectCode(N: ZextTarget.getNode());
5446 SelectCode(N: Brind.getNode());
5447 return;
5448 }
5449 break;
5450 }
5451 case X86ISD::GlobalBaseReg:
5452 ReplaceNode(F: Node, T: getGlobalBaseReg());
5453 return;
5454
5455 case ISD::BITCAST:
5456 // Just drop all 128/256/512-bit bitcasts.
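    // They are no-ops at the machine level: the value lives in the same
    // vector register either way.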
5457 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5458 NVT == MVT::f128) {
5459 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5460 CurDAG->RemoveDeadNode(N: Node);
5461 return;
5462 }
5463 break;
5464
5465 case ISD::SRL:
5466 if (matchBitExtract(Node))
5467 return;
5468 [[fallthrough]];
5469 case ISD::SRA:
5470 case ISD::SHL:
5471 if (tryShiftAmountMod(N: Node))
5472 return;
5473 break;
5474
5475 case X86ISD::VPTERNLOG: {
5476 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5477 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5478 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5479 return;
5480 break;
5481 }
5482
5483 case X86ISD::ANDNP:
5484 if (tryVPTERNLOG(N: Node))
5485 return;
5486 break;
5487
5488 case ISD::AND:
5489 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5490 // Try to form a masked VPTESTM. Operands can be in either order.
5491 SDValue N0 = Node->getOperand(Num: 0);
5492 SDValue N1 = Node->getOperand(Num: 1);
5493 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5494 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5495 return;
5496 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5497 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5498 return;
5499 }
5500
5501 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5502 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5503 CurDAG->RemoveDeadNode(N: Node);
5504 return;
5505 }
5506 if (matchBitExtract(Node))
5507 return;
5508 if (AndImmShrink && shrinkAndImmediate(And: Node))
5509 return;
5510
5511 [[fallthrough]];
5512 case ISD::OR:
5513 case ISD::XOR:
5514 if (tryShrinkShlLogicImm(N: Node))
5515 return;
5516 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5517 return;
5518 if (tryVPTERNLOG(N: Node))
5519 return;
5520
5521 [[fallthrough]];
5522 case ISD::ADD:
5523 if (Opcode == ISD::ADD && matchBitExtract(Node))
5524 return;
5525 [[fallthrough]];
5526 case ISD::SUB: {
5527 // Try to avoid folding immediates with multiple uses for optsize.
5528 // This code tries to select to register form directly to avoid going
5529 // through the isel table which might fold the immediate. We can't change
5530 // the add/sub/and/or/xor with-immediate patterns in the tablegen files
5531 // to check the immediate use count without making the patterns
5532 // unavailable to the fast-isel table.
5533 if (!CurDAG->shouldOptForSize())
5534 break;
5535
5536 // Only handle i8/i16/i32/i64.
5537 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5538 break;
5539
5540 SDValue N0 = Node->getOperand(Num: 0);
5541 SDValue N1 = Node->getOperand(Num: 1);
5542
5543 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5544 if (!Cst)
5545 break;
5546
5547 int64_t Val = Cst->getSExtValue();
5548
5549 // Make sure it's an immediate that is considered foldable.
5550 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5551 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5552 break;
5553
5554 // If this can match to INC/DEC, let it go.
5555 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5556 break;
5557
5558 // Check if we should avoid folding this immediate.
5559 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5560 break;
5561
5562 // We should not fold the immediate. So we need a register form instead.
5563 unsigned ROpc, MOpc;
5564 switch (NVT.SimpleTy) {
5565 default: llvm_unreachable("Unexpected VT!");
5566 case MVT::i8:
5567 switch (Opcode) {
5568 default: llvm_unreachable("Unexpected opcode!");
5569 case ISD::ADD:
5570 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5571 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5572 break;
5573 case ISD::SUB:
5574 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5575 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5576 break;
5577 case ISD::AND:
5578 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5579 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5580 break;
5581 case ISD::OR:
5582 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5583 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5584 break;
5585 case ISD::XOR:
5586 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5587 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5588 break;
5589 }
5590 break;
5591 case MVT::i16:
5592 switch (Opcode) {
5593 default: llvm_unreachable("Unexpected opcode!");
5594 case ISD::ADD:
5595 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5596 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5597 break;
5598 case ISD::SUB:
5599 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5600 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5601 break;
5602 case ISD::AND:
5603 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5604 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5605 break;
5606 case ISD::OR:
5607 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5608 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5609 break;
5610 case ISD::XOR:
5611 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5612 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5613 break;
5614 }
5615 break;
5616 case MVT::i32:
5617 switch (Opcode) {
5618 default: llvm_unreachable("Unexpected opcode!");
5619 case ISD::ADD:
5620 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5621 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5622 break;
5623 case ISD::SUB:
5624 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5625 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5626 break;
5627 case ISD::AND:
5628 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5629 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5630 break;
5631 case ISD::OR:
5632 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5633 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5634 break;
5635 case ISD::XOR:
5636 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5637 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5638 break;
5639 }
5640 break;
5641 case MVT::i64:
5642 switch (Opcode) {
5643 default: llvm_unreachable("Unexpected opcode!");
5644 case ISD::ADD:
5645 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5646 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5647 break;
5648 case ISD::SUB:
5649 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5650 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5651 break;
5652 case ISD::AND:
5653 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5654 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5655 break;
5656 case ISD::OR:
5657 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5658 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5659 break;
5660 case ISD::XOR:
5661 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5662 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5663 break;
5664 }
5665 break;
5666 }
5667
5668 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5669
5670 // If this is not a subtract, we can still try to fold a load.
5671 if (Opcode != ISD::SUB) {
5672 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5673 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5674 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5675 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5676 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5677 // Update the chain.
5678 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5679 // Record the mem-refs
5680 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5681 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5682 CurDAG->RemoveDeadNode(N: Node);
5683 return;
5684 }
5685 }
5686
5687 CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1);
5688 return;
5689 }
5690
5691 case X86ISD::SMUL:
5692 // i16/i32/i64 are handled with isel patterns.
5693 if (NVT != MVT::i8)
5694 break;
5695 [[fallthrough]];
5696 case X86ISD::UMUL: {
5697 SDValue N0 = Node->getOperand(Num: 0);
5698 SDValue N1 = Node->getOperand(Num: 1);
5699
5700 unsigned LoReg, ROpc, MOpc;
5701 switch (NVT.SimpleTy) {
5702 default: llvm_unreachable("Unsupported VT!");
5703 case MVT::i8:
5704 LoReg = X86::AL;
5705 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5706 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5707 break;
5708 case MVT::i16:
5709 LoReg = X86::AX;
5710 ROpc = X86::MUL16r;
5711 MOpc = X86::MUL16m;
5712 break;
5713 case MVT::i32:
5714 LoReg = X86::EAX;
5715 ROpc = X86::MUL32r;
5716 MOpc = X86::MUL32m;
5717 break;
5718 case MVT::i64:
5719 LoReg = X86::RAX;
5720 ROpc = X86::MUL64r;
5721 MOpc = X86::MUL64m;
5722 break;
5723 }
5724
5725 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5726 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5727 // Multiply is commutative.
5728 if (!FoldedLoad) {
5729 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5730 if (FoldedLoad)
5731 std::swap(a&: N0, b&: N1);
5732 }
5733
5734 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5735 N: N0, Glue: SDValue()).getValue(R: 1);
5736
5737 MachineSDNode *CNode;
5738 if (FoldedLoad) {
5739 // i16/i32/i64 use an instruction that produces a low and high result even
5740 // though only the low result is used.
5741 SDVTList VTs;
5742 if (NVT == MVT::i8)
5743 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5744 else
5745 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other);
5746
5747 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5748 InGlue };
5749 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5750
5751 // Update the chain.
5752 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5753 // Record the mem-refs
5754 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5755 } else {
5756 // i16/i32/i64 use an instruction that produces a low and high result even
5757 // though only the low result is used.
5758 SDVTList VTs;
5759 if (NVT == MVT::i8)
5760 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32);
5761 else
5762 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32);
5763
5764 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5765 }
5766
5767 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5768 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5769 CurDAG->RemoveDeadNode(N: Node);
5770 return;
5771 }
5772
5773 case ISD::SMUL_LOHI:
5774 case ISD::UMUL_LOHI: {
5775 SDValue N0 = Node->getOperand(Num: 0);
5776 SDValue N1 = Node->getOperand(Num: 1);
5777
5778 unsigned Opc, MOpc;
5779 unsigned LoReg, HiReg;
5780 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5781 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5782 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5783 switch (NVT.SimpleTy) {
5784 default: llvm_unreachable("Unsupported VT!");
5785 case MVT::i32:
5786 Opc = UseMULXHi ? X86::MULX32Hrr
5787 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5788 : IsSigned ? X86::IMUL32r
5789 : X86::MUL32r;
5790 MOpc = UseMULXHi ? X86::MULX32Hrm
5791 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5792 : IsSigned ? X86::IMUL32m
5793 : X86::MUL32m;
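      // MULX reads its implicit multiplicand from EDX/RDX instead of EAX/RAX.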
5794 LoReg = UseMULX ? X86::EDX : X86::EAX;
5795 HiReg = X86::EDX;
5796 break;
5797 case MVT::i64:
5798 Opc = UseMULXHi ? X86::MULX64Hrr
5799 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5800 : IsSigned ? X86::IMUL64r
5801 : X86::MUL64r;
5802 MOpc = UseMULXHi ? X86::MULX64Hrm
5803 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5804 : IsSigned ? X86::IMUL64m
5805 : X86::MUL64m;
5806 LoReg = UseMULX ? X86::RDX : X86::RAX;
5807 HiReg = X86::RDX;
5808 break;
5809 }
5810
5811 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5812 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5813 // Multiply is commutative.
5814 if (!foldedLoad) {
5815 foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5816 if (foldedLoad)
5817 std::swap(a&: N0, b&: N1);
5818 }
5819
5820 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5821 N: N0, Glue: SDValue()).getValue(R: 1);
5822 SDValue ResHi, ResLo;
5823 if (foldedLoad) {
5824 SDValue Chain;
5825 MachineSDNode *CNode = nullptr;
5826 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5827 InGlue };
5828 if (UseMULXHi) {
5829 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
5830 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5831 ResHi = SDValue(CNode, 0);
5832 Chain = SDValue(CNode, 1);
5833 } else if (UseMULX) {
5834 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other);
5835 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5836 ResHi = SDValue(CNode, 0);
5837 ResLo = SDValue(CNode, 1);
5838 Chain = SDValue(CNode, 2);
5839 } else {
5840 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
5841 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5842 Chain = SDValue(CNode, 0);
5843 InGlue = SDValue(CNode, 1);
5844 }
5845
5846 // Update the chain.
5847 ReplaceUses(F: N1.getValue(R: 1), T: Chain);
5848 // Record the mem-refs
5849 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5850 } else {
5851 SDValue Ops[] = { N1, InGlue };
5852 if (UseMULXHi) {
5853 SDVTList VTs = CurDAG->getVTList(VT: NVT);
5854 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5855 ResHi = SDValue(CNode, 0);
5856 } else if (UseMULX) {
5857 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
5858 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5859 ResHi = SDValue(CNode, 0);
5860 ResLo = SDValue(CNode, 1);
5861 } else {
5862 SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue);
5863 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5864 InGlue = SDValue(CNode, 0);
5865 }
5866 }
5867
5868 // Copy the low half of the result, if it is needed.
5869 if (!SDValue(Node, 0).use_empty()) {
5870 if (!ResLo) {
5871 assert(LoReg && "Register for low half is not defined!");
5872 ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5873 VT: NVT, Glue: InGlue);
5874 InGlue = ResLo.getValue(R: 2);
5875 }
5876 ReplaceUses(F: SDValue(Node, 0), T: ResLo);
5877 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5878 dbgs() << '\n');
5879 }
5880 // Copy the high half of the result, if it is needed.
5881 if (!SDValue(Node, 1).use_empty()) {
5882 if (!ResHi) {
5883 assert(HiReg && "Register for high half is not defined!");
5884 ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
5885 VT: NVT, Glue: InGlue);
5886 InGlue = ResHi.getValue(R: 2);
5887 }
5888 ReplaceUses(F: SDValue(Node, 1), T: ResHi);
5889 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5890 dbgs() << '\n');
5891 }
5892
5893 CurDAG->RemoveDeadNode(N: Node);
5894 return;
5895 }
5896
5897 case ISD::SDIVREM:
5898 case ISD::UDIVREM: {
5899 SDValue N0 = Node->getOperand(Num: 0);
5900 SDValue N1 = Node->getOperand(Num: 1);
5901
5902 unsigned ROpc, MOpc;
5903 bool isSigned = Opcode == ISD::SDIVREM;
5904 if (!isSigned) {
5905 switch (NVT.SimpleTy) {
5906 default: llvm_unreachable("Unsupported VT!");
5907 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5908 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5909 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5910 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5911 }
5912 } else {
5913 switch (NVT.SimpleTy) {
5914 default: llvm_unreachable("Unsupported VT!");
5915 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5916 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5917 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5918 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5919 }
5920 }
5921
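    // x86 division takes the dividend in AX / DX:AX / EDX:EAX / RDX:RAX and
    // returns the quotient in the low register and the remainder in the high
    // one, so the inputs must be moved into those registers first.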
5922 unsigned LoReg, HiReg, ClrReg;
5923 unsigned SExtOpcode;
5924 switch (NVT.SimpleTy) {
5925 default: llvm_unreachable("Unsupported VT!");
5926 case MVT::i8:
5927 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5928 SExtOpcode = 0; // Not used.
5929 break;
5930 case MVT::i16:
5931 LoReg = X86::AX; HiReg = X86::DX;
5932 ClrReg = X86::DX;
5933 SExtOpcode = X86::CWD;
5934 break;
5935 case MVT::i32:
5936 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5937 SExtOpcode = X86::CDQ;
5938 break;
5939 case MVT::i64:
5940 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5941 SExtOpcode = X86::CQO;
5942 break;
5943 }
5944
5945 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5946 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5947 bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);
5948
5949 SDValue InGlue;
5950 if (NVT == MVT::i8) {
5951 // Special case for div8, just use a move with zero extension to AX to
5952 // clear the upper 8 bits (AH).
5953 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5954 MachineSDNode *Move;
5955 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5956 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5957 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5958 : X86::MOVZX16rm8;
5959 Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops);
5960 Chain = SDValue(Move, 1);
5961 ReplaceUses(F: N0.getValue(R: 1), T: Chain);
5962 // Record the mem-refs
5963 CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5964 } else {
5965 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5966 : X86::MOVZX16rr8;
5967 Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0);
5968 Chain = CurDAG->getEntryNode();
5969 }
5970 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0),
5971 Glue: SDValue());
5972 InGlue = Chain.getValue(R: 1);
5973 } else {
5974 InGlue =
5975 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
5976 Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
5977 if (isSigned && !signBitIsZero) {
5978 // Sign extend the low part into the high part.
5979 InGlue =
5980 SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0);
5981 } else {
5982 // Zero out the high part, effectively zero extending the input.
5983 SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
5984 SDValue ClrNode =
5985 SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
5986 switch (NVT.SimpleTy) {
5987 case MVT::i16:
5988 ClrNode =
5989 SDValue(CurDAG->getMachineNode(
5990 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode,
5991 Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl,
5992 VT: MVT::i32)),
5993 0);
5994 break;
5995 case MVT::i32:
5996 break;
5997 case MVT::i64:
5998 ClrNode =
5999 SDValue(CurDAG->getMachineNode(
6000 Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64,
6001 Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode,
6002 Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl,
6003 VT: MVT::i32)),
6004 0);
6005 break;
6006 default:
6007 llvm_unreachable("Unexpected division source");
6008 }
6009
6010 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
6011 N: ClrNode, Glue: InGlue).getValue(R: 1);
6012 }
6013 }
6014
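    // Emit the divide itself, either the memory form with the folded divisor
    // load or the register form, glued to the register setup above.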
6015 if (foldedLoad) {
6016 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
6017 InGlue };
6018 MachineSDNode *CNode =
6019 CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops);
6020 InGlue = SDValue(CNode, 1);
6021 // Update the chain.
6022 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
6023 // Record the mem-refs
6024 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
6025 } else {
6026 InGlue =
6027 SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0);
6028 }
6029
6030 // Prevent use of AH in a REX instruction by explicitly copying it to
6031 // an ABCD_L register.
6032 //
6033 // The current assumption of the register allocator is that isel
6034 // won't generate explicit references to the GR8_ABCD_H registers. If
6035 // the allocator and/or the backend get enhanced to be more robust in
6036 // that regard, this can be, and should be, removed.
6037 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6038 SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8);
6039 unsigned AHExtOpcode =
6040 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6041
6042 SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32,
6043 VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue);
6044 SDValue Result(RNode, 0);
6045 InGlue = SDValue(RNode, 1);
6046
6047 Result =
6048 CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result);
6049
6050 ReplaceUses(F: SDValue(Node, 1), T: Result);
6051 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6052 dbgs() << '\n');
6053 }
6054 // Copy the division (low) result, if it is needed.
6055 if (!SDValue(Node, 0).use_empty()) {
6056 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
6057 Reg: LoReg, VT: NVT, Glue: InGlue);
6058 InGlue = Result.getValue(R: 2);
6059 ReplaceUses(F: SDValue(Node, 0), T: Result);
6060 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6061 dbgs() << '\n');
6062 }
6063 // Copy the remainder (high) result, if it is needed.
6064 if (!SDValue(Node, 1).use_empty()) {
6065 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
6066 Reg: HiReg, VT: NVT, Glue: InGlue);
6067 InGlue = Result.getValue(R: 2);
6068 ReplaceUses(F: SDValue(Node, 1), T: Result);
6069 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6070 dbgs() << '\n');
6071 }
6072 CurDAG->RemoveDeadNode(N: Node);
6073 return;
6074 }
6075
6076 case X86ISD::FCMP:
6077 case X86ISD::STRICT_FCMP:
6078 case X86ISD::STRICT_FCMPS: {
6079 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6080 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6081 SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
6082 SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);
6083
6084 // Save the original VT of the compare.
6085 MVT CmpVT = N0.getSimpleValueType();
6086
6087 // Floating point needs special handling if we don't have FCOMI.
6088 if (Subtarget->canUseCMOV())
6089 break;
6090
6091 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6092
6093 unsigned Opc;
6094 switch (CmpVT.SimpleTy) {
6095 default: llvm_unreachable("Unexpected type!");
6096 case MVT::f32:
6097 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6098 break;
6099 case MVT::f64:
6100 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6101 break;
6102 case MVT::f80:
6103 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6104 break;
6105 }
6106
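    // Strict compares thread their chain through the compare node; non-strict
    // compares hang off the entry node and only produce glue.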
6107 SDValue Chain =
6108 IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
6109 SDValue Glue;
6110 if (IsStrictCmp) {
6111 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
6112 Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
6113 Glue = Chain.getValue(R: 1);
6114 } else {
6115 Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0);
6116 }
6117
6118 // Move FPSW to AX.
6119 SDValue FNSTSW =
6120 SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0);
6121
6122 // Extract upper 8-bits of AX.
6123 SDValue Extract =
6124 CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW);
6125
6126 // Move AH into flags.
6127 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6128 assert(Subtarget->canUseLAHFSAHF() &&
6129 "Target doesn't support SAHF or FCOMI?");
6130 SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue());
6131 Chain = AH;
6132 SDValue SAHF = SDValue(
6133 CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0);
6134
6135 if (IsStrictCmp)
6136 ReplaceUses(F: SDValue(Node, 1), T: Chain);
6137
6138 ReplaceUses(F: SDValue(Node, 0), T: SAHF);
6139 CurDAG->RemoveDeadNode(N: Node);
6140 return;
6141 }
6142
6143 case X86ISD::CMP: {
6144 SDValue N0 = Node->getOperand(Num: 0);
6145 SDValue N1 = Node->getOperand(Num: 1);
6146
6147 // Optimizations for TEST compares.
6148 if (!isNullConstant(V: N1))
6149 break;
6150
6151 // Save the original VT of the compare.
6152 MVT CmpVT = N0.getSimpleValueType();
6153
6154     // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6155 // by a test instruction. The test should be removed later by
6156 // analyzeCompare if we are using only the zero flag.
6157 // TODO: Should we check the users and use the BEXTR flags directly?
6158 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6159 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
6160 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6161 : X86::TEST32rr;
6162 SDValue BEXTR = SDValue(NewNode, 0);
6163 NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR);
6164 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6165 CurDAG->RemoveDeadNode(N: Node);
6166 return;
6167 }
6168 }
6169
6170 // We can peek through truncates, but we need to be careful below.
6171 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6172 N0 = N0.getOperand(i: 0);
6173
6174 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6175 // use a smaller encoding.
6176 // Look past the truncate if CMP is the only use of it.
6177 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6178 N0.getValueType() != MVT::i8) {
6179 auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
6180 if (!MaskC)
6181 break;
6182
6183 // We may have looked through a truncate so mask off any bits that
6184 // shouldn't be part of the compare.
6185 uint64_t Mask = MaskC->getZExtValue();
6186 Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());
6187
6188 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6189 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6190 // zero flag.
6191 if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) &&
6192 onlyUsesZeroFlag(Flags: SDValue(Node, 0))) {
6193 unsigned ShiftOpcode = ISD::DELETED_NODE;
6194 unsigned ShiftAmt;
6195 unsigned SubRegIdx;
6196 MVT SubRegVT;
6197 unsigned TestOpcode;
6198 unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
6199 unsigned TrailingZeros = llvm::countr_zero(Val: Mask);
6200
6201 // With leading/trailing zeros, the transform is profitable if we can
6202 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6203 // incurring any extra register moves.
6204 bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
6205 if (LeadingZeros == 0 && SavesBytes) {
6206 // If the mask covers the most significant bit, then we can replace
6207 // TEST+AND with a SHR and check eflags.
6208 // This emits a redundant TEST which is subsequently eliminated.
6209 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6210 ShiftAmt = TrailingZeros;
6211 SubRegIdx = 0;
6212 TestOpcode = X86::TEST64rr;
6213 } else if (TrailingZeros == 0 && SavesBytes) {
6214 // If the mask covers the least significant bit, then we can replace
6215 // TEST+AND with a SHL and check eflags.
6216 // This emits a redundant TEST which is subsequently eliminated.
6217 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6218 ShiftAmt = LeadingZeros;
6219 SubRegIdx = 0;
6220 TestOpcode = X86::TEST64rr;
6221 } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
6222 // If the shifted mask extends into the high half and is 8/16/32 bits
6223 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6224 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6225 if (PopCount == 8) {
6226 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6227 ShiftAmt = TrailingZeros;
6228 SubRegIdx = X86::sub_8bit;
6229 SubRegVT = MVT::i8;
6230 TestOpcode = X86::TEST8rr;
6231 } else if (PopCount == 16) {
6232 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6233 ShiftAmt = TrailingZeros;
6234 SubRegIdx = X86::sub_16bit;
6235 SubRegVT = MVT::i16;
6236 TestOpcode = X86::TEST16rr;
6237 } else if (PopCount == 32) {
6238 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6239 ShiftAmt = TrailingZeros;
6240 SubRegIdx = X86::sub_32bit;
6241 SubRegVT = MVT::i32;
6242 TestOpcode = X86::TEST32rr;
6243 }
6244 }
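        // If one of the shift forms above applies, emit the shift, narrow the
        // result to a sub-register when a smaller TEST is used, and test the
        // value against itself.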
6245 if (ShiftOpcode != ISD::DELETED_NODE) {
6246 SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64);
6247 SDValue Shift = SDValue(
6248 CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32,
6249 Op1: N0.getOperand(i: 0), Op2: ShiftC),
6250 0);
6251 if (SubRegIdx != 0) {
6252 Shift =
6253 CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
6254 }
6255 MachineSDNode *Test =
6256 CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift);
6257 ReplaceNode(F: Node, T: Test);
6258 return;
6259 }
6260 }
6261
6262 MVT VT;
6263 int SubRegOp;
6264 unsigned ROpc, MOpc;
6265
6266 // For each of these checks we need to be careful if the sign flag is
6267 // being used. It is only safe to use the sign flag in two conditions,
6268 // either the sign bit in the shrunken mask is zero or the final test
6269 // size is equal to the original compare size.
6270
6271 if (isUInt<8>(x: Mask) &&
6272 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6273 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6274 // For example, convert "testl %eax, $8" to "testb %al, $8"
6275 VT = MVT::i8;
6276 SubRegOp = X86::sub_8bit;
6277 ROpc = X86::TEST8ri;
6278 MOpc = X86::TEST8mi;
6279 } else if (OptForMinSize && isUInt<16>(x: Mask) &&
6280 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6281 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6282 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6283 // NOTE: We only want to form TESTW instructions if optimizing for
6284         // min size. Otherwise we only save one byte and possibly get a
6285         // length-changing prefix penalty in the decoders.
6286 VT = MVT::i16;
6287 SubRegOp = X86::sub_16bit;
6288 ROpc = X86::TEST16ri;
6289 MOpc = X86::TEST16mi;
6290 } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 &&
6291 ((!(Mask & 0x80000000) &&
6292 // Without minsize 16-bit Cmps can get here so we need to
6293 // be sure we calculate the correct sign flag if needed.
6294 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6295 CmpVT == MVT::i32 ||
6296 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6297 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6298 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6299         // Otherwise, we find ourselves in a position where we have to do
6300 // promotion. If previous passes did not promote the and, we assume
6301 // they had a good reason not to and do not promote here.
6302 VT = MVT::i32;
6303 SubRegOp = X86::sub_32bit;
6304 ROpc = X86::TEST32ri;
6305 MOpc = X86::TEST32mi;
6306 } else {
6307 // No eligible transformation was found.
6308 break;
6309 }
6310
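      // Materialize the shrunken mask as the TEST immediate; Reg is the input
      // of the AND (possibly a foldable load).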
6311 SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
6312 SDValue Reg = N0.getOperand(i: 0);
6313
6314 // Emit a testl or testw.
6315 MachineSDNode *NewNode;
6316 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6317 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
6318 if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
6319 if (!LoadN->isSimple()) {
6320 unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
6321 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6322 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6323 (MOpc == X86::TEST32mi && NumVolBits != 32))
6324 break;
6325 }
6326 }
6327 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6328 Reg.getOperand(i: 0) };
6329 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops);
6330 // Update the chain.
6331 ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
6332 // Record the mem-refs
6333 CurDAG->setNodeMemRefs(N: NewNode,
6334 NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
6335 } else {
6336 // Extract the subregister if necessary.
6337 if (N0.getValueType() != VT)
6338 Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);
6339
6340 NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm);
6341 }
6342 // Replace CMP with TEST.
6343 ReplaceNode(F: Node, T: NewNode);
6344 return;
6345 }
6346 break;
6347 }
6348 case X86ISD::PCMPISTR: {
6349 if (!Subtarget->hasSSE42())
6350 break;
6351
6352 bool NeedIndex = !SDValue(Node, 0).use_empty();
6353 bool NeedMask = !SDValue(Node, 1).use_empty();
6354 // We can't fold a load if we are going to make two instructions.
6355 bool MayFoldLoad = !NeedIndex || !NeedMask;
6356
6357 MachineSDNode *CNode;
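    // Emit PCMPISTRM if the mask result is used and PCMPISTRI if the index
    // result is used (or if neither is, so that EFLAGS is still produced).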
6358 if (NeedMask) {
6359 unsigned ROpc =
6360 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6361 unsigned MOpc =
6362 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6363 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node);
6364 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6365 }
6366 if (NeedIndex || !NeedMask) {
6367 unsigned ROpc =
6368 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6369 unsigned MOpc =
6370 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6371 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node);
6372 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6373 }
6374
6375 // Connect the flag usage to the last instruction created.
6376 ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6377 CurDAG->RemoveDeadNode(N: Node);
6378 return;
6379 }
6380 case X86ISD::PCMPESTR: {
6381 if (!Subtarget->hasSSE42())
6382 break;
6383
6384 // Copy the two implicit register inputs.
6385 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX,
6386 N: Node->getOperand(Num: 1),
6387 Glue: SDValue()).getValue(R: 1);
6388 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX,
6389 N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1);
6390
6391 bool NeedIndex = !SDValue(Node, 0).use_empty();
6392 bool NeedMask = !SDValue(Node, 1).use_empty();
6393 // We can't fold a load if we are going to make two instructions.
6394 bool MayFoldLoad = !NeedIndex || !NeedMask;
6395
6396 MachineSDNode *CNode;
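    // As with PCMPISTR, emit PCMPESTRM for the mask result and PCMPESTRI for
    // the index result; both consume the EAX/EDX length copies via glue.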
6397 if (NeedMask) {
6398 unsigned ROpc =
6399 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6400 unsigned MOpc =
6401 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6402 CNode =
6403 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue);
6404 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6405 }
6406 if (NeedIndex || !NeedMask) {
6407 unsigned ROpc =
6408 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6409 unsigned MOpc =
6410 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6411 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue);
6412 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6413 }
6414 // Connect the flag usage to the last instruction created.
6415 ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6416 CurDAG->RemoveDeadNode(N: Node);
6417 return;
6418 }
6419
6420 case ISD::SETCC: {
6421 if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue()))
6422 return;
6423
6424 break;
6425 }
6426
6427 case ISD::STORE:
6428 if (foldLoadStoreIntoMemOperand(Node))
6429 return;
6430 break;
6431
6432 case X86ISD::SETCC_CARRY: {
6433 MVT VT = Node->getSimpleValueType(ResNo: 0);
6434 SDValue Result;
6435 if (Subtarget->hasSBBDepBreaking()) {
6436 // We have to do this manually because tblgen will put the eflags copy in
6437 // the wrong place if we use an extract_subreg in the pattern.
6438 // Copy flags to the EFLAGS register and glue it to next node.
6439 SDValue EFLAGS =
6440 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
6441 N: Node->getOperand(Num: 1), Glue: SDValue());
6442
6443       // Create a 64-bit instruction if the result is 64 bits; otherwise use
6444       // the 32-bit version.
6445 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6446 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6447 Result = SDValue(
6448 CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)),
6449 0);
6450 } else {
6451 // The target does not recognize sbb with the same reg operand as a
6452 // no-source idiom, so we explicitly zero the input values.
6453 Result = getSBBZero(N: Node);
6454 }
6455
6456 // For less than 32-bits we need to extract from the 32-bit node.
6457 if (VT == MVT::i8 || VT == MVT::i16) {
6458 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6459 Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6460 }
6461
6462 ReplaceUses(F: SDValue(Node, 0), T: Result);
6463 CurDAG->RemoveDeadNode(N: Node);
6464 return;
6465 }
6466 case X86ISD::SBB: {
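    // An SBB with both inputs zero just materializes the carry flag, so reuse
    // the canonical zero-input SBB node from getSBBZero.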
6467 if (isNullConstant(V: Node->getOperand(Num: 0)) &&
6468 isNullConstant(V: Node->getOperand(Num: 1))) {
6469 SDValue Result = getSBBZero(N: Node);
6470
6471 // Replace the flag use.
6472 ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1));
6473
6474 // Replace the result use.
6475 if (!SDValue(Node, 0).use_empty()) {
6476 // For less than 32-bits we need to extract from the 32-bit node.
6477 MVT VT = Node->getSimpleValueType(ResNo: 0);
6478 if (VT == MVT::i8 || VT == MVT::i16) {
6479 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6480 Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6481 }
6482 ReplaceUses(F: SDValue(Node, 0), T: Result);
6483 }
6484
6485 CurDAG->RemoveDeadNode(N: Node);
6486 return;
6487 }
6488 break;
6489 }
6490 case X86ISD::MGATHER: {
6491 auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node);
6492 SDValue IndexOp = Mgt->getIndex();
6493 SDValue Mask = Mgt->getMask();
6494 MVT IndexVT = IndexOp.getSimpleValueType();
6495 MVT ValueVT = Node->getSimpleValueType(ResNo: 0);
6496 MVT MaskVT = Mask.getSimpleValueType();
6497
6498     // This is just to prevent crashes if the nodes are malformed somehow. We
6499     // otherwise only do loose type checking in here, based on what a type
6500     // constraint would say, just like table-based isel.
6501 if (!ValueVT.isVector() || !MaskVT.isVector())
6502 break;
6503
6504 unsigned NumElts = ValueVT.getVectorNumElements();
6505 MVT ValueSVT = ValueVT.getVectorElementType();
6506
6507 bool IsFP = ValueSVT.isFloatingPoint();
6508 unsigned EltSize = ValueSVT.getSizeInBits();
6509
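    // Pick the gather opcode from the index type, element count, and element
    // width. An i1 mask selects the AVX512 (k-mask) forms; otherwise the
    // AVX2 vector-mask forms are used.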
6510 unsigned Opc = 0;
6511 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6512 if (AVX512Gather) {
6513 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6514 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6515 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6516 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6517 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6518 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6519 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6520 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6521 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6522 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6523 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6524 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6525 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6526 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6527 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6528 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6529 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6530 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6531 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6532 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6533 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6534 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6535 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6536 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6537 } else {
6538 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6539 "Unexpected mask VT!");
6540 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6541 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6542 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6543 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6544 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6545 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6546 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6547 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6548 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6549 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6550 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6551 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6552 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6553 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6554 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6555 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6556 }
6557
6558 if (!Opc)
6559 break;
6560
6561 SDValue Base, Scale, Index, Disp, Segment;
6562 if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(),
6563 Base, Scale, Index, Disp, Segment))
6564 break;
6565
6566 SDValue PassThru = Mgt->getPassThru();
6567 SDValue Chain = Mgt->getChain();
6568     // Gather instructions have a mask output that is not present in the ISD node.
6569 SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other);
6570
6571 MachineSDNode *NewNode;
6572 if (AVX512Gather) {
6573 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6574 Index, Disp, Segment, Chain};
6575 NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6576 } else {
6577 SDValue Ops[] = {PassThru, Base, Scale, Index,
6578 Disp, Segment, Mask, Chain};
6579 NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6580 }
6581 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()});
6582 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6583 ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2));
6584 CurDAG->RemoveDeadNode(N: Node);
6585 return;
6586 }
6587 case X86ISD::MSCATTER: {
6588 auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node);
6589 SDValue Value = Sc->getValue();
6590 SDValue IndexOp = Sc->getIndex();
6591 MVT IndexVT = IndexOp.getSimpleValueType();
6592 MVT ValueVT = Value.getSimpleValueType();
6593
6594     // This is just to prevent crashes if the nodes are malformed somehow. We
6595     // otherwise only do loose type checking in here, based on what a type
6596     // constraint would say, just like table-based isel.
6597 if (!ValueVT.isVector())
6598 break;
6599
6600 unsigned NumElts = ValueVT.getVectorNumElements();
6601 MVT ValueSVT = ValueVT.getVectorElementType();
6602
6603 bool IsFP = ValueSVT.isFloatingPoint();
6604 unsigned EltSize = ValueSVT.getSizeInBits();
6605
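    // Scatter is AVX512-only; select the opcode from the index type, element
    // count, and element width.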
6606 unsigned Opc;
6607 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6608 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6609 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6610 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6611 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6612 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6613 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6614 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6615 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6616 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6617 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6618 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6619 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6620 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6621 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6622 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6623 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6624 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6625 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6626 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6627 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6628 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6629 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6630 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6631 else
6632 break;
6633
6634 SDValue Base, Scale, Index, Disp, Segment;
6635 if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(),
6636 Base, Scale, Index, Disp, Segment))
6637 break;
6638
6639 SDValue Mask = Sc->getMask();
6640 SDValue Chain = Sc->getChain();
6641     // Scatter instructions have a mask output that is not present in the ISD node.
6642 SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other);
6643 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6644
6645 MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6646 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()});
6647 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1));
6648 CurDAG->RemoveDeadNode(N: Node);
6649 return;
6650 }
6651 case ISD::PREALLOCATED_SETUP: {
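    // Look up the preallocated id recorded for this call site and emit the
    // PREALLOCATED_SETUP pseudo with it.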
6652 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6653 auto CallId = MFI->getPreallocatedIdForCallSite(
6654 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6655 SDValue Chain = Node->getOperand(Num: 0);
6656 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6657 MachineSDNode *New = CurDAG->getMachineNode(
6658 Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain);
6659 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain
6660 CurDAG->RemoveDeadNode(N: Node);
6661 return;
6662 }
6663 case ISD::PREALLOCATED_ARG: {
6664 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6665 auto CallId = MFI->getPreallocatedIdForCallSite(
6666 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6667 SDValue Chain = Node->getOperand(Num: 0);
6668 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6669 SDValue ArgIndex = Node->getOperand(Num: 2);
6670 SDValue Ops[3];
6671 Ops[0] = CallIdValue;
6672 Ops[1] = ArgIndex;
6673 Ops[2] = Chain;
6674 MachineSDNode *New = CurDAG->getMachineNode(
6675 Opcode: TargetOpcode::PREALLOCATED_ARG, dl,
6676 VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()),
6677 VT2: MVT::Other),
6678 Ops);
6679 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer
6680 ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain
6681 CurDAG->RemoveDeadNode(N: Node);
6682 return;
6683 }
6684 case X86ISD::AESENCWIDE128KL:
6685 case X86ISD::AESDECWIDE128KL:
6686 case X86ISD::AESENCWIDE256KL:
6687 case X86ISD::AESDECWIDE256KL: {
6688 if (!Subtarget->hasWIDEKL())
6689 break;
6690
6691 unsigned Opcode;
6692 switch (Node->getOpcode()) {
6693 default:
6694 llvm_unreachable("Unexpected opcode!");
6695 case X86ISD::AESENCWIDE128KL:
6696 Opcode = X86::AESENCWIDE128KL;
6697 break;
6698 case X86ISD::AESDECWIDE128KL:
6699 Opcode = X86::AESDECWIDE128KL;
6700 break;
6701 case X86ISD::AESENCWIDE256KL:
6702 Opcode = X86::AESENCWIDE256KL;
6703 break;
6704 case X86ISD::AESDECWIDE256KL:
6705 Opcode = X86::AESDECWIDE256KL;
6706 break;
6707 }
6708
6709 SDValue Chain = Node->getOperand(Num: 0);
6710 SDValue Addr = Node->getOperand(Num: 1);
6711
6712 SDValue Base, Scale, Index, Disp, Segment;
6713 if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment))
6714 break;
6715
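    // The wide Key Locker forms take their eight 128-bit blocks implicitly in
    // XMM0..XMM7, so copy operands 2-9 into those registers, chaining each copy
    // through the previous one's glue.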
6716 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2),
6717 Glue: SDValue());
6718 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3),
6719 Glue: Chain.getValue(R: 1));
6720 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4),
6721 Glue: Chain.getValue(R: 1));
6722 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5),
6723 Glue: Chain.getValue(R: 1));
6724 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6),
6725 Glue: Chain.getValue(R: 1));
6726 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7),
6727 Glue: Chain.getValue(R: 1));
6728 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8),
6729 Glue: Chain.getValue(R: 1));
6730 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9),
6731 Glue: Chain.getValue(R: 1));
6732
6733 MachineSDNode *Res = CurDAG->getMachineNode(
6734 Opcode, dl, VTs: Node->getVTList(),
6735 Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)});
6736 CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand());
6737 ReplaceNode(F: Node, T: Res);
6738 return;
6739 }
6740 case X86ISD::POP_FROM_X87_REG: {
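    // Lower to a plain CopyFromReg from the given x87 stack register,
    // forwarding the optional incoming glue operand.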
6741 SDValue Chain = Node->getOperand(Num: 0);
6742 Register Reg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1))->getReg();
6743 SDValue Glue;
6744 if (Node->getNumValues() == 3)
6745 Glue = Node->getOperand(Num: 2);
6746 SDValue Copy =
6747 CurDAG->getCopyFromReg(Chain, dl, Reg, VT: Node->getValueType(ResNo: 0), Glue);
6748 ReplaceNode(F: Node, T: Copy.getNode());
6749 return;
6750 }
6751 }
6752
6753 SelectCode(N: Node);
6754}
6755
6756bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6757 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6758 std::vector<SDValue> &OutOps) {
6759 SDValue Op0, Op1, Op2, Op3, Op4;
6760 switch (ConstraintID) {
6761 default:
6762 llvm_unreachable("Unexpected asm memory constraint");
6763 case InlineAsm::ConstraintCode::o: // offsetable ??
6764 case InlineAsm::ConstraintCode::v: // not offsetable ??
6765 case InlineAsm::ConstraintCode::m: // memory
6766 case InlineAsm::ConstraintCode::X:
6767 case InlineAsm::ConstraintCode::p: // address
6768 if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4))
6769 return true;
6770 break;
6771 }
6772
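  // Every memory-style constraint above is satisfied by the standard
  // five-operand X86 memory reference: base, scale, index, displacement,
  // segment.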
6773 OutOps.push_back(x: Op0);
6774 OutOps.push_back(x: Op1);
6775 OutOps.push_back(x: Op2);
6776 OutOps.push_back(x: Op3);
6777 OutOps.push_back(x: Op4);
6778 return false;
6779}
6780
6781X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6782 : SelectionDAGISelPass(
6783 std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
6784
6785 /// This pass converts a legalized DAG into an X86-specific DAG,
6786/// ready for instruction scheduling.
6787FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6788 CodeGenOptLevel OptLevel) {
6789 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6790}
6791