1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
16#include "X86MachineFunctionInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/KnownBits.h"
33#include "llvm/Support/MathExtras.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

// When enabled, AND mask immediates may get extra (known-irrelevant) bits set
// so the constant fits a smaller sign-extended immediate encoding.
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(Val: true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(Val: true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

// Defined elsewhere in the backend; presumably gates CET indirect-branch
// tracking (ENDBR insertion) — see isEndbrImm64 below. TODO confirm owner TU.
extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex = 0;

    // Scale applied to IndexReg (1 when no scaling has been matched).
    unsigned Scale = 1;
    SDValue IndexReg;
    // Immediate displacement. 32-bit even in 64-bit mode, since x86
    // displacements are at most 32 bits wide.
    int32_t Disp = 0;
    SDValue Segment;
    // At most one of the symbolic operands below is set at a time; see
    // hasSymbolicDisplacement().
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment; // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
    // When set, IndexReg must be negated (a NEG is emitted) before use; see
    // getAddressOperands().
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    /// True if any symbolic operand (global, constant pool, external symbol,
    /// MC symbol, jump table, or block address) has been matched.
    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug pretty-printer for the matched addressing mode.
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             Kind: "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      return SelectionDAGISel::runOnMachineFunction(mf&: MF);
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    // Addressing-mode matchers. These incrementally fill in an
    // X86ISelAddressMode while walking the DAG rooted at the address operand.
    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    // ComplexPattern entry points referenced from the .td files.
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    /// Convert the fully-matched addressing mode AM into the five machine
    /// operands (Base, Scale, Index, Disp, Segment) of an X86 memory
    /// reference, emitting a NEG of the index first if AM.NegateIndex is set.
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(Reg: 0, VT);

      Scale = getI8Imm(Imm: AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc;
        switch (VT.SimpleTy) {
        default:
          llvm_unreachable("Unsupported VT!");
        case MVT::i64:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
          break;
        case MVT::i32:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
          break;
        case MVT::i16:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
          break;
        case MVT::i8:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
          break;
        }
        SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32,
                                                     Ops: AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(Reg: 0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(),
                                              VT: MVT::i32, offset: AM.Disp,
                                              TargetFlags: AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment,
                                             Offset: AM.Disp, TargetFlags: AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp,
                                             TargetFlags: AM.SymbolFlags);
      else
        Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
    }

    // Utility function to determine whether it is AMX SDNode right after
    // lowering but before ISEL.
    bool isAMXSDNode(SDNode *N) const {
      // Check if N is AMX SDNode:
      // 1. check result type;
      // 2. check operand type;
      for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
        if (N->getValueType(ResNo: Idx) == MVT::x86amx)
          return true;
      }
      for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
        SDValue Op = N->getOperand(Num: Idx);
        if (Op.getValueType() == MVT::x86amx)
          return true;
      }
      return false;
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->users()) {
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(Num: 1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above)
        // Those instruction won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(Val: N);
        if (C && isInt<8>(x: C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(Num: 0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(Num: 1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 Val: OtherOp->getOperand(Num: 1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64);
    }

    /// Compute the VEXTRACTF128/I128-style immediate for an extract-subvector
    /// node N: the index of the VecWidth-bit chunk being extracted.
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 1);
      MVT VecVT = N->getOperand(Num: 0).getSimpleValueType();
      return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Compute the VINSERTF128/I128-style immediate for an insert-subvector
    /// node N: the index of the VecWidth-bit chunk being replaced.
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 2);
      MVT VecVT = N->getSimpleValueType(ResNo: 0);
      return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 2);
      MVT VecVT = N->getSimpleValueType(ResNo: 0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL);
    }

    /// Materialize the flag operand of N (an SBB or SETCC_CARRY) into a
    /// register: copy the flags into EFLAGS, then emit SBB of a zeroed
    /// register against itself.
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(ResNo: 0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
      SDValue Zero =
          SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
      if (VT == MVT::i64) {
        Zero = SDValue(
            CurDAG->getMachineNode(
                Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, Op1: Zero,
                Op2: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
                               N: N->getOperand(Num: FlagOpIndex), Glue: SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opcode: Opc, dl, VTs,
                                 Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}),
          0);
    }

    // Helper to detect unneeded and instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = N->getConstantOperandAPInt(Num: 1);

      if (Val.countr_one() >= Width)
        return true;

      // Also account for bits already known zero in the masked operand.
      APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
      return Mask.countr_one() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Return a condition code of the given SDNode
    X86::CondCode getCondFromNode(SDNode *N) const;

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      // Non-temporal loads require natural alignment.
      if (N->getAlign().value() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    // Peephole and combine helpers; definitions appear later in this file.
    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InGlue);

    bool tryOptimizeRem8Extend(SDNode *N);

    // EFLAGS-use queries used when it is safe to clobber certain flags.
    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
    bool checkTCRetEnoughRegs(SDNode *N) const;
  };
607
  /// Legacy pass-manager wrapper around X86DAGToDAGISel.
  class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
  public:
    static char ID; // Pass identification.
    explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                   CodeGenOptLevel OptLevel)
        : SelectionDAGISelLegacy(
              ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {}
  };
616}
617
char X86DAGToDAGISelLegacy::ID = 0;

// Register the legacy pass with the pass registry.
INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
621
622// Returns true if this masked compare can be implemented legally with this
623// type.
624static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
625 unsigned Opcode = N->getOpcode();
626 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
627 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
628 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
629 // We can get 256-bit 8 element types here without VLX being enabled. When
630 // this happens we will use 512-bit operations and the mask will not be
631 // zero extended.
632 EVT OpVT = N->getOperand(Num: 0).getValueType();
633 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
634 // second operand.
635 if (Opcode == X86ISD::STRICT_CMPM)
636 OpVT = N->getOperand(Num: 1).getValueType();
637 if (OpVT.is256BitVector() || OpVT.is128BitVector())
638 return Subtarget->hasVLX();
639
640 return true;
641 }
642 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
643 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
644 Opcode == X86ISD::FSETCCM_SAE)
645 return true;
646
647 return false;
648}
649
650// Returns true if we can assume the writer of the mask has zero extended it
651// for us.
652bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
653 // If this is an AND, check if we have a compare on either side. As long as
654 // one side guarantees the mask is zero extended, the AND will preserve those
655 // zeros.
656 if (N->getOpcode() == ISD::AND)
657 return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) ||
658 isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget);
659
660 return isLegalMaskCompare(N, Subtarget);
661}
662
// Decide whether folding operand N (typically a load) into user U is
// profitable while matching the pattern rooted at Root. Returning false keeps
// the load as a separate instruction.
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::UADDO_CARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(Num: 1);

      // If the other operand is a 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(N: 8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(N: 32))
          return false;

        // If this really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB with can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(N: 8))
          return false;

        // Same for the flag-producing X86 nodes, but only when no user reads
        // the carry flag (negating the immediate inverts carry semantics).
        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(N: 8) &&
            hasNoCarryFlagUses(Flags: SDValue(U, 1)))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(i: 0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(Num: 0).getOpcode() == ISD::SHL &&
            isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0)))
          return false;

        if (U->getOperand(Num: 1).getOpcode() == ISD::SHL &&
            isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(Num: 0);
        SDValue U1 = U->getOperand(Num: 1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(V: Root->getOperand(Num: 2)) &&
      (Root->getOperand(Num: 0).isUndef() ||
       ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode())))
    return false;

  return true;
}
812
813// Indicates it is profitable to form an AVX512 masked operation. Returning
814// false will favor a masked register-register masked move or vblendm and the
815// operation will be selected separately.
816bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
817 assert(
818 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
819 "Unexpected opcode!");
820
821 // If the operation has additional users, the operation will be duplicated.
822 // Check the use count to prevent that.
823 // FIXME: Are there cheap opcodes we might want to duplicate?
824 return N->getOperand(Num: 1).hasOneUse();
825}
826
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(i: 0);
  // Splice the load out of the chain feeding OrigChain: either the load IS
  // the incoming chain, or it is one operand of a TokenFactor that must be
  // rebuilt with the load's own input chain substituted in.
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Elt: Load.getOperand(i: 0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Elt: Load.getOperand(i: 0));
      else
        Ops.push_back(Elt: Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(Elt: NewChain);
  }
  // OrigChain keeps its non-chain operands but now starts from the spliced
  // chain; the load is rewired to hang off the call's incoming chain.
  Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
  CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
                             Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));

  // Finally make the call's chain operand the load's output chain.
  Ops.clear();
  Ops.push_back(Elt: SDValue(Load.getNode(), 1));
  Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
  CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
}
858
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  // Only simple, unindexed, non-extending loads are safe to move.
  auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // If the load's outgoing chain has more than one use, we can't (currently)
  // move the load since we'd most likely create a loop. TODO: Maybe it could
  // work if moveBelowOrigChain() updated *all* the chain users.
  if (!Callee.getValue(R: 1).hasOneUse())
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(i: 0);
  }

  // Walk up the (single-use) chain from the callseq_start looking for the
  // load; bail out if anything with unknown memory behavior intervenes.
  while (true) {
    if (!Chain.getNumOperands())
      return false;

    // It's not safe to move the callee (a load) across e.g. a store.
    // Conservatively abort if the chain contains a node other than the ones
    // below.
    switch (Chain.getNode()->getOpcode()) {
    case ISD::CALLSEQ_START:
    case ISD::CopyToReg:
    case ISD::LOAD:
      break;
    default:
      return false;
    }

    if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
      return true;
    // The load may feed the chain indirectly through a single-use TokenFactor.
    if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
        Chain.getOperand(i: 0).getValue(R: 0).hasOneUse() &&
        Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
        Callee.getValue(R: 1).hasOneUse())
      return true;

    // Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
    if (Chain.getOperand(i: 0).getOpcode() == ISD::CopyToReg &&
        Chain.getOperand(i: 0).getValue(R: 0).hasOneUse()) {
      Chain = Chain.getOperand(i: 0);
      continue;
    }

    return false;
  }
}
925
/// Return true if the 64-bit immediate \p Imm would decode as an ENDBR64
/// instruction (F3 0F 1E FA), allowing for extra legacy prefix bytes between
/// the mandatory 0xF3 prefix and the 0x0F1EFA opcode bytes,
/// e.g. 0xF3660F1EFA or 0xF3670F1EFA.
static bool isEndbrImm64(uint64_t Imm) {
  // The low three bytes must be exactly the 0x0F1EFA opcode tail.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // Legacy prefix bytes that may legally sit between 0xF3 and the opcode.
  static const uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                                0x65, 0x66, 0x67, 0xf0, 0xf2};

  // Scan the remaining five bytes (bits 24..63). Accept as soon as the 0xF3
  // prefix is found; reject on the first byte that is neither 0xF3 nor one of
  // the optional prefixes.
  for (int Shift = 24; Shift < 64; Shift += 8) {
    uint8_t Byte = (Imm >> Shift) & 0xFF;
    if (Byte == 0xF3)
      return true;
    if (std::find(std::begin(OptionalPrefixBytes), std::end(OptionalPrefixBytes),
                  Byte) == std::end(OptionalPrefixBytes))
      return false;
  }

  // Ran out of bytes without seeing the 0xF3 prefix.
  return false;
}
946
947static bool needBWI(MVT VT) {
948 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
949}
950
// Rewrite nodes ahead of instruction selection so that one set of isel
// patterns suffices (target-specific equivalents for shifts, conversions,
// rounding, any-extends, ...), emulate missing-BWI broadcasts, try to move
// callee-address loads next to calls for folding, and perform "really late"
// legalization of x87 <-> SSE FP conversions via stack temporaries.
// NOTE: nodes are replaced in place while iterating; the --I/++I dance around
// each RAUW keeps the allnodes iterator valid across the replacement.
void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // And we want that attackers won't find unintended ENDBR32/64
    // opcode matches in the binary
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations, let it not show in the binary
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(0);
      int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getFunction().getParent()->getModuleFlag(
                "cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          // Materialize the constant as NOT(~Imm): the opaque complement
          // keeps the raw ENDBR byte pattern out of the emitted code.
          SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
          Complement = CurDAG->getNOT(dl, Complement, VT);
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }

    // Convert vector increment or decrement to sub/add with an all-ones
    // constant:
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
    //
    // But don't do this if it would inhibit a potentially profitable load
    // folding opportunity for the other operand. That only occurs with the
    // intersection of:
    // (1) The other operand (op0) is load foldable.
    // (2) The op is an add (otherwise, we are *creating* an add and can still
    //     load fold the other op).
    // (3) The target has AVX (otherwise, we have a destructive add and can't
    //     load fold the other op without killing the constant op).
    // (4) The constant 1 vector has multiple uses (so it is profitable to load
    //     into a register anyway).
    auto mayPreventLoadFold = [&]() {
      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
             !N->getOperand(1).hasOneUse();
    };
    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
      APInt SplatVal;
      if (!ISD::isBuildVectorOfConstantSDNodes(
              peekThroughBitcasts(N->getOperand(0)).getNode()) &&
          X86::isConstantSplat(N->getOperand(1), SplatVal) &&
          SplatVal.isOne()) {
        SDLoc DL(N);

        MVT VT = N->getSimpleValueType(0);
        // Build the all-ones value as a v_i32 vector (the pcmpeq idiom form)
        // and bitcast to the original type.
        unsigned NumElts = VT.getSizeInBits() / 32;
        SDValue AllOnes =
            CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
        AllOnes = CurDAG->getBitcast(VT, AllOnes);

        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
        SDValue Res =
            CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
        --I;
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }
    }

    switch (N->getOpcode()) {
    case X86ISD::VBROADCAST: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI: broadcast to the
      // half-width vector and insert it into both halves of the wide result.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        SDLoc dl(N);
        SDValue NarrowBCast =
            CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
        SDValue Res =
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        --I;
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case X86ISD::VBROADCAST_LOAD: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI. Same as above, but the
      // narrow broadcast is a memory node and its chain result must be
      // threaded through to the replacement.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        auto *MemNode = cast<MemSDNode>(N);
        SDLoc dl(N);
        SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
            X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
            MemNode->getMemOperand());
        SDValue Res =
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        --I;
        SDValue To[] = {Res, NarrowBCast.getValue(1)};
        CurDAG->ReplaceAllUsesWith(N, To);
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case ISD::LOAD: {
      // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
      // load, then just extract the lower subvector and avoid the second load.
      auto *Ld = cast<LoadSDNode>(N);
      MVT VT = N->getSimpleValueType(0);
      if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
          !(VT.is128BitVector() || VT.is256BitVector()))
        break;

      // Find the widest other simple load of the same pointer/chain whose
      // chain result is unused; that one subsumes this load.
      MVT MaxVT = VT;
      SDNode *MaxLd = nullptr;
      SDValue Ptr = Ld->getBasePtr();
      SDValue Chain = Ld->getChain();
      for (SDNode *User : Ptr->users()) {
        auto *UserLd = dyn_cast<LoadSDNode>(User);
        MVT UserVT = User->getSimpleValueType(0);
        if (User != N && UserLd && ISD::isNormalLoad(User) &&
            UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
            !User->hasAnyUseOfValue(1) &&
            (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
            UserVT.getSizeInBits() > VT.getSizeInBits() &&
            (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
          MaxLd = User;
          MaxVT = UserVT;
        }
      }
      if (MaxLd) {
        SDLoc dl(N);
        // Extract the low subvector of the wide load (in the wide load's
        // element type) and bitcast back to this load's type.
        unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
        MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
        SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
                                          SDValue(MaxLd, 0),
                                          CurDAG->getIntPtrConstant(0, dl));
        SDValue Res = CurDAG->getBitcast(VT, Extract);

        --I;
        SDValue To[] = {Res, SDValue(MaxLd, 1)};
        CurDAG->ReplaceAllUsesWith(N, To);
        ++I;
        MadeChange = true;
        continue;
      }
      break;
    }
    case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
      EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
      if (EleVT == MVT::i1)
        break;

      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
             "We can't replace VSELECT with BLENDV in vXi16!");
      SDValue R;
      // With VLX and an all-sign-bits condition, a ternary logic op
      // (imm 0xCA == "select") avoids tying up a BLENDV-style operand.
      if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
                                     EleVT.getSizeInBits()) {
        R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1), N->getOperand(2),
                            CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
      } else {
        R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1),
                            N->getOperand(2));
      }
      --I;
      CurDAG->ReplaceAllUsesWith(N, R.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FP_ROUND:
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
      }
      SDValue Res;
      // Strict ops carry a chain operand/result that must be preserved.
      if (N->isStrictFPOpcode())
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
                            {N->getOperand(0), N->getOperand(1)});
      else
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                            N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
        // vXi1 sources are extended with sign extend instead.
        assert(N->getOpcode() == ISD::ANY_EXTEND &&
               "Unexpected opcode for mask vector!");
        NewOpc = ISD::SIGN_EXTEND;
      } else {
        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                     ? ISD::ZERO_EXTEND
                     : ISD::ZERO_EXTEND_VECTOR_INREG;
      }

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FCEIL:
    case ISD::STRICT_FCEIL:
    case ISD::FFLOOR:
    case ISD::STRICT_FFLOOR:
    case ISD::FTRUNC:
    case ISD::STRICT_FTRUNC:
    case ISD::FROUNDEVEN:
    case ISD::STRICT_FROUNDEVEN:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::FRINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns. The immediate selects the VRNDSCALE rounding
      // mode (and, for FCEIL/FFLOOR/FTRUNC/FROUNDEVEN, suppresses exceptions).
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::STRICT_FROUNDEVEN:
      case ISD::FROUNDEVEN: Imm = 0x8; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      bool IsStrict = N->isStrictFPOpcode();
      SDValue Res;
      if (IsStrict)
        Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
                              {N->getValueType(0), MVT::Other},
                              {N->getOperand(0), N->getOperand(1),
                               CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
      else
        Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
                              N->getOperand(0),
                              CurDAG->getTargetConstant(Imm, dl, MVT::i32));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64   ? MVT::v2f64
                  : VT == MVT::f32 ? MVT::v4f32
                                   : MVT::v8f16;

      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        // With SSE2, do the logic in the integer domain.
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      // Extract the scalar result back out of element 0.
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    }

    if (OptLevel != CodeGenOptLevel::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      if (N->getOpcode() == X86ISD::TC_RETURN && !checkTCRetEnoughRegs(N))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      MadeChange = true;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(
          CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
                                          MemTmp, MPI, MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }

    // The sequence of events for lowering STRICT_FP versions of these nodes requires
    // dealing with the chain differently, as there is already a preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store, Result;
      if (!SrcIsSSE) {
        // x87 source: emit an FST memory intrinsic, propagating NoFPExcept.
        SDVTList VTs = CurDAG->getVTList(MVT::Other);
        SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
        Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
                                            MPI, /*Align*/ std::nullopt,
                                            MachineMemOperand::MOStore);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Store->getFlags();
          Flags.setNoFPExcept(true);
          Store->setFlags(Flags);
        }
      } else {
        assert(SrcVT == MemVT && "Unexpected VT!");
        Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
                                 MPI);
      }

      if (!DstIsSSE) {
        // x87 destination: reload via an FLD memory intrinsic.
        SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
        SDValue Ops[] = {Store, MemTmp};
        Result = CurDAG->getMemIntrinsicNode(
            X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
            /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Result->getFlags();
          Flags.setNoFPExcept(true);
          Result->setFlags(Flags);
        }
      } else {
        assert(DstVT == MemVT && "Unexpected VT!");
        Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
      }

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }


    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    MadeChange = true;
  }

  // Remove any dead nodes that may have been left behind.
  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}
1562
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
// Pattern: (MOVZX32rr8/MOVSX32rr8/MOVSX64rr8
//             (EXTRACT_SUBREG sub_8bit (MOV[SZ]X32rr8_NOREX ...)))
// The outer extend re-extends bits the inner _NOREX extend already produced,
// so it can be dropped (or, for the 64-bit case, narrowed to a 32->64
// sign extend of the inner node's result).
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  // Note both MOVSX32rr8 and MOVSX64rr8 expect a sign-extending inner node.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}
1598
1599void X86DAGToDAGISel::PostprocessISelDAG() {
1600 // Skip peepholes at -O0.
1601 if (TM.getOptLevel() == CodeGenOptLevel::None)
1602 return;
1603
1604 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1605
1606 bool MadeChange = false;
1607 while (Position != CurDAG->allnodes_begin()) {
1608 SDNode *N = &*--Position;
1609 // Skip dead nodes and any non-machine opcodes.
1610 if (N->use_empty() || !N->isMachineOpcode())
1611 continue;
1612
1613 if (tryOptimizeRem8Extend(N)) {
1614 MadeChange = true;
1615 continue;
1616 }
1617
1618 unsigned Opc = N->getMachineOpcode();
1619 switch (Opc) {
1620 default:
1621 continue;
1622 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
1623 case X86::TEST8rr:
1624 case X86::TEST16rr:
1625 case X86::TEST32rr:
1626 case X86::TEST64rr:
1627 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1628 case X86::CTEST8rr:
1629 case X86::CTEST16rr:
1630 case X86::CTEST32rr:
1631 case X86::CTEST64rr: {
1632 auto &Op0 = N->getOperand(Num: 0);
1633 if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) ||
1634 !Op0.isMachineOpcode())
1635 continue;
1636 SDValue And = N->getOperand(Num: 0);
1637#define CASE_ND(OP) \
1638 case X86::OP: \
1639 case X86::OP##_ND:
1640 switch (And.getMachineOpcode()) {
1641 default:
1642 continue;
1643 CASE_ND(AND8rr)
1644 CASE_ND(AND16rr)
1645 CASE_ND(AND32rr)
1646 CASE_ND(AND64rr) {
1647 if (And->hasAnyUseOfValue(Value: 1))
1648 continue;
1649 SmallVector<SDValue> Ops(N->op_values());
1650 Ops[0] = And.getOperand(i: 0);
1651 Ops[1] = And.getOperand(i: 1);
1652 MachineSDNode *Test =
1653 CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops);
1654 ReplaceUses(F: N, T: Test);
1655 MadeChange = true;
1656 continue;
1657 }
1658 CASE_ND(AND8rm)
1659 CASE_ND(AND16rm)
1660 CASE_ND(AND32rm)
1661 CASE_ND(AND64rm) {
1662 if (And->hasAnyUseOfValue(Value: 1))
1663 continue;
1664 unsigned NewOpc;
1665 bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc);
1666#define FROM_TO(A, B) \
1667 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1668 break;
1669 switch (And.getMachineOpcode()) {
1670 FROM_TO(AND8rm, TEST8mr);
1671 FROM_TO(AND16rm, TEST16mr);
1672 FROM_TO(AND32rm, TEST32mr);
1673 FROM_TO(AND64rm, TEST64mr);
1674 }
1675#undef FROM_TO
1676#undef CASE_ND
1677 // Need to swap the memory and register operand.
1678 SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2),
1679 And.getOperand(i: 3), And.getOperand(i: 4),
1680 And.getOperand(i: 5), And.getOperand(i: 0)};
1681 // CC, Cflags.
1682 if (IsCTESTCC) {
1683 Ops.push_back(Elt: N->getOperand(Num: 2));
1684 Ops.push_back(Elt: N->getOperand(Num: 3));
1685 }
1686 // Chain of memory load
1687 Ops.push_back(Elt: And.getOperand(i: 6));
1688 // Glue
1689 if (IsCTESTCC)
1690 Ops.push_back(Elt: N->getOperand(Num: 4));
1691
1692 MachineSDNode *Test = CurDAG->getMachineNode(
1693 Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops);
1694 CurDAG->setNodeMemRefs(
1695 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
1696 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1697 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1698 MadeChange = true;
1699 continue;
1700 }
1701 }
1702 }
1703 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1704 // used. We're doing this late so we can prefer to fold the AND into masked
1705 // comparisons. Doing that can be better for the live range of the mask
1706 // register.
1707 case X86::KORTESTBkk:
1708 case X86::KORTESTWkk:
1709 case X86::KORTESTDkk:
1710 case X86::KORTESTQkk: {
1711 SDValue Op0 = N->getOperand(Num: 0);
1712 if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) ||
1713 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0)))
1714 continue;
1715#define CASE(A) \
1716 case X86::A: \
1717 break;
1718 switch (Op0.getMachineOpcode()) {
1719 default:
1720 continue;
1721 CASE(KANDBkk)
1722 CASE(KANDWkk)
1723 CASE(KANDDkk)
1724 CASE(KANDQkk)
1725 }
1726 unsigned NewOpc;
1727#define FROM_TO(A, B) \
1728 case X86::A: \
1729 NewOpc = X86::B; \
1730 break;
1731 switch (Opc) {
1732 FROM_TO(KORTESTBkk, KTESTBkk)
1733 FROM_TO(KORTESTWkk, KTESTWkk)
1734 FROM_TO(KORTESTDkk, KTESTDkk)
1735 FROM_TO(KORTESTQkk, KTESTQkk)
1736 }
1737 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1738 // KAND instructions and KTEST use the same ISA feature.
1739 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1740 continue;
1741#undef FROM_TO
1742 MachineSDNode *KTest = CurDAG->getMachineNode(
1743 Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1));
1744 ReplaceUses(F: N, T: KTest);
1745 MadeChange = true;
1746 continue;
1747 }
1748 // Attempt to remove vectors moves that were inserted to zero upper bits.
1749 case TargetOpcode::SUBREG_TO_REG: {
1750 unsigned SubRegIdx = N->getConstantOperandVal(Num: 1);
1751 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1752 continue;
1753
1754 SDValue Move = N->getOperand(Num: 0);
1755 if (!Move.isMachineOpcode())
1756 continue;
1757
1758 // Make sure its one of the move opcodes we recognize.
1759 switch (Move.getMachineOpcode()) {
1760 default:
1761 continue;
1762 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1763 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1764 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1765 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1766 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1767 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1768 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1769 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1770 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1771 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1772 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1773 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1774 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1775 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1776 }
1777#undef CASE
1778
1779 SDValue In = Move.getOperand(i: 0);
1780 if (!In.isMachineOpcode() ||
1781 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1782 continue;
1783
1784 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1785 // the SHA instructions which use a legacy encoding.
1786 uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags;
1787 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1788 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1789 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1790 continue;
1791
1792 // Producing instruction is another vector instruction. We can drop the
1793 // move.
1794 CurDAG->UpdateNodeOperands(N, Op1: In, Op2: N->getOperand(Num: 1));
1795 MadeChange = true;
1796 }
1797 }
1798 }
1799
1800 if (MadeChange)
1801 CurDAG->RemoveDeadNodes();
1802}
1803
1804
1805/// Emit any code that needs to be executed only in the main function.
1806void X86DAGToDAGISel::emitSpecialCodeForMain() {
1807 if (Subtarget->isTargetCygMing()) {
1808 TargetLowering::ArgListTy Args;
1809 auto &DL = CurDAG->getDataLayout();
1810
1811 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1812 CLI.setChain(CurDAG->getRoot())
1813 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1814 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1815 ArgsList: std::move(Args));
1816 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1817 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1818 CurDAG->setRoot(Result.second);
1819 }
1820}
1821
1822void X86DAGToDAGISel::emitFunctionEntryCode() {
1823 // If this is main, emit special code for main.
1824 const Function &F = MF->getFunction();
1825 if (F.hasExternalLinkage() && F.getName() == "main")
1826 emitSpecialCodeForMain();
1827}
1828
/// Returns true when \p Val is safe to use as an explicit displacement that
/// may later be combined with a frame-index or register-base displacement.
static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
  // We can run into an issue where a frame index or a register base
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  // Accept exactly the signed 31-bit range [-2^30, 2^30 - 1].
  constexpr int64_t Limit = int64_t(1) << 30;
  return Val >= -Limit && Val < Limit;
}
1838
/// Try to fold an integer \p Offset into the displacement of addressing mode
/// \p AM. Returns true if the combined displacement cannot be encoded for the
/// current target / code model (AM is left unchanged); returns false and
/// commits the combined value to AM.Disp on success.
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    // A 64-bit target must also satisfy the code-model constraints on
    // absolute displacements (small/medium/large, PIC vs non-PIC).
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
                                           hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndexOrRegBase(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only address the low 2GB of address space
    // is directly addressable, we need indirect addressing for the high 2GB of
    // address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
    if (Subtarget->isTarget64BitILP32() &&
        !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  } else if (Subtarget->is16Bit()) {
    // In 16-bit mode, displacements are limited to [-65535,65535] for FK_Data_2
    // fixups of unknown signedness. See X86AsmBackend::applyFixup.
    if (Val < -(int64_t)UINT16_MAX || Val > (int64_t)UINT16_MAX)
      return true;
  } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
    // For 32-bit X86, make sure the displacement still isn't close to the
    // expressible limit.
    return true;
  // All checks passed; commit the folded displacement.
  AM.Disp = Val;
  return false;
}
1894
1895bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1896 bool AllowSegmentRegForX32) {
1897 SDValue Address = N->getOperand(Num: 1);
1898
1899 // load gs:0 -> GS segment register.
1900 // load fs:0 -> FS segment register.
1901 //
1902 // This optimization is generally valid because the GNU TLS model defines that
1903 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1904 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1905 // zero-extended to 64 bits and then added it to the base address, which gives
1906 // unwanted results when the register holds a negative value.
1907 // For more information see http://people.redhat.com/drepper/tls.pdf
1908 if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
1909 !IndirectTlsSegRefs &&
1910 (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
1911 Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
1912 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1913 return true;
1914 switch (N->getPointerInfo().getAddrSpace()) {
1915 case X86AS::GS:
1916 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
1917 return false;
1918 case X86AS::FS:
1919 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
1920 return false;
1921 // Address space X86AS::SS is not handled here, because it is not used to
1922 // address TLS areas.
1923 }
1924 }
1925
1926 return true;
1927}
1928
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
/// On success the wrapped symbol (and any offset baked into it) is recorded in
/// \p AM, and for RIP-relative wrappers %rip is installed as the base.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(i: 0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  // Record which flavor of symbol the wrapper holds, along with any constant
  // offset folded into the symbol node itself.
  int64_t Offset = 0;
  SDValue N0 = N.getOperand(i: 0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(GV: AM.GV)) {
    AM = Backup;
    return true;
  }

  // Merge the symbol's embedded offset with any displacement already in AM;
  // restore the backup if the combination is not representable.
  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
2006
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, Depth: 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
      // Tentatively drop the base; restore it if the load can't become a
      // segment-register access.
      AM.Base_Reg = SDValue();
      if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    // However, when GV is a local function symbol and in the same section as
    // the current instruction, and AM.Disp is negative and near INT32_MIN,
    // referencing GV+Disp generates a relocation referencing the section symbol
    // with an even smaller offset, which might underflow. We should bail out if
    // the negative offset is too close to INT32_MIN. Actually, we are more
    // conservative here, using a smaller magic number also used by
    // isOffsetSuitableForCodeModel.
    if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024)
      return true;

    AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64);
  }

  return false;
}
2059
/// Try to fold an ISD::ADD node \p N into addressing mode \p AM: first both
/// operands in original order, then commuted, and finally by simply placing
/// the two operands in the base and index registers. Returns true only if
/// none of these succeed. \p N is updated to the (possibly CSE'd) add node.
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  // Note: operands are re-fetched through the Handle after each recursive
  // match, since matching may have CSE'd or mutated the original node.
  if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
    return false;
  AM = Backup;

  // Try again after commutating the operands.
  if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                               Depth: Depth + 1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(i: 0);
    AM.IndexReg = N.getOperand(i: 1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}
2094
2095// Insert a node into the DAG at least before the Pos node's position. This
2096// will reposition the node as needed, and will assign it a node ID that is <=
2097// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2098// IDs! The selection DAG must no longer depend on their uniqueness when this
2099// is used.
2100static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2101 if (N->getNodeId() == -1 ||
2102 (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
2103 SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
2104 DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
2105 // Mark Node as invalid for pruning as after this it may be a successor to a
2106 // selected node but otherwise be in the same position of Pos.
2107 // Conservatively mark it with the same -abs(Id) to assure node id
2108 // invariant is preserved.
2109 N->setNodeId(Pos->getNodeId());
2110 SelectionDAGISel::InvalidateNodeId(N: N.getNode());
2111 }
2112}
2113
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  // Only a single-use logical right shift by a constant can be rewritten.
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse())
    return true;

  // The mask must be exactly 0xff << C1 with C1 in [1,3], so the resulting
  // scale (1 << C1) is one of the representable AM scales 2, 4 or 8.
  int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  // Build ((X >> 8) & 0xff) << ScaleLog in the source type, converting to the
  // addressed type between the AND and the SHL.
  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8);
  SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
  SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
  SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
  SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
  SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8);
  SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: Eight);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: Srl);
  insertDAGNode(DAG, Pos: N, N: And);
  insertDAGNode(DAG, Pos: N, N: Ext);
  insertDAGNode(DAG, Pos: N, N: ShlCount);
  insertDAGNode(DAG, Pos: N, N: Shl);
  DAG.ReplaceAllUsesWith(From: N, To: Shl);
  DAG.RemoveDeadNode(N: N.getNode());
  // The extracted byte becomes the index; the left shift becomes the scale.
  AM.IndexReg = Ext;
  AM.Scale = (1 << ScaleLog);
  return false;
}
2161
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(i: 0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(x: Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  // Only a left shift by a constant qualifies.
  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  SDValue X = Shift.getOperand(i: 0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  // Scales of 2, 4 and 8 correspond to shifts of 1, 2 and 3.
  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    // Re-materialize the any_extend we looked through, now below the AND.
    SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
  SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewShift);
  DAG.ReplaceAllUsesWith(From: N, To: NewShift);
  DAG.RemoveDeadNode(N: N.getNode());

  // The masked value becomes the index; the shift becomes the AM scale.
  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
2229
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $11, %ecx
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $9, %ecx
//   andl $124, %rcx
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
// Returns false if the simplification is performed.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  // Only a single-use logical right shift by a constant qualifies.
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;
  unsigned MaskLZ = 64 - (MaskIdx + MaskLen);

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(i: 0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
    APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
  if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  // Build (X >> (ShiftAmt + AMShiftAmt)) << AMShiftAmt; the final SHL is the
  // part that the addressing-mode scale will absorb.
  MVT XVT = X.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  // The shifted value becomes the index; the SHL becomes the AM scale.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2345
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  // Both the AND and the SRL must be single-use constant shifts for the
  // rewrite to be safe and profitable.
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Build ((X >> (SHIFT + C1)) & (MASK >> C1)) << C1; everything up to the
  // final SHL is BEXTR-shaped, and the SHL is absorbed by the AM scale.
  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  // The extracted field becomes the index; the SHL becomes the AM scale.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2409
2410// Attempt to peek further into a scaled index register, collecting additional
2411// extensions / offsets / etc. Returns /p N if we can't peek any further.
2412SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2413 X86ISelAddressMode &AM,
2414 unsigned Depth) {
2415 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2416 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2417 "Illegal index scale");
2418
2419 // Limit recursion.
2420 if (Depth >= SelectionDAG::MaxRecursionDepth)
2421 return N;
2422
2423 EVT VT = N.getValueType();
2424 unsigned Opc = N.getOpcode();
2425
2426 // index: add(x,c) -> index: x, disp + c
2427 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2428 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2429 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2430 if (!foldOffsetIntoAddress(Offset, AM))
2431 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2432 }
2433
2434 // index: add(x,x) -> index: x, scale * 2
2435 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2436 if (AM.Scale <= 4) {
2437 AM.Scale *= 2;
2438 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2439 }
2440 }
2441
2442 // index: shl(x,i) -> index: x, scale * (1 << i)
2443 if (Opc == X86ISD::VSHLI) {
2444 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2445 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2446 if ((AM.Scale * ScaleAmt) <= 8) {
2447 AM.Scale *= ScaleAmt;
2448 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2449 }
2450 }
2451
2452 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2453 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2454 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2455 SDValue Src = N.getOperand(i: 0);
2456 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2457 Src.hasOneUse()) {
2458 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2459 SDValue AddSrc = Src.getOperand(i: 0);
2460 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2461 int64_t Offset = AddVal->getSExtValue();
2462 if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) {
2463 SDLoc DL(N);
2464 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2465 SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT);
2466 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2467 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2468 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2469 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2470 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2471 CurDAG->RemoveDeadNode(N: N.getNode());
2472 return ExtSrc;
2473 }
2474 }
2475 }
2476 }
2477
2478 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2479 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2480 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2481 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2482 SDValue Src = N.getOperand(i: 0);
2483 unsigned SrcOpc = Src.getOpcode();
2484 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2485 CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) &&
2486 Src.hasOneUse()) {
2487 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2488 SDValue AddSrc = Src.getOperand(i: 0);
2489 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2490 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2491 SDLoc DL(N);
2492 SDValue Res;
2493 // If we're also scaling, see if we can use that as well.
2494 if (AddSrc.getOpcode() == ISD::SHL &&
2495 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2496 SDValue ShVal = AddSrc.getOperand(i: 0);
2497 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2498 APInt HiBits =
2499 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2500 uint64_t ScaleAmt = 1ULL << ShAmt;
2501 if ((AM.Scale * ScaleAmt) <= 8 &&
2502 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2503 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2504 AM.Scale *= ScaleAmt;
2505 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2506 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2507 N2: AddSrc.getOperand(i: 1));
2508 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2509 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2510 AddSrc = ExtShift;
2511 Res = ExtShVal;
2512 }
2513 }
2514 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2515 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2516 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2517 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2518 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2519 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2520 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2521 CurDAG->RemoveDeadNode(N: N.getNode());
2522 return Res ? Res : ExtSrc;
2523 }
2524 }
2525 }
2526 }
2527
2528 // TODO: Handle extensions, shifted masks etc.
2529 return N;
2530}
2531
/// Recursively walk the expression tree rooted at N, folding as much of it as
/// possible into the addressing mode AM. Follows the match* convention used
/// throughout this file: returns false on success (N was fully absorbed into
/// AM) and true on failure. On failure the top-level caller keeps the backup
/// AM it saved before the call.
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it. Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements. It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
      if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    // Fold a local-recover label as the symbolic displacement, but only if
    // neither a symbol nor a displacement has been matched yet.
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    // A frame index can serve as the base, provided the base slot is free and
    // the accumulated displacement is representable for a frame-index base.
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // Shift-by-constant becomes the scale; the index slot must still be free.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        SDValue ShVal = N.getOperand(i: 0);
        AM.Scale = 1 << Val;
        AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(i: 0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(i: 0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
        !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    [[fallthrough]];
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          // Scale holds the multiplier minus one; the base carries the
          // remaining "+X" term (both set to the same register below).
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(i: 0);
          SDValue Reg;

          // Okay, we know that we have a scale by now. However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
            Reg = MulVal.getOperand(i: 0);
            auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Offset: Disp, AM))
              Reg = N.getOperand(i: 0);
          } else {
            Reg = N.getOperand(i: 0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    // Cost accounting: positive cost means the transform is a net loss.
    int Cost = 0;
    SDValue RHS = N.getOperand(i: 1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(i: 0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::OR:
  case ISD::XOR:
    // See if we can treat the OR/XOR node as an ADD node.
    if (!CurDAG->isADDLike(Op: N))
      break;
    [[fallthrough]];
  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
      break;

    if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(i: 0);
      SDValue X = Shift.getOperand(i: 0);

      uint64_t Mask = N.getConstantOperandVal(i: 1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
      return false;

    break;
  }
  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    SDValue Src = N.getOperand(i: 0);

    // See if we can match a zext(addlike(x,c)).
    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
      if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
        if (Index != N) {
          AM.IndexReg = Index;
          return false;
        }

    // Peek through mask: zext(and(shl(x,c1),c2))
    APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
    if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
      if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
        Mask = MaskC->getAPIntValue();
        Src = Src.getOperand(i: 0);
      }

    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
      // Give up if the shift is not a valid scale factor [1,2,3].
      SDValue ShlSrc = Src.getOperand(i: 0);
      SDValue ShlAmt = Src.getOperand(i: 1);
      auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
      if (!ShAmtC)
        break;
      unsigned ShAmtV = ShAmtC->getZExtValue();
      if (ShAmtV > 3)
        break;

      // The narrow shift must only shift out zero bits (it must be 'nuw').
      // That makes it safe to widen to the destination type.
      APInt HighZeros =
          APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
      if (!Src->getFlags().hasNoUnsignedWrap() &&
          !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
        break;

      // zext (shl nuw i8 %x, C1) to i32
      // --> shl (zext i8 %x to i32), (zext C1)
      // zext (and (shl nuw i8 %x, C1), C2) to i32
      // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
      MVT SrcVT = ShlSrc.getSimpleValueType();
      MVT VT = N.getSimpleValueType();
      SDLoc DL(N);

      SDValue Res = ShlSrc;
      if (!Mask.isAllOnes()) {
        Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
        Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
      }
      SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
      SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
      CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
      CurDAG->RemoveDeadNode(N: N.getNode());

      // Convert the shift to scale factor.
      AM.Scale = 1 << ShAmtV;
      // If matchIndexRecursively is not called here,
      // Zext may be replaced by other nodes but later used to call a builder
      // method
      AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
      return false;
    }

    if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                     X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                   X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                  X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
        return false;
    }

    break;
  }
  }

  // No pattern matched: fall back to using N directly as base or index.
  return matchAddressBase(N, AM);
}
2898
2899/// Helper for MatchAddress. Add the specified node to the
2900/// specified addressing mode without any further recursion.
2901bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2902 // Is the base register already occupied?
2903 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2904 // If so, check to see if the scale index register is set.
2905 if (!AM.IndexReg.getNode()) {
2906 AM.IndexReg = N;
2907 AM.Scale = 1;
2908 return false;
2909 }
2910
2911 // Otherwise, we cannot select it.
2912 return true;
2913 }
2914
2915 // Default, generate it as a register.
2916 AM.BaseType = X86ISelAddressMode::RegBase;
2917 AM.Base_Reg = N;
2918 return false;
2919}
2920
/// Recursively fold the scalar base pointer of a gather/scatter into AM.
/// Only a small subset of matchAddressRecursively's patterns are supported.
/// Returns false on success (match* convention), true on failure.
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
                                                    X86ISelAddressMode &AM,
                                                    unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchVectorAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    // Fold an immediate into the displacement if it fits.
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  case ISD::ADD: {
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Try to fold both operands; a failed attempt may leave AM partially
    // updated, so it is restored from Backup before any further attempt.
    X86ISelAddressMode Backup = AM;
    if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Try again after commuting the operands.
    if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Re-fetch N in case it was CSE'd during the recursive matches above.
    N = Handle.getValue();
    break;
  }
  }

  // No pattern matched: use N directly as base or index.
  return matchAddressBase(N, AM);
}
2971
2972/// Helper for selectVectorAddr. Handles things that can be folded into a
2973/// gather/scatter address. The index register and scale should have already
2974/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  // Kick off the recursion at depth 0; returns true on failure (match*
  // convention).
  return matchVectorAddressRecursively(N, AM, Depth: 0);
}
2978
2979bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2980 SDValue IndexOp, SDValue ScaleOp,
2981 SDValue &Base, SDValue &Scale,
2982 SDValue &Index, SDValue &Disp,
2983 SDValue &Segment) {
2984 X86ISelAddressMode AM;
2985 AM.Scale = ScaleOp->getAsZExtVal();
2986
2987 // Attempt to match index patterns, as long as we're not relying on implicit
2988 // sign-extension, which is performed BEFORE scale.
2989 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2990 AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
2991 else
2992 AM.IndexReg = IndexOp;
2993
2994 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2995 if (AddrSpace == X86AS::GS)
2996 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
2997 if (AddrSpace == X86AS::FS)
2998 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
2999 if (AddrSpace == X86AS::SS)
3000 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
3001
3002 SDLoc DL(BasePtr);
3003 MVT VT = BasePtr.getSimpleValueType();
3004
3005 // Try to match into the base and displacement fields.
3006 if (matchVectorAddress(N: BasePtr, AM))
3007 return false;
3008
3009 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3010 return true;
3011}
3012
3013/// Returns true if it is able to pattern match an addressing mode.
3014/// It returns the operands which make up the maximal addressing mode it can
3015/// match by reference.
3016///
3017/// Parent is the parent node of the addr operand that is being matched. It
3018/// is always a load, store, atomic node, or null. It is only null when
3019/// checking memory operands for inline asm nodes.
3020bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
3021 SDValue &Scale, SDValue &Index,
3022 SDValue &Disp, SDValue &Segment) {
3023 X86ISelAddressMode AM;
3024
3025 if (Parent &&
3026 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
3027 // that are not a MemSDNode, and thus don't have proper addrspace info.
3028 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3029 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3030 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3031 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3032 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3033 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3034 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3035 unsigned AddrSpace =
3036 cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
3037 if (AddrSpace == X86AS::GS)
3038 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
3039 if (AddrSpace == X86AS::FS)
3040 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
3041 if (AddrSpace == X86AS::SS)
3042 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
3043 }
3044
3045 // Save the DL and VT before calling matchAddress, it can invalidate N.
3046 SDLoc DL(N);
3047 MVT VT = N.getSimpleValueType();
3048
3049 if (matchAddress(N, AM))
3050 return false;
3051
3052 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3053 return true;
3054}
3055
3056bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3057 // Cannot use 32 bit constants to reference objects in kernel/large code
3058 // model.
3059 if (TM.getCodeModel() == CodeModel::Kernel ||
3060 TM.getCodeModel() == CodeModel::Large)
3061 return false;
3062
3063 // In static codegen with small code model, we can get the address of a label
3064 // into a register with 'movl'
3065 if (N->getOpcode() != X86ISD::Wrapper)
3066 return false;
3067
3068 N = N.getOperand(i: 0);
3069
3070 // At least GNU as does not accept 'movl' for TPOFF relocations.
3071 // FIXME: We could use 'movl' when we know we are targeting MC.
3072 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3073 return false;
3074
3075 Imm = N;
3076 // Small/medium code model can reference non-TargetGlobalAddress objects with
3077 // 32 bit constants.
3078 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3079 return TM.getCodeModel() == CodeModel::Small ||
3080 TM.getCodeModel() == CodeModel::Medium;
3081 }
3082
3083 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
3084 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3085 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
3086
3087 return !TM.isLargeGlobalValue(GV);
3088}
3089
/// Match N as an LEA address and widen any narrow (8/16/32-bit) base and
/// index registers to i64 by inserting them into an IMPLICIT_DEF, so the
/// operands are usable by a 64-bit LEA. Returns true on success.
bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  // Pick the subregister index matching the width of the matched base; the
  // same index is reused for the index register (asserted equal below).
  EVT BaseType = Base.getValueType();
  unsigned SubReg;
  if (BaseType == MVT::i8)
    SubReg = X86::sub_8bit;
  else if (BaseType == MVT::i16)
    SubReg = X86::sub_16bit;
  else
    SubReg = X86::sub_32bit;

  // A zero register stays the "no base" marker, just at i64 width.
  auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
  if (RN && RN->getReg() == 0)
    Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
  else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
            BaseType == MVT::i32) &&
           !isa<FrameIndexSDNode>(Val: Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
                                                     VT: MVT::i64), 0);
    Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base);
  }

  // Same treatment for the index register.
  [[maybe_unused]] EVT IndexType = Index.getValueType();
  RN = dyn_cast<RegisterSDNode>(Val&: Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
  else {
    assert((IndexType == BaseType) &&
           "Expect to be extending 8/16/32-bit registers for use in LEA");
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
                                                     VT: MVT::i64), 0);
    Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index);
  }

  return true;
}
3134
3135/// Calls SelectAddr and determines if the maximal addressing
3136/// mode it matches can be cost effectively emitted as an LEA instruction.
/// Match N as an addressing mode and accept it only when a heuristic
/// "Complexity" score says an LEA is cost-effective (score must exceed 2).
/// Returns true on success with the LEA operands filled in.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert (T == AM.Segment);
  AM.Segment = Copy;

  // Score the matched mode: each useful component adds to Complexity; a
  // frame-index base or a 64-bit symbolic displacement is an automatic win.
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
      Complexity++;
  }

  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3224
3225/// This is only run on TargetGlobalTLSAddress nodes.
3226bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3227 SDValue &Scale, SDValue &Index,
3228 SDValue &Disp, SDValue &Segment) {
3229 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3230 N.getOpcode() == ISD::TargetExternalSymbol);
3231
3232 X86ISelAddressMode AM;
3233 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3234 AM.GV = GA->getGlobal();
3235 AM.Disp += GA->getOffset();
3236 AM.SymbolFlags = GA->getTargetFlags();
3237 } else {
3238 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3239 AM.ES = SA->getSymbol();
3240 AM.SymbolFlags = SA->getTargetFlags();
3241 }
3242
3243 if (Subtarget->is32Bit()) {
3244 AM.Scale = 1;
3245 AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32);
3246 }
3247
3248 MVT VT = N.getSimpleValueType();
3249 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3250 return true;
3251}
3252
3253bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3254 // Keep track of the original value type and whether this value was
3255 // truncated. If we see a truncation from pointer type to VT that truncates
3256 // bits that are known to be zero, we can use a narrow reference.
3257 EVT VT = N.getValueType();
3258 bool WasTruncated = false;
3259 if (N.getOpcode() == ISD::TRUNCATE) {
3260 WasTruncated = true;
3261 N = N.getOperand(i: 0);
3262 }
3263
3264 if (N.getOpcode() != X86ISD::Wrapper)
3265 return false;
3266
3267 // We can only use non-GlobalValues as immediates if they were not truncated,
3268 // as we do not have any range information. If we have a GlobalValue and the
3269 // address was not truncated, we can select it as an operand directly.
3270 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3271 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3272 Op = N.getOperand(i: 0);
3273 // We can only select the operand directly if we didn't have to look past a
3274 // truncate.
3275 return !WasTruncated;
3276 }
3277
3278 // Check that the global's range fits into VT.
3279 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3280 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3281 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3282 return false;
3283
3284 // Okay, we can use a narrow reference.
3285 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3286 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3287 return true;
3288}
3289
3290bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3291 SDValue &Base, SDValue &Scale,
3292 SDValue &Index, SDValue &Disp,
3293 SDValue &Segment) {
3294 assert(Root && P && "Unknown root/parent nodes");
3295 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3296 !IsProfitableToFold(N, U: P, Root) ||
3297 !IsLegalToFold(N, U: P, Root, OptLevel))
3298 return false;
3299
3300 return selectAddr(Parent: N.getNode(),
3301 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3302}
3303
3304bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3305 SDValue &Base, SDValue &Scale,
3306 SDValue &Index, SDValue &Disp,
3307 SDValue &Segment) {
3308 assert(Root && P && "Unknown root/parent nodes");
3309 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3310 !IsProfitableToFold(N, U: P, Root) ||
3311 !IsLegalToFold(N, U: P, Root, OptLevel))
3312 return false;
3313
3314 return selectAddr(Parent: N.getNode(),
3315 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3316}
3317
3318/// Return an SDNode that returns the value of the global base register.
3319/// Output instructions required to initialize the global base register,
3320/// if necessary.
3321SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3322 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3323 auto &DL = MF->getDataLayout();
3324 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3325}
3326
/// Return true if the (possibly truncated) wrapped global address in \p N is
/// known to fit in a sign-extended \p Width-bit immediate.
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  // Look through a truncate of the wrapped address, if present.
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(Num: 0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  // The wrapper must wrap a global address; other wrapped operands
  // (constant pools, jump tables, ...) are rejected.
  auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  // Absolute symbols carry an explicit value range; check that the whole
  // range fits in a sign-extended Width-bit immediate.
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    return CR->getSignedMin().sge(RHS: -1ull << Width) &&
           CR->getSignedMax().slt(RHS: 1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
         !TM.isLargeGlobalValue(GV);
}
3349
3350X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3351 assert(N->isMachineOpcode() && "Unexpected node");
3352 unsigned Opc = N->getMachineOpcode();
3353 const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc);
3354 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3355 if (CondNo < 0)
3356 return X86::COND_INVALID;
3357
3358 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3359}
3360
3361/// Test whether the given X86ISD::CMP node has any users that use a flag
3362/// other than ZF.
3363bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3364 // Examine each user of the node.
3365 for (SDUse &Use : Flags->uses()) {
3366 // Only check things that use the flags.
3367 if (Use.getResNo() != Flags.getResNo())
3368 continue;
3369 SDNode *User = Use.getUser();
3370 // Only examine CopyToReg uses that copy to EFLAGS.
3371 if (User->getOpcode() != ISD::CopyToReg ||
3372 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3373 return false;
3374 // Examine each user of the CopyToReg use.
3375 for (SDUse &FlagUse : User->uses()) {
3376 // Only examine the Flag result.
3377 if (FlagUse.getResNo() != 1)
3378 continue;
3379 // Anything unusual: assume conservatively.
3380 if (!FlagUse.getUser()->isMachineOpcode())
3381 return false;
3382 // Examine the condition code of the user.
3383 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3384
3385 switch (CC) {
3386 // Comparisons which only use the zero flag.
3387 case X86::COND_E: case X86::COND_NE:
3388 continue;
3389 // Anything else: assume conservatively.
3390 default:
3391 return false;
3392 }
3393 }
3394 }
3395 return true;
3396}
3397
3398/// Test whether the given X86ISD::CMP node has any uses which require the SF
3399/// flag to be accurate.
3400bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3401 // Examine each user of the node.
3402 for (SDUse &Use : Flags->uses()) {
3403 // Only check things that use the flags.
3404 if (Use.getResNo() != Flags.getResNo())
3405 continue;
3406 SDNode *User = Use.getUser();
3407 // Only examine CopyToReg uses that copy to EFLAGS.
3408 if (User->getOpcode() != ISD::CopyToReg ||
3409 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3410 return false;
3411 // Examine each user of the CopyToReg use.
3412 for (SDUse &FlagUse : User->uses()) {
3413 // Only examine the Flag result.
3414 if (FlagUse.getResNo() != 1)
3415 continue;
3416 // Anything unusual: assume conservatively.
3417 if (!FlagUse.getUser()->isMachineOpcode())
3418 return false;
3419 // Examine the condition code of the user.
3420 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3421
3422 switch (CC) {
3423 // Comparisons which don't examine the SF flag.
3424 case X86::COND_A: case X86::COND_AE:
3425 case X86::COND_B: case X86::COND_BE:
3426 case X86::COND_E: case X86::COND_NE:
3427 case X86::COND_O: case X86::COND_NO:
3428 case X86::COND_P: case X86::COND_NP:
3429 continue;
3430 // Anything else: assume conservatively.
3431 default:
3432 return false;
3433 }
3434 }
3435 }
3436 return true;
3437}
3438
3439static bool mayUseCarryFlag(X86::CondCode CC) {
3440 switch (CC) {
3441 // Comparisons which don't examine the CF flag.
3442 case X86::COND_O: case X86::COND_NO:
3443 case X86::COND_E: case X86::COND_NE:
3444 case X86::COND_S: case X86::COND_NS:
3445 case X86::COND_P: case X86::COND_NP:
3446 case X86::COND_L: case X86::COND_GE:
3447 case X86::COND_G: case X86::COND_LE:
3448 return false;
3449 // Anything else: assume conservatively.
3450 default:
3451 return true;
3452 }
3453}
3454
3455/// Test whether the given node which sets flags has any uses which require the
3456/// CF flag to be accurate.
3457 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3458 // Examine each user of the node.
3459 for (SDUse &Use : Flags->uses()) {
3460 // Only check things that use the flags.
3461 if (Use.getResNo() != Flags.getResNo())
3462 continue;
3463
3464 SDNode *User = Use.getUser();
3465 unsigned UserOpc = User->getOpcode();
3466
3467 if (UserOpc == ISD::CopyToReg) {
3468 // Only examine CopyToReg uses that copy to EFLAGS.
3469 if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3470 return false;
3471 // Examine each user of the CopyToReg use.
3472 for (SDUse &FlagUse : User->uses()) {
3473 // Only examine the Flag result.
3474 if (FlagUse.getResNo() != 1)
3475 continue;
3476 // Anything unusual: assume conservatively.
3477 if (!FlagUse.getUser()->isMachineOpcode())
3478 return false;
3479 // Examine the condition code of the user.
3480 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3481
3482 if (mayUseCarryFlag(CC))
3483 return false;
3484 }
3485
3486 // This CopyToReg is ok. Move on to the next user.
3487 continue;
3488 }
3489
3490 // This might be an unselected node. So look for the pre-isel opcodes that
3491 // use flags.
3492 unsigned CCOpNo;
3493 switch (UserOpc) {
3494 default:
3495 // Something unusual. Be conservative.
3496 return false;
3497 case X86ISD::SETCC: CCOpNo = 0; break;
3498 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3499 case X86ISD::CMOV: CCOpNo = 2; break;
3500 case X86ISD::BRCOND: CCOpNo = 2; break;
3501 }
3502
3503 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo);
3504 if (mayUseCarryFlag(CC))
3505 return false;
3506 }
3507 return true;
3508}
3509
/// For a TC_RETURN whose callee address is loaded from memory, conservatively
/// check that enough tail-call-usable GPRs remain (after the register
/// arguments are placed) to materialize the load's address and result.
bool X86DAGToDAGISel::checkTCRetEnoughRegs(SDNode *N) const {
  // Check that there are enough volatile registers to load the callee address.

  const X86RegisterInfo *RI = Subtarget->getRegisterInfo();
  unsigned AvailGPRs;
  // The register classes below must stay in sync with what's used for
  // TCRETURNri, TCRETURN_HIPE32ri, TCRETURN_WIN64ri, etc.
  if (Subtarget->is64Bit()) {
    const TargetRegisterClass *TCGPRs =
        Subtarget->isCallingConvWin64(CC: MF->getFunction().getCallingConv())
            ? &X86::GR64_TCW64RegClass
            : &X86::GR64_TCRegClass;
    // Can't use RSP or RIP for the load in general.
    assert(TCGPRs->contains(X86::RSP));
    assert(TCGPRs->contains(X86::RIP));
    AvailGPRs = TCGPRs->getNumRegs() - 2;
  } else {
    const TargetRegisterClass *TCGPRs =
        MF->getFunction().getCallingConv() == CallingConv::HiPE
            ? &X86::GR32RegClass
            : &X86::GR32_TCRegClass;
    // Can't use ESP for the address in general.
    assert(TCGPRs->contains(X86::ESP));
    AvailGPRs = TCGPRs->getNumRegs() - 1;
  }

  // The load's base and index need up to two registers.
  unsigned LoadGPRs = 2;

  assert(N->getOpcode() == X86ISD::TC_RETURN);
  // X86tcret args: (*chain, ptr, imm, regs..., glue)

  if (Subtarget->is32Bit()) {
    // FIXME: This was carried from X86tcret_1reg which was used for 32-bit,
    // but it could apply to 64-bit too.
    const SDValue &BasePtr = cast<LoadSDNode>(Val: N->getOperand(Num: 1))->getBasePtr();
    if (isa<FrameIndexSDNode>(Val: BasePtr)) {
      LoadGPRs -= 2; // Base is fixed index off ESP; no regs needed.
    } else if (BasePtr.getOpcode() == X86ISD::Wrapper &&
               isa<GlobalAddressSDNode>(Val: BasePtr->getOperand(Num: 0))) {
      assert(!getTargetMachine().isPositionIndependent());
      LoadGPRs -= 1; // Base is a global (immediate since this is non-PIC), no
                     // reg needed.
    }
  }

  // Count GPR arguments: each one occupies a register the load cannot use.
  // Fail as soon as the argument registers plus the load's register needs
  // exceed what the tail-call register class provides.
  unsigned ArgGPRs = 0;
  for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
    if (const auto *RN = dyn_cast<RegisterSDNode>(Val: N->getOperand(Num: I))) {
      if (!RI->isGeneralPurposeRegister(*MF, RN->getReg()))
        continue;
      if (++ArgGPRs + LoadGPRs > AvailGPRs)
        return false;
    }
  }

  return true;
}
3568
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
///
/// \param StoreNode  the store terminating the candidate chain.
/// \param StoredVal  the arithmetic node whose result is stored.
/// \param CurDAG     the DAG being selected.
/// \param LoadOpNo   which operand of \p StoredVal is expected to be the load.
/// \param LoadNode   [out] the matched load node.
/// \param InputChain [out] the token factor to use as the fused node's chain.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(N: Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Val&: Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  // Bound the predecessor search below to keep compile time under control.
  const unsigned int Max = 1024;

  // Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //    *-lines = Chain operand dependencies.
  //    |-lines = Normal operand dependencies.
  //    Dependencies flow down and right. n-suffix references multiple nodes.
  //
  //        C                        Xn  C
  //        *                         *  *
  //        *                          * *
  //  Xn  A-LD    Yn                    TF         Yn
  //   *    * \   |                       *        |
  //    *   *  \  |                        *       |
  //     *  *   \ |             =>       A--LD_OP_ST
  //      * *    \|                                 \
  //       TF    OP                                  \
  //         *   | \                                  Zn
  //          *  |  \
  //         A-ST    Zn
  //

  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //                                      #2: Yn -> LD
  //                                      #3: ST -> Zn

  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.

  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.

  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).

  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(R: 1)) {
    FoundLoad = true;
    ChainOps.push_back(Elt: Load.getOperand(i: 0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(R: 1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Elt: Load.getOperand(i: 0));
        continue;
      }
      LoopWorklist.push_back(Elt: Op.getNode());
      ChainOps.push_back(Elt: Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Elt: Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
                                   TopologicalPrune: true))
    return false;

  // Safe: build the token factor that replaces the store's chain input.
  InputChain =
      CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps);
  return true;
}
3682
3683// Change a chain of {load; op; store} of the same value into a simple op
3684// through memory of that value, if the uses of the modified value and its
3685// address are suitable.
3686//
3687// The tablegen pattern memory operand pattern is currently not able to match
3688// the case where the EFLAGS on the original operation are used.
3689//
3690// To move this to tablegen, we'll need to improve tablegen to allow flags to
3691// be transferred from a node in the pattern to the result node, probably with
3692// a new keyword. For example, we have this
3693// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3694// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3695// but maybe need something like this
3696// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3697// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3698// (transferrable EFLAGS)]>;
3699//
3700// Until then, we manually fold these and instruction select the operation
3701// here.
// Returns true if Node (a store) was replaced by a fused memory-form
// arithmetic machine node; on success all chain/value uses are rewired.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  auto *StoreNode = cast<StoreSDNode>(Val: Node);
  SDValue StoredVal = StoreNode->getOperand(Num: 1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    // A subtract from zero is a negate; matched to NEGm below.
    IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  // For a negate (0 - x) the load is operand 1; otherwise try operand 0
  // first and, for commutable ops, fall back to operand 1 below.
  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  // Decompose the shared load/store address into an X86 addressing mode.
  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  // Pick the opcode variant matching the width of the memory operand.
  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!");
    }
  };

  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
                                      VT2: MVT::Other, Ops);
      break;
    }
   [[fallthrough]];
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
      bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
      bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        unsigned NewOpc =
          ((Opc == X86ISD::ADD) == IsOne)
              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
                                        VT2: MVT::Other, Ops);
        break;
      }
    }
    [[fallthrough]];
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    // Map the DAG opcode to the memory-register instruction of MemVT's width.
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    // Map the DAG opcode to the memory-immediate instruction of MemVT's width.
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };

    unsigned NewOpc = SelectRegOpcode(Opc);
    // The non-load operand of the arithmetic node.
    SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(x: OperandV) &&
            isInt<32>(x: -OperandV))) &&
          hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) {
        Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      // ADC/SBB consume the incoming carry: thread the producing flags into
      // EFLAGS via a glued CopyToReg so the fused node reads them.
      SDValue CopyTo =
          CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS,
                               N: StoredVal.getOperand(i: 2), Glue: SDValue());

      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }

  // Transfer memory operands from both the store and the load to the fused
  // instruction.
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
  CurDAG->RemoveDeadNode(N: Node);
  return true;
}
3918
3919// See if this is an X & Mask that we can match to BEXTR/BZHI.
3920// Where Mask is one of the following patterns:
3921// a) x & (1 << nbits) - 1
3922// b) x & ~(-1 << nbits)
3923// c) x & (-1 >> (32 - y))
3924// d) x << (32 - y) >> (32 - y)
3925// e) (1 << nbits) - 1
3926bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3927 assert(
3928 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3929 Node->getOpcode() == ISD::SRL) &&
3930 "Should be either an and-mask, or right-shift after clearing high bits.");
3931
3932 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3933 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3934 return false;
3935
3936 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3937
3938 // Only supported for 32 and 64 bits.
3939 if (NVT != MVT::i32 && NVT != MVT::i64)
3940 return false;
3941
3942 SDValue NBits;
3943 bool NegateNBits;
3944
3945 // If we have BMI2's BZHI, we are ok with muti-use patterns.
3946 // Else, if we only have BMI1's BEXTR, we require one-use.
3947 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3948 auto checkUses = [AllowExtraUsesByDefault](
3949 SDValue Op, unsigned NUses,
3950 std::optional<bool> AllowExtraUses) {
3951 return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) ||
3952 Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo());
3953 };
3954 auto checkOneUse = [checkUses](SDValue Op,
3955 std::optional<bool> AllowExtraUses =
3956 std::nullopt) {
3957 return checkUses(Op, 1, AllowExtraUses);
3958 };
3959 auto checkTwoUse = [checkUses](SDValue Op,
3960 std::optional<bool> AllowExtraUses =
3961 std::nullopt) {
3962 return checkUses(Op, 2, AllowExtraUses);
3963 };
3964
3965 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3966 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3967 assert(V.getSimpleValueType() == MVT::i32 &&
3968 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3969 "Expected i64 -> i32 truncation");
3970 V = V.getOperand(i: 0);
3971 }
3972 return V;
3973 };
3974
3975 // a) x & ((1 << nbits) + (-1))
3976 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3977 &NegateNBits](SDValue Mask) -> bool {
3978 // Match `add`. Must only have one use!
3979 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3980 return false;
3981 // We should be adding all-ones constant (i.e. subtracting one.)
3982 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3983 return false;
3984 // Match `1 << nbits`. Might be truncated. Must only have one use!
3985 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3986 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3987 return false;
3988 if (!isOneConstant(V: M0->getOperand(Num: 0)))
3989 return false;
3990 NBits = M0->getOperand(Num: 1);
3991 NegateNBits = false;
3992 return true;
3993 };
3994
3995 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3996 V = peekThroughOneUseTruncation(V);
3997 return CurDAG->MaskedValueIsAllOnes(
3998 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
3999 loBitsSet: NVT.getSizeInBits()));
4000 };
4001
4002 // b) x & ~(-1 << nbits)
4003 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
4004 &NBits, &NegateNBits](SDValue Mask) -> bool {
4005 // Match `~()`. Must only have one use!
4006 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
4007 return false;
4008 // The -1 only has to be all-ones for the final Node's NVT.
4009 if (!isAllOnes(Mask->getOperand(Num: 1)))
4010 return false;
4011 // Match `-1 << nbits`. Might be truncated. Must only have one use!
4012 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
4013 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4014 return false;
4015 // The -1 only has to be all-ones for the final Node's NVT.
4016 if (!isAllOnes(M0->getOperand(Num: 0)))
4017 return false;
4018 NBits = M0->getOperand(Num: 1);
4019 NegateNBits = false;
4020 return true;
4021 };
4022
4023 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
4024 // or leave the shift amount as-is, but then we'll have to negate it.
4025 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
4026 unsigned Bitwidth) {
4027 NBits = ShiftAmt;
4028 NegateNBits = true;
4029 // Skip over a truncate of the shift amount, if any.
4030 if (NBits.getOpcode() == ISD::TRUNCATE)
4031 NBits = NBits.getOperand(i: 0);
4032 // Try to match the shift amount as (bitwidth - y). It should go away, too.
4033 // If it doesn't match, that's fine, we'll just negate it ourselves.
4034 if (NBits.getOpcode() != ISD::SUB)
4035 return;
4036 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
4037 if (!V0 || V0->getZExtValue() != Bitwidth)
4038 return;
4039 NBits = NBits.getOperand(i: 1);
4040 NegateNBits = false;
4041 };
4042
4043 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
4044 // or
4045 // c) x & (-1 >> (32 - y))
4046 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
4047 canonicalizeShiftAmt](SDValue Mask) -> bool {
4048 // The mask itself may be truncated.
4049 Mask = peekThroughOneUseTruncation(Mask);
4050 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
4051 // Match `l>>`. Must only have one use!
4052 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
4053 return false;
4054 // We should be shifting truly all-ones constant.
4055 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
4056 return false;
4057 SDValue M1 = Mask.getOperand(i: 1);
4058 // The shift amount should not be used externally.
4059 if (!checkOneUse(M1))
4060 return false;
4061 canonicalizeShiftAmt(M1, Bitwidth);
4062 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
4063 // is no extra use of the mask. Clearly, there was one since we are here.
4064 // But at the same time, if we need to negate the shift amount,
4065 // then we don't want the mask to stick around, else it's unprofitable.
4066 return !NegateNBits;
4067 };
4068
4069 SDValue X;
4070
4071 // d) x << z >> z but then we'll have to subtract z from bitwidth
4072 // or
4073 // d) x << (32 - y) >> (32 - y)
4074 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
4075 AllowExtraUsesByDefault, &NegateNBits,
4076 &X](SDNode *Node) -> bool {
4077 if (Node->getOpcode() != ISD::SRL)
4078 return false;
4079 SDValue N0 = Node->getOperand(Num: 0);
4080 if (N0->getOpcode() != ISD::SHL)
4081 return false;
4082 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
4083 SDValue N1 = Node->getOperand(Num: 1);
4084 SDValue N01 = N0->getOperand(Num: 1);
4085 // Both of the shifts must be by the exact same value.
4086 if (N1 != N01)
4087 return false;
4088 canonicalizeShiftAmt(N1, Bitwidth);
4089 // There should not be any external uses of the inner shift / shift amount.
4090 // Note that while we are generally okay with external uses given BMI2,
4091 // iff we need to negate the shift amount, we are not okay with extra uses.
4092 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4093 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4094 return false;
4095 X = N0->getOperand(Num: 0);
4096 return true;
4097 };
4098
4099 auto matchLowBitMask = [matchPatternA, matchPatternB,
4100 matchPatternC](SDValue Mask) -> bool {
4101 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4102 };
4103
4104 if (Node->getOpcode() == ISD::AND) {
4105 X = Node->getOperand(Num: 0);
4106 SDValue Mask = Node->getOperand(Num: 1);
4107
4108 if (matchLowBitMask(Mask)) {
4109 // Great.
4110 } else {
4111 std::swap(a&: X, b&: Mask);
4112 if (!matchLowBitMask(Mask))
4113 return false;
4114 }
4115 } else if (matchLowBitMask(SDValue(Node, 0))) {
4116 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
4117 } else if (!matchPatternD(Node))
4118 return false;
4119
4120 // If we need to negate the shift amount, require BMI2 BZHI support.
4121 // It's just too unprofitable for BMI1 BEXTR.
4122 if (NegateNBits && !Subtarget->hasBMI2())
4123 return false;
4124
4125 SDLoc DL(Node);
4126
4127 if (NBits.getSimpleValueType() != MVT::i8) {
4128 // Truncate the shift amount.
4129 NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits);
4130 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4131 }
4132
4133 // Turn (i32)(x & imm8) into (i32)x & imm32.
4134 ConstantSDNode *Imm = nullptr;
4135 if (NBits->getOpcode() == ISD::AND)
4136 if ((Imm = dyn_cast<ConstantSDNode>(Val: NBits->getOperand(Num: 1))))
4137 NBits = NBits->getOperand(Num: 0);
4138
4139 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4140 // All the other bits are undefined, we do not care about them.
4141 SDValue ImplDef = SDValue(
4142 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0);
4143 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
4144
4145 SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32);
4146 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
4147 NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL,
4148 VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal),
4149 0);
4150 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4151
4152 if (Imm) {
4153 NBits =
4154 CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: NBits,
4155 N2: CurDAG->getConstant(Val: Imm->getZExtValue(), DL, VT: MVT::i32));
4156 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4157 }
4158
4159 // We might have matched the amount of high bits to be cleared,
4160 // but we want the amount of low bits to be kept, so negate it then.
4161 if (NegateNBits) {
4162 SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32);
4163 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
4164
4165 NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits);
4166 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4167 }
4168
4169 if (Subtarget->hasBMI2()) {
4170 // Great, just emit the BZHI..
4171 if (NVT != MVT::i32) {
4172 // But have to place the bit count into the wide-enough register first.
4173 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
4174 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4175 }
4176
4177 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4178 ReplaceNode(F: Node, T: Extract.getNode());
4179 SelectCode(N: Extract.getNode());
4180 return true;
4181 }
4182
4183 // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is
4184 // *logically* shifted (potentially with one-use trunc inbetween),
4185 // and the truncation was the only use of the shift,
4186 // and if so look past one-use truncation.
4187 {
4188 SDValue RealX = peekThroughOneUseTruncation(X);
4189 // FIXME: only if the shift is one-use?
4190 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4191 X = RealX;
4192 }
4193
4194 MVT XVT = X.getSimpleValueType();
4195
4196 // Else, emitting BEXTR requires one more step.
4197 // The 'control' of BEXTR has the pattern of:
4198 // [15...8 bit][ 7...0 bit] location
4199 // [ bit count][ shift] name
4200 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4201
4202 // Shift NBits left by 8 bits, thus producing 'control'.
4203 // This makes the low 8 bits to be zero.
4204 SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8);
4205 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
4206 SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8);
4207 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4208
4209 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4210 // FIXME: only if the shift is one-use?
4211 if (X.getOpcode() == ISD::SRL) {
4212 SDValue ShiftAmt = X.getOperand(i: 1);
4213 X = X.getOperand(i: 0);
4214
4215 assert(ShiftAmt.getValueType() == MVT::i8 &&
4216 "Expected shift amount to be i8");
4217
4218 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4219 // We could zext to i16 in some form, but we intentionally don't do that.
4220 SDValue OrigShiftAmt = ShiftAmt;
4221 ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt);
4222 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4223
4224 // And now 'or' these low 8 bits of shift amount into the 'control'.
4225 Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt);
4226 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4227 }
4228
4229 // But have to place the 'control' into the wide-enough register first.
4230 if (XVT != MVT::i32) {
4231 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4232 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4233 }
4234
4235 // And finally, form the BEXTR itself.
4236 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4237
4238 // The 'X' was originally truncated. Do that now.
4239 if (XVT != NVT) {
4240 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4241 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4242 }
4243
4244 ReplaceNode(F: Node, T: Extract.getNode());
4245 SelectCode(N: Extract.getNode());
4246
4247 return true;
4248}
4249
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
// On success, returns the newly created machine node (a BEXTR/BEXTRI, or a
// BZHI followed by a SHR when BEXTR is not profitable); returns nullptr when
// the pattern does not apply and normal selection should proceed.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(ResNo: 0);
  SDLoc dl(Node);

  // Node is the AND; N0 is its (shifted) value operand, N1 the mask operand.
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Value: Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  // Number of contiguous low set bits in the mask, i.e. the extract width.
  uint64_t MaskSize = llvm::popcount(Value: Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  // Control value for the extract; ROpc/MOpc are the register-operand and
  // memory-operand (load-folding) opcode forms respectively.
  SDValue Control;
  unsigned ROpc, MOpc;

// Select the EVEX-encoded variant when the subtarget has extended GPRs.
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    // BZHI takes its control in a register; materialize it with a MOV.
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
    if (Subtarget->hasTBM()) {
      // TBM's BEXTRI takes the control as an immediate directly.
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(Num: 0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  // Try to fold the shifted value's load into the new instruction.
  if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
    SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
    NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
  }

  return NewNode;
}
4375
// Emit a PCMPISTR(I/M) instruction. ROpc is the register-operand form and
// MOpc the memory-operand form; the latter is used when the second source
// operand can be folded as a load (only attempted when MayFoldLoad is set).
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);
  // Re-materialize the immediate as a target constant so it is not selected
  // into a separate instruction.
  SDValue Imm = Node->getOperand(Num: 2);
  auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(i: 0) };
    SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    return CNode;
  }

  // No load to fold: emit the plain register form.
  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
  return CNode;
}
4405
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
// InGlue is both the incoming glue operand and, on return, the glue result of
// the new node.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InGlue) {
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N2 = Node->getOperand(Num: 2);
  // Re-materialize the immediate as a target constant so it is not selected
  // into a separate instruction.
  SDValue Imm = Node->getOperand(Num: 4);
  auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(i: 0), InGlue };
    SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Hand the glue result back to the caller.
    InGlue = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
    return CNode;
  }

  // No load to fold: emit the plain register form.
  SDValue Ops[] = { N0, N2, Imm, InGlue };
  SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
  InGlue = SDValue(CNode, 2);
  return CNode;
}
4440
// Try to simplify the shift-amount operand of shift/rotate node N by
// exploiting the hardware's implicit masking of the amount (low 5 bits for
// 32-bit ops, low 6 bits for 64-bit ops):
//  - shifting by X+/-/^C where C == 0 (mod Size) is the same as shifting by X;
//  - shifting by (Size*K-1) -/^ X can use a NOT of X instead;
//  - shifting by C - X with C == 0 (mod Size) can use a NEG of X instead.
// Returns true if the node was updated and re-selected.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(Num: 1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(Num: 0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
      ShiftAmt->getOpcode() == ISD::XOR) {
    SDValue Add0 = ShiftAmt->getOperand(Num: 0);
    SDValue Add1 = ShiftAmt->getOperand(Num: 1);
    auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
    auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
    // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB/XOR.
    if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
      NewShiftAmt = Add0;

    } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
               ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
                (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
      // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
      // we can replace it with a NOT. In the XOR case it may save some code
      // size, in the SUB case it also may save a move.
      assert(Add0C == nullptr || Add1C == nullptr);

      // We can only do N-X, not X-N
      if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
        return false;

      EVT OpVT = ShiftAmt.getValueType();

      // NOT is expressed as XOR with all-ones; isel folds it later.
      SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
      NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
                                    N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
      // If we are shifting by N-X where N == 0 mod Size, then just shift by
      // -X to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
               Add0C->getZExtValue() != 0) {
      EVT SubVT = ShiftAmt.getValueType();
      SDValue X;
      if (Add0C->getZExtValue() % Size == 0)
        X = Add1;
      else if (ShiftAmt.hasOneUse() && Size == 64 &&
               Add0C->getZExtValue() % 32 == 0) {
        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
        // This is mainly beneficial if we already compute (x+n*32).
        if (Add1.getOpcode() == ISD::TRUNCATE) {
          Add1 = Add1.getOperand(i: 0);
          SubVT = Add1.getValueType();
        }
        if (Add0.getValueType() != SubVT) {
          Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
          insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
        }

        X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
        insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
      } else
        return false;
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
      SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt,
                                N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
                                                   Op2: NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(F: N, T: UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
4567
// For a node of the form (x << C1) op C2 (op being AND/OR/XOR), try to use a
// smaller immediate encoding by rewriting it as (x op (C2 >> C1)) << C1.
// Returns true if the node was replaced and re-selected.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(x: Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  // Returns true (and sets ShiftedVal) if the shifted-down constant has a
  // cheaper encoding than the original one.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
        (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // This check is done late so as to defer the (potentially expensive)
  // MaskedValueIsZero call for as long as possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
                                            loBitsSet: ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(i: 0);
  if (FoundAnyExtend) {
    // Re-apply the any_extend we looked through above.
    SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
    insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
    X = NewX;
  }

  // Build (x op ShiftedVal) << ShAmt and replace the original node with it.
  SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
  SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
                                   N2: Shift.getOperand(i: 1));
  ReplaceNode(F: N, T: NewSHL.getNode());
  SelectCode(N: NewSHL.getNode());
  return true;
}
4680
// Emit a VPTERNLOG machine node computing the 3-input boolean function
// described by Imm over inputs A, B and C, and replace Root with it.
// ParentA/B/C are the nodes through which the respective operands were
// reached; they are passed to the load-folding helpers. Operand C may be
// folded as a load or a 32/64-bit broadcast load; if instead A or B is
// foldable, it is swapped into the C position and Imm's truth-table bits are
// remapped to match.
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm) {
  assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
         C.isOperandOf(ParentC) && "Incorrect parent node");

  // Match either a plain load or a 32/64-bit broadcast load (possibly hidden
  // behind a one-use bitcast) as a foldable memory operand.
  auto tryFoldLoadOrBCast =
      [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
             SDValue &Index, SDValue &Disp, SDValue &Segment) {
        if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
          return true;

        // Not a load, check for broadcast which may be behind a bitcast.
        if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
          P = L.getNode();
          L = L.getOperand(i: 0);
        }

        if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
          return false;

        // Only 32 and 64 bit broadcasts are supported.
        auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
        unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
        if (Size != 32 && Size != 64)
          return false;

        return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
      };

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    FoldedLoad = true;
  } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: A, b&: C);
    // Swap bits 1/4 and 3/6.
    // Imm bit k holds the function value for input combination (A,B,C) = the
    // binary digits of k; exchanging A and C exchanges those combinations.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0xa5;
    if (OldImm & 0x02) Imm |= 0x10;
    if (OldImm & 0x10) Imm |= 0x02;
    if (OldImm & 0x08) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x08;
  } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: B, b&: C);
    // Swap bits 1/2 and 5/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0x99;
    if (OldImm & 0x02) Imm |= 0x04;
    if (OldImm & 0x04) Imm |= 0x02;
    if (OldImm & 0x20) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x20;
  }

  SDLoc DL(Root);

  SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);

  MVT NVT = Root->getSimpleValueType(ResNo: 0);

  MachineSDNode *MNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);

    unsigned Opc;
    if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      // Broadcast folds use the element-size-specific "rmbi" forms.
      auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
      unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
      assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");

      bool UseD = EltSize == 32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
      else
        llvm_unreachable("Unexpected vector size!");
    } else {
      bool UseD = NVT.getVectorElementType() == MVT::i32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
      else
        llvm_unreachable("Unexpected vector size!");
    }

    SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);

    // Update the chain.
    ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
  } else {
    // All-register form.
    bool UseD = NVT.getVectorElementType() == MVT::i32;
    unsigned Opc;
    if (NVT.is128BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
    else if (NVT.is256BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
    else if (NVT.is512BitVector())
      Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
    else
      llvm_unreachable("Unexpected vector size!");

    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
4803
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle more complex patterns that use an operand more than once?
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  // Returns Op (looking through a one-use bitcast) if it is a one-use
  // AND/OR/XOR/ANDNP that can serve as the inner logic op; otherwise an
  // empty SDValue.
  auto getFoldableLogicOp = [](SDValue Op) {
    // Peek through single use bitcast.
    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
      Op = Op.getOperand(i: 0);

    if (!Op.hasOneUse())
      return SDValue();

    unsigned Opc = Op.getOpcode();
    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
        Opc == X86ISD::ANDNP)
      return Op;

    return SDValue();
  };

  // A is the "lone" input; FoldableOp is the inner logic op providing B and C.
  SDValue N0, N1, A, FoldableOp;

  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
        ISD::isBuildVectorAllOnes(N: Op->getOperand(Num: 1).getNode())) {
      SDValue InnerOp = getFoldableLogicOp(Op->getOperand(Num: 0));

      if (!InnerOp)
        return SDValue();

      N0 = InnerOp.getOperand(i: 0);
      N1 = InnerOp.getOperand(i: 1);
      if ((FoldableOp = getFoldableLogicOp(N1))) {
        A = N0;
        return InnerOp;
      }
      if ((FoldableOp = getFoldableLogicOp(N0))) {
        A = N1;
        return InnerOp;
      }
    }
    return SDValue();
  };

  // OriN stays the root that will ultimately be replaced; N may be rebound to
  // the inner logic op if an outer NOT was peeled.
  bool PeeledOuterNot = false;
  SDNode *OriN = N;
  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
    PeeledOuterNot = true;
    N = InnerOp.getNode();
  } else {
    N0 = N->getOperand(Num: 0);
    N1 = N->getOperand(Num: 1);

    if ((FoldableOp = getFoldableLogicOp(N1)))
      A = N0;
    else if ((FoldableOp = getFoldableLogicOp(N0)))
      A = N1;
    else
      return false;
  }

  SDValue B = FoldableOp.getOperand(i: 0);
  SDValue C = FoldableOp.getOperand(i: 1);
  SDNode *ParentA = N;
  SDNode *ParentB = FoldableOp.getNode();
  SDNode *ParentC = FoldableOp.getNode();

  // We can build the appropriate control immediate by performing the logic
  // operation we're matching using these constants for A, B, and C.
  // Each magic value is that input's own 3-variable truth table.
  uint8_t TernlogMagicA = 0xf0;
  uint8_t TernlogMagicB = 0xcc;
  uint8_t TernlogMagicC = 0xaa;

  // Some of the inputs may be inverted, peek through them and invert the
  // magic values accordingly.
  // TODO: There may be a bitcast before the xor that we should peek through.
  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
        ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
      Magic = ~Magic;
      Parent = Op.getNode();
      Op = Op.getOperand(i: 0);
    }
  };

  PeekThroughNot(A, ParentA, TernlogMagicA);
  PeekThroughNot(B, ParentB, TernlogMagicB);
  PeekThroughNot(C, ParentC, TernlogMagicC);

  // Evaluate the inner op over B and C's truth tables...
  uint8_t Imm;
  switch (FoldableOp.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
  case ISD::OR:  Imm = TernlogMagicB | TernlogMagicC; break;
  case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  }

  // ...then combine with A's truth table via the outer op.
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::ANDNP:
    // ANDNP inverts its first operand, so the combination depends on which
    // side A matched.
    if (A == N0)
      Imm &= ~TernlogMagicA;
    else
      Imm = ~(Imm) & TernlogMagicA;
    break;
  case ISD::AND: Imm &= TernlogMagicA; break;
  case ISD::OR:  Imm |= TernlogMagicA; break;
  case ISD::XOR: Imm ^= TernlogMagicA; break;
  }

  // A peeled outer NOT inverts the whole function.
  if (PeeledOuterNot)
    Imm = ~Imm;

  return matchVPTERNLOG(Root: OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
}
4931
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(ResNo: 0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(width: 32);
  }

  SDValue And0 = And->getOperand(Num: 0);
  // Candidate mask: the original mask with all its leading zeros set to one.
  APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(width: 64);
    HighZeros = HighZeros.zext(width: 64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  // TODO: Handle constant folding?
  KnownBits Known0 = CurDAG->computeKnownBits(Op: And0);
  if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(F: And, T: And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
  SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
  ReplaceNode(F: And, T: NewAnd.getNode());
  SelectCode(N: NewAnd.getNode());
  return true;
}
5004
5005static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
5006 bool FoldedBCast, bool Masked) {
5007#define VPTESTM_CASE(VT, SUFFIX) \
5008case MVT::VT: \
5009 if (Masked) \
5010 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
5011 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
5012
5013
5014#define VPTESTM_BROADCAST_CASES(SUFFIX) \
5015default: llvm_unreachable("Unexpected VT!"); \
5016VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
5017VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
5018VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
5019VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
5020VPTESTM_CASE(v16i32, DZ##SUFFIX) \
5021VPTESTM_CASE(v8i64, QZ##SUFFIX)
5022
5023#define VPTESTM_FULL_CASES(SUFFIX) \
5024VPTESTM_BROADCAST_CASES(SUFFIX) \
5025VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
5026VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
5027VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
5028VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
5029VPTESTM_CASE(v64i8, BZ##SUFFIX) \
5030VPTESTM_CASE(v32i16, WZ##SUFFIX)
5031
5032 if (FoldedBCast) {
5033 switch (TestVT.SimpleTy) {
5034 VPTESTM_BROADCAST_CASES(rmb)
5035 }
5036 }
5037
5038 if (FoldedLoad) {
5039 switch (TestVT.SimpleTy) {
5040 VPTESTM_FULL_CASES(rm)
5041 }
5042 }
5043
5044 switch (TestVT.SimpleTy) {
5045 VPTESTM_FULL_CASES(rr)
5046 }
5047
5048#undef VPTESTM_FULL_CASES
5049#undef VPTESTM_BROADCAST_CASES
5050#undef VPTESTM_CASE
5051}
5052
5053// Try to create VPTESTM instruction. If InMask is not null, it will be used
5054// to form a masked operation.
5055bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
5056 SDValue InMask) {
5057 assert(Subtarget->hasAVX512() && "Expected AVX512!");
5058 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
5059 "Unexpected VT!");
5060
5061 // Look for equal and not equal compares.
5062 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
5063 if (CC != ISD::SETEQ && CC != ISD::SETNE)
5064 return false;
5065
5066 SDValue SetccOp0 = Setcc.getOperand(i: 0);
5067 SDValue SetccOp1 = Setcc.getOperand(i: 1);
5068
5069 // Canonicalize the all zero vector to the RHS.
5070 if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
5071 std::swap(a&: SetccOp0, b&: SetccOp1);
5072
5073 // See if we're comparing against zero.
5074 if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
5075 return false;
5076
5077 SDValue N0 = SetccOp0;
5078
5079 MVT CmpVT = N0.getSimpleValueType();
5080 MVT CmpSVT = CmpVT.getVectorElementType();
5081
5082 // Start with both operands the same. We'll try to refine this.
5083 SDValue Src0 = N0;
5084 SDValue Src1 = N0;
5085
5086 {
5087 // Look through single use bitcasts.
5088 SDValue N0Temp = N0;
5089 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
5090 N0Temp = N0.getOperand(i: 0);
5091
5092 // Look for single use AND.
5093 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
5094 Src0 = N0Temp.getOperand(i: 0);
5095 Src1 = N0Temp.getOperand(i: 1);
5096 }
5097 }
5098
5099 // Without VLX we need to widen the operation.
5100 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
5101
5102 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
5103 SDValue &Base, SDValue &Scale, SDValue &Index,
5104 SDValue &Disp, SDValue &Segment) {
5105 // If we need to widen, we can't fold the load.
5106 if (!Widen)
5107 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
5108 return true;
5109
5110 // If we didn't fold a load, try to match broadcast. No widening limitation
5111 // for this. But only 32 and 64 bit types are supported.
5112 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
5113 return false;
5114
5115 // Look through single use bitcasts.
5116 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
5117 P = L.getNode();
5118 L = L.getOperand(i: 0);
5119 }
5120
5121 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
5122 return false;
5123
5124 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
5125 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
5126 return false;
5127
5128 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
5129 };
5130
5131 // We can only fold loads if the sources are unique.
5132 bool CanFoldLoads = Src0 != Src1;
5133
5134 bool FoldedLoad = false;
5135 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5136 if (CanFoldLoads) {
5137 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5138 Tmp3, Tmp4);
5139 if (!FoldedLoad) {
5140 // And is commutative.
5141 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5142 Tmp2, Tmp3, Tmp4);
5143 if (FoldedLoad)
5144 std::swap(a&: Src0, b&: Src1);
5145 }
5146 }
5147
5148 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5149
5150 bool IsMasked = InMask.getNode() != nullptr;
5151
5152 SDLoc dl(Root);
5153
5154 MVT ResVT = Setcc.getSimpleValueType();
5155 MVT MaskVT = ResVT;
5156 if (Widen) {
5157 // Widen the inputs using insert_subreg or copy_to_regclass.
5158 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5159 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5160 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5161 CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
5162 MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts);
5163 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl,
5164 VT: CmpVT), 0);
5165 Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);
5166
5167 if (!FoldedBCast)
5168 Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);
5169
5170 if (IsMasked) {
5171 // Widen the mask.
5172 unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
5173 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5174 InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5175 dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
5176 }
5177 }
5178
5179 bool IsTestN = CC == ISD::SETEQ;
5180 unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5181 Masked: IsMasked);
5182
5183 MachineSDNode *CNode;
5184 if (FoldedLoad) {
5185 SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other);
5186
5187 if (IsMasked) {
5188 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5189 Src1.getOperand(i: 0) };
5190 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5191 } else {
5192 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5193 Src1.getOperand(i: 0) };
5194 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5195 }
5196
5197 // Update the chain.
5198 ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
5199 // Record the mem-refs
5200 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
5201 } else {
5202 if (IsMasked)
5203 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
5204 else
5205 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
5206 }
5207
5208 // If we widened, we need to shrink the mask VT.
5209 if (Widen) {
5210 unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
5211 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5212 CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5213 dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
5214 }
5215
5216 ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
5217 CurDAG->RemoveDeadNode(N: Root);
5218 return true;
5219}
5220
5221// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5222// into vpternlog.
5223bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5224 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5225
5226 MVT NVT = N->getSimpleValueType(ResNo: 0);
5227
5228 // Make sure we support VPTERNLOG.
5229 if (!NVT.isVector() || !Subtarget->hasAVX512())
5230 return false;
5231
5232 // We need VLX for 128/256-bit.
5233 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5234 return false;
5235
5236 SDValue N0 = N->getOperand(Num: 0);
5237 SDValue N1 = N->getOperand(Num: 1);
5238
5239 // Canonicalize AND to LHS.
5240 if (N1.getOpcode() == ISD::AND)
5241 std::swap(a&: N0, b&: N1);
5242
5243 if (N0.getOpcode() != ISD::AND ||
5244 N1.getOpcode() != X86ISD::ANDNP ||
5245 !N0.hasOneUse() || !N1.hasOneUse())
5246 return false;
5247
5248 // ANDN is not commutable, use it to pick down A and C.
5249 SDValue A = N1.getOperand(i: 0);
5250 SDValue C = N1.getOperand(i: 1);
5251
5252 // AND is commutable, if one operand matches A, the other operand is B.
5253 // Otherwise this isn't a match.
5254 SDValue B;
5255 if (N0.getOperand(i: 0) == A)
5256 B = N0.getOperand(i: 1);
5257 else if (N0.getOperand(i: 1) == A)
5258 B = N0.getOperand(i: 0);
5259 else
5260 return false;
5261
5262 SDLoc dl(N);
5263 SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8);
5264 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5265 ReplaceNode(F: N, T: Ternlog.getNode());
5266
5267 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5268 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5269}
5270
5271void X86DAGToDAGISel::Select(SDNode *Node) {
5272 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5273 unsigned Opcode = Node->getOpcode();
5274 SDLoc dl(Node);
5275
5276 if (Node->isMachineOpcode()) {
5277 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5278 Node->setNodeId(-1);
5279 return; // Already selected.
5280 }
5281
5282 switch (Opcode) {
5283 default: break;
5284 case ISD::INTRINSIC_W_CHAIN: {
5285 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5286 switch (IntNo) {
5287 default: break;
5288 case Intrinsic::x86_encodekey128:
5289 case Intrinsic::x86_encodekey256: {
5290 if (!Subtarget->hasKL())
5291 break;
5292
5293 unsigned Opcode;
5294 switch (IntNo) {
5295 default: llvm_unreachable("Impossible intrinsic");
5296 case Intrinsic::x86_encodekey128:
5297 Opcode = X86::ENCODEKEY128;
5298 break;
5299 case Intrinsic::x86_encodekey256:
5300 Opcode = X86::ENCODEKEY256;
5301 break;
5302 }
5303
5304 SDValue Chain = Node->getOperand(Num: 0);
5305 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3),
5306 Glue: SDValue());
5307 if (Opcode == X86::ENCODEKEY256)
5308 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4),
5309 Glue: Chain.getValue(R: 1));
5310
5311 MachineSDNode *Res = CurDAG->getMachineNode(
5312 Opcode, dl, VTs: Node->getVTList(),
5313 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5314 ReplaceNode(F: Node, T: Res);
5315 return;
5316 }
5317 case Intrinsic::x86_tileloaddrs64_internal:
5318 case Intrinsic::x86_tileloaddrst164_internal:
5319 if (!Subtarget->hasAMXMOVRS())
5320 break;
5321 [[fallthrough]];
5322 case Intrinsic::x86_tileloadd64_internal:
5323 case Intrinsic::x86_tileloaddt164_internal: {
5324 if (!Subtarget->hasAMXTILE())
5325 break;
5326 auto *MFI =
5327 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5328 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5329 unsigned Opc;
5330 switch (IntNo) {
5331 default:
5332 llvm_unreachable("Unexpected intrinsic!");
5333 case Intrinsic::x86_tileloaddrs64_internal:
5334 Opc = X86::PTILELOADDRSV;
5335 break;
5336 case Intrinsic::x86_tileloaddrst164_internal:
5337 Opc = X86::PTILELOADDRST1V;
5338 break;
5339 case Intrinsic::x86_tileloadd64_internal:
5340 Opc = X86::PTILELOADDV;
5341 break;
5342 case Intrinsic::x86_tileloaddt164_internal:
5343 Opc = X86::PTILELOADDT1V;
5344 break;
5345 }
5346 // _tile_loadd_internal(row, col, buf, STRIDE)
5347 SDValue Base = Node->getOperand(Num: 4);
5348 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5349 SDValue Index = Node->getOperand(Num: 5);
5350 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5351 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5352 SDValue Chain = Node->getOperand(Num: 0);
5353 MachineSDNode *CNode;
5354 SDValue Ops[] = {Node->getOperand(Num: 2),
5355 Node->getOperand(Num: 3),
5356 Base,
5357 Scale,
5358 Index,
5359 Disp,
5360 Segment,
5361 Chain};
5362 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops);
5363 ReplaceNode(F: Node, T: CNode);
5364 return;
5365 }
5366 }
5367 break;
5368 }
5369 case ISD::INTRINSIC_VOID: {
5370 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5371 switch (IntNo) {
5372 default: break;
5373 case Intrinsic::x86_sse3_monitor:
5374 case Intrinsic::x86_monitorx:
5375 case Intrinsic::x86_clzero: {
5376 bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64;
5377
5378 unsigned Opc = 0;
5379 switch (IntNo) {
5380 default: llvm_unreachable("Unexpected intrinsic!");
5381 case Intrinsic::x86_sse3_monitor:
5382 if (!Subtarget->hasSSE3())
5383 break;
5384 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5385 break;
5386 case Intrinsic::x86_monitorx:
5387 if (!Subtarget->hasMWAITX())
5388 break;
5389 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5390 break;
5391 case Intrinsic::x86_clzero:
5392 if (!Subtarget->hasCLZERO())
5393 break;
5394 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5395 break;
5396 }
5397
5398 if (Opc) {
5399 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5400 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5401 N: Node->getOperand(Num: 2), Glue: SDValue());
5402 SDValue InGlue = Chain.getValue(R: 1);
5403
5404 if (IntNo == Intrinsic::x86_sse3_monitor ||
5405 IntNo == Intrinsic::x86_monitorx) {
5406 // Copy the other two operands to ECX and EDX.
5407 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3),
5408 Glue: InGlue);
5409 InGlue = Chain.getValue(R: 1);
5410 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4),
5411 Glue: InGlue);
5412 InGlue = Chain.getValue(R: 1);
5413 }
5414
5415 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other,
5416 Ops: { Chain, InGlue});
5417 ReplaceNode(F: Node, T: CNode);
5418 return;
5419 }
5420
5421 break;
5422 }
5423 case Intrinsic::x86_tilestored64_internal: {
5424 auto *MFI =
5425 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5426 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5427 unsigned Opc = X86::PTILESTOREDV;
5428 // _tile_stored_internal(row, col, buf, STRIDE, c)
5429 SDValue Base = Node->getOperand(Num: 4);
5430 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5431 SDValue Index = Node->getOperand(Num: 5);
5432 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5433 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5434 SDValue Chain = Node->getOperand(Num: 0);
5435 MachineSDNode *CNode;
5436 SDValue Ops[] = {Node->getOperand(Num: 2),
5437 Node->getOperand(Num: 3),
5438 Base,
5439 Scale,
5440 Index,
5441 Disp,
5442 Segment,
5443 Node->getOperand(Num: 6),
5444 Chain};
5445 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5446 ReplaceNode(F: Node, T: CNode);
5447 return;
5448 }
5449 case Intrinsic::x86_tileloaddrs64:
5450 case Intrinsic::x86_tileloaddrst164:
5451 if (!Subtarget->hasAMXMOVRS())
5452 break;
5453 [[fallthrough]];
5454 case Intrinsic::x86_tileloadd64:
5455 case Intrinsic::x86_tileloaddt164:
5456 case Intrinsic::x86_tilestored64: {
5457 if (!Subtarget->hasAMXTILE())
5458 break;
5459 auto *MFI =
5460 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5461 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5462 unsigned Opc;
5463 switch (IntNo) {
5464 default: llvm_unreachable("Unexpected intrinsic!");
5465 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5466 case Intrinsic::x86_tileloaddrs64:
5467 Opc = X86::PTILELOADDRS;
5468 break;
5469 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5470 case Intrinsic::x86_tileloaddrst164:
5471 Opc = X86::PTILELOADDRST1;
5472 break;
5473 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5474 }
5475 // FIXME: Match displacement and scale.
5476 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5477 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5478 SDValue Base = Node->getOperand(Num: 3);
5479 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5480 SDValue Index = Node->getOperand(Num: 4);
5481 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5482 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5483 SDValue Chain = Node->getOperand(Num: 0);
5484 MachineSDNode *CNode;
5485 if (Opc == X86::PTILESTORED) {
5486 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5487 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5488 } else {
5489 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5490 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5491 }
5492 ReplaceNode(F: Node, T: CNode);
5493 return;
5494 }
5495 }
5496 break;
5497 }
5498 case ISD::BRIND:
5499 case X86ISD::NT_BRIND: {
5500 if (Subtarget->isTarget64BitILP32()) {
5501 // Converts a 32-bit register to a 64-bit, zero-extended version of
5502 // it. This is needed because x86-64 can do many things, but jmp %r32
5503 // ain't one of them.
5504 SDValue Target = Node->getOperand(Num: 1);
5505 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5506 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64);
5507 SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other,
5508 N1: Node->getOperand(Num: 0), N2: ZextTarget);
5509 ReplaceNode(F: Node, T: Brind.getNode());
5510 SelectCode(N: ZextTarget.getNode());
5511 SelectCode(N: Brind.getNode());
5512 return;
5513 }
5514 break;
5515 }
5516 case X86ISD::GlobalBaseReg:
5517 ReplaceNode(F: Node, T: getGlobalBaseReg());
5518 return;
5519
5520 case ISD::BITCAST:
5521 // Just drop all 128/256/512-bit bitcasts.
5522 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5523 NVT == MVT::f128) {
5524 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5525 CurDAG->RemoveDeadNode(N: Node);
5526 return;
5527 }
5528 break;
5529
5530 case ISD::SRL:
5531 if (matchBitExtract(Node))
5532 return;
5533 [[fallthrough]];
5534 case ISD::SRA:
5535 case ISD::SHL:
5536 if (tryShiftAmountMod(N: Node))
5537 return;
5538 break;
5539
5540 case X86ISD::VPTERNLOG: {
5541 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5542 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5543 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5544 return;
5545 break;
5546 }
5547
5548 case X86ISD::ANDNP:
5549 if (tryVPTERNLOG(N: Node))
5550 return;
5551 break;
5552
5553 case ISD::AND:
5554 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5555 // Try to form a masked VPTESTM. Operands can be in either order.
5556 SDValue N0 = Node->getOperand(Num: 0);
5557 SDValue N1 = Node->getOperand(Num: 1);
5558 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5559 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5560 return;
5561 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5562 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5563 return;
5564 }
5565
5566 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5567 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5568 CurDAG->RemoveDeadNode(N: Node);
5569 return;
5570 }
5571 if (matchBitExtract(Node))
5572 return;
5573 if (AndImmShrink && shrinkAndImmediate(And: Node))
5574 return;
5575
5576 [[fallthrough]];
5577 case ISD::OR:
5578 case ISD::XOR:
5579 if (tryShrinkShlLogicImm(N: Node))
5580 return;
5581 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5582 return;
5583 if (tryVPTERNLOG(N: Node))
5584 return;
5585
5586 [[fallthrough]];
5587 case ISD::ADD:
5588 if (Opcode == ISD::ADD && matchBitExtract(Node))
5589 return;
5590 [[fallthrough]];
5591 case ISD::SUB: {
5592 // Try to avoid folding immediates with multiple uses for optsize.
5593 // This code tries to select to register form directly to avoid going
5594 // through the isel table which might fold the immediate. We can't change
5595 // the patterns on the add/sub/and/or/xor with immediate paterns in the
5596 // tablegen files to check immediate use count without making the patterns
5597 // unavailable to the fast-isel table.
5598 if (!CurDAG->shouldOptForSize())
5599 break;
5600
5601 // Only handle i8/i16/i32/i64.
5602 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5603 break;
5604
5605 SDValue N0 = Node->getOperand(Num: 0);
5606 SDValue N1 = Node->getOperand(Num: 1);
5607
5608 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5609 if (!Cst)
5610 break;
5611
5612 int64_t Val = Cst->getSExtValue();
5613
5614 // Make sure its an immediate that is considered foldable.
5615 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5616 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5617 break;
5618
5619 // If this can match to INC/DEC, let it go.
5620 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5621 break;
5622
5623 // Check if we should avoid folding this immediate.
5624 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5625 break;
5626
5627 // We should not fold the immediate. So we need a register form instead.
5628 unsigned ROpc, MOpc;
5629 switch (NVT.SimpleTy) {
5630 default: llvm_unreachable("Unexpected VT!");
5631 case MVT::i8:
5632 switch (Opcode) {
5633 default: llvm_unreachable("Unexpected opcode!");
5634 case ISD::ADD:
5635 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5636 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5637 break;
5638 case ISD::SUB:
5639 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5640 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5641 break;
5642 case ISD::AND:
5643 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5644 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5645 break;
5646 case ISD::OR:
5647 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5648 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5649 break;
5650 case ISD::XOR:
5651 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5652 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5653 break;
5654 }
5655 break;
5656 case MVT::i16:
5657 switch (Opcode) {
5658 default: llvm_unreachable("Unexpected opcode!");
5659 case ISD::ADD:
5660 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5661 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5662 break;
5663 case ISD::SUB:
5664 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5665 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5666 break;
5667 case ISD::AND:
5668 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5669 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5670 break;
5671 case ISD::OR:
5672 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5673 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5674 break;
5675 case ISD::XOR:
5676 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5677 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5678 break;
5679 }
5680 break;
5681 case MVT::i32:
5682 switch (Opcode) {
5683 default: llvm_unreachable("Unexpected opcode!");
5684 case ISD::ADD:
5685 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5686 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5687 break;
5688 case ISD::SUB:
5689 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5690 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5691 break;
5692 case ISD::AND:
5693 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5694 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5695 break;
5696 case ISD::OR:
5697 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5698 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5699 break;
5700 case ISD::XOR:
5701 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5702 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5703 break;
5704 }
5705 break;
5706 case MVT::i64:
5707 switch (Opcode) {
5708 default: llvm_unreachable("Unexpected opcode!");
5709 case ISD::ADD:
5710 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5711 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5712 break;
5713 case ISD::SUB:
5714 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5715 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5716 break;
5717 case ISD::AND:
5718 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5719 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5720 break;
5721 case ISD::OR:
5722 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5723 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5724 break;
5725 case ISD::XOR:
5726 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5727 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5728 break;
5729 }
5730 break;
5731 }
5732
5733 // Ok this is a AND/OR/XOR/ADD/SUB with constant.
5734
5735 // If this is a not a subtract, we can still try to fold a load.
5736 if (Opcode != ISD::SUB) {
5737 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5738 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5739 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5740 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5741 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5742 // Update the chain.
5743 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5744 // Record the mem-refs
5745 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5746 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5747 CurDAG->RemoveDeadNode(N: Node);
5748 return;
5749 }
5750 }
5751
5752 CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1);
5753 return;
5754 }
5755
5756 case X86ISD::SMUL:
5757 // i16/i32/i64 are handled with isel patterns.
5758 if (NVT != MVT::i8)
5759 break;
5760 [[fallthrough]];
5761 case X86ISD::UMUL: {
5762 SDValue N0 = Node->getOperand(Num: 0);
5763 SDValue N1 = Node->getOperand(Num: 1);
5764
5765 unsigned LoReg, ROpc, MOpc;
5766 switch (NVT.SimpleTy) {
5767 default: llvm_unreachable("Unsupported VT!");
5768 case MVT::i8:
5769 LoReg = X86::AL;
5770 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5771 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5772 break;
5773 case MVT::i16:
5774 LoReg = X86::AX;
5775 ROpc = X86::MUL16r;
5776 MOpc = X86::MUL16m;
5777 break;
5778 case MVT::i32:
5779 LoReg = X86::EAX;
5780 ROpc = X86::MUL32r;
5781 MOpc = X86::MUL32m;
5782 break;
5783 case MVT::i64:
5784 LoReg = X86::RAX;
5785 ROpc = X86::MUL64r;
5786 MOpc = X86::MUL64m;
5787 break;
5788 }
5789
5790 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5791 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5792 // Multiply is commutative.
5793 if (!FoldedLoad) {
5794 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5795 if (FoldedLoad)
5796 std::swap(a&: N0, b&: N1);
5797 }
5798
5799 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5800 N: N0, Glue: SDValue()).getValue(R: 1);
5801
5802 MachineSDNode *CNode;
5803 if (FoldedLoad) {
5804 // i16/i32/i64 use an instruction that produces a low and high result even
5805 // though only the low result is used.
5806 SDVTList VTs;
5807 if (NVT == MVT::i8)
5808 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5809 else
5810 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other);
5811
5812 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5813 InGlue };
5814 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5815
5816 // Update the chain.
5817 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5818 // Record the mem-refs
5819 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5820 } else {
5821 // i16/i32/i64 use an instruction that produces a low and high result even
5822 // though only the low result is used.
5823 SDVTList VTs;
5824 if (NVT == MVT::i8)
5825 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32);
5826 else
5827 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32);
5828
5829 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5830 }
5831
5832 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5833 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5834 CurDAG->RemoveDeadNode(N: Node);
5835 return;
5836 }
5837
5838 case ISD::SMUL_LOHI:
5839 case ISD::UMUL_LOHI: {
5840 SDValue N0 = Node->getOperand(Num: 0);
5841 SDValue N1 = Node->getOperand(Num: 1);
5842
5843 unsigned Opc, MOpc;
5844 unsigned LoReg, HiReg;
5845 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5846 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5847 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5848 switch (NVT.SimpleTy) {
5849 default: llvm_unreachable("Unsupported VT!");
5850 case MVT::i32:
5851 Opc = UseMULXHi ? X86::MULX32Hrr
5852 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5853 : IsSigned ? X86::IMUL32r
5854 : X86::MUL32r;
5855 MOpc = UseMULXHi ? X86::MULX32Hrm
5856 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5857 : IsSigned ? X86::IMUL32m
5858 : X86::MUL32m;
5859 LoReg = UseMULX ? X86::EDX : X86::EAX;
5860 HiReg = X86::EDX;
5861 break;
5862 case MVT::i64:
5863 Opc = UseMULXHi ? X86::MULX64Hrr
5864 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5865 : IsSigned ? X86::IMUL64r
5866 : X86::MUL64r;
5867 MOpc = UseMULXHi ? X86::MULX64Hrm
5868 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5869 : IsSigned ? X86::IMUL64m
5870 : X86::MUL64m;
5871 LoReg = UseMULX ? X86::RDX : X86::RAX;
5872 HiReg = X86::RDX;
5873 break;
5874 }
5875
5876 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5877 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5878 // Multiply is commutative.
5879 if (!foldedLoad) {
5880 foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5881 if (foldedLoad)
5882 std::swap(a&: N0, b&: N1);
5883 }
5884
5885 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5886 N: N0, Glue: SDValue()).getValue(R: 1);
5887 SDValue ResHi, ResLo;
5888 if (foldedLoad) {
5889 SDValue Chain;
5890 MachineSDNode *CNode = nullptr;
5891 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5892 InGlue };
5893 if (UseMULXHi) {
5894 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
5895 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5896 ResHi = SDValue(CNode, 0);
5897 Chain = SDValue(CNode, 1);
5898 } else if (UseMULX) {
5899 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other);
5900 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5901 ResHi = SDValue(CNode, 0);
5902 ResLo = SDValue(CNode, 1);
5903 Chain = SDValue(CNode, 2);
5904 } else {
5905 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
5906 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5907 Chain = SDValue(CNode, 0);
5908 InGlue = SDValue(CNode, 1);
5909 }
5910
5911 // Update the chain.
5912 ReplaceUses(F: N1.getValue(R: 1), T: Chain);
5913 // Record the mem-refs
5914 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5915 } else {
5916 SDValue Ops[] = { N1, InGlue };
5917 if (UseMULXHi) {
5918 SDVTList VTs = CurDAG->getVTList(VT: NVT);
5919 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5920 ResHi = SDValue(CNode, 0);
5921 } else if (UseMULX) {
5922 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
5923 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5924 ResHi = SDValue(CNode, 0);
5925 ResLo = SDValue(CNode, 1);
5926 } else {
5927 SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue);
5928 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5929 InGlue = SDValue(CNode, 0);
5930 }
5931 }
5932
5933 // Copy the low half of the result, if it is needed.
5934 if (!SDValue(Node, 0).use_empty()) {
5935 if (!ResLo) {
5936 assert(LoReg && "Register for low half is not defined!");
5937 ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5938 VT: NVT, Glue: InGlue);
5939 InGlue = ResLo.getValue(R: 2);
5940 }
5941 ReplaceUses(F: SDValue(Node, 0), T: ResLo);
5942 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5943 dbgs() << '\n');
5944 }
5945 // Copy the high half of the result, if it is needed.
5946 if (!SDValue(Node, 1).use_empty()) {
5947 if (!ResHi) {
5948 assert(HiReg && "Register for high half is not defined!");
5949 ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
5950 VT: NVT, Glue: InGlue);
5951 InGlue = ResHi.getValue(R: 2);
5952 }
5953 ReplaceUses(F: SDValue(Node, 1), T: ResHi);
5954 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5955 dbgs() << '\n');
5956 }
5957
5958 CurDAG->RemoveDeadNode(N: Node);
5959 return;
5960 }
5961
5962   case ISD::SDIVREM:
5963   case ISD::UDIVREM: {
    // Lower integer divide+remainder to the x86 DIV/IDIV family. These
    // instructions take the dividend implicitly in AL/AX, DX:AX, EDX:EAX or
    // RDX:RAX; the quotient is produced in the low register (LoReg) and the
    // remainder in the high register (HiReg), which are copied out below.
5964     SDValue N0 = Node->getOperand(Num: 0);
5965     SDValue N1 = Node->getOperand(Num: 1);
5966
    // Pick the register-form (ROpc) and memory-form (MOpc) opcode for the
    // requested width and signedness.
5967     unsigned ROpc, MOpc;
5968     bool isSigned = Opcode == ISD::SDIVREM;
5969     if (!isSigned) {
5970       switch (NVT.SimpleTy) {
5971       default: llvm_unreachable("Unsupported VT!");
5972       case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5973       case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5974       case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5975       case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5976       }
5977     } else {
5978       switch (NVT.SimpleTy) {
5979       default: llvm_unreachable("Unsupported VT!");
5980       case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5981       case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5982       case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5983       case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5984       }
5985     }
5986
    // Implicit registers used by DIV/IDIV per width, the register cleared for
    // the zero-extend path (ClrReg), and the opcode that sign-extends the
    // dividend into the high half (CWD/CDQ/CQO) for the signed path.
5987     unsigned LoReg, HiReg, ClrReg;
5988     unsigned SExtOpcode;
5989     switch (NVT.SimpleTy) {
5990     default: llvm_unreachable("Unsupported VT!");
5991     case MVT::i8:
5992       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
5993       SExtOpcode = 0; // Not used.
5994       break;
5995     case MVT::i16:
5996       LoReg = X86::AX;  HiReg = X86::DX;
5997       ClrReg = X86::DX;
5998       SExtOpcode = X86::CWD;
5999       break;
6000     case MVT::i32:
6001       LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
6002       SExtOpcode = X86::CDQ;
6003       break;
6004     case MVT::i64:
6005       LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
6006       SExtOpcode = X86::CQO;
6007       break;
6008     }
6009
    // Try to fold a load of the divisor (N1) into the memory form of the
    // instruction.
6010     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6011     bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
6012     bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);
6013
6014     SDValue InGlue;
6015     if (NVT == MVT::i8) {
6016       // Special case for div8, just use a move with zero extension to AX to
6017       // clear the upper 8 bits (AH).
      // Note: these Tmp0..Tmp4 intentionally shadow the outer ones; here they
      // address the dividend (N0) load, not the divisor.
6018       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
6019       MachineSDNode *Move;
6020       if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
6021         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
6022         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
6023                                                     : X86::MOVZX16rm8;
6024         Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops);
6025         Chain = SDValue(Move, 1);
6026         ReplaceUses(F: N0.getValue(R: 1), T: Chain);
6027         // Record the mem-refs
6028         CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
6029       } else {
6030         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
6031                                                     : X86::MOVZX16rr8;
6032         Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0);
6033         Chain = CurDAG->getEntryNode();
6034       }
6035       Chain  = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0),
6036                                     Glue: SDValue());
6037       InGlue = Chain.getValue(R: 1);
6038     } else {
      // Copy the dividend into the low register, then either sign-extend it
      // into the high register (CWD/CDQ/CQO) or explicitly zero the high
      // register, depending on signedness and known sign bit.
6039       InGlue =
6040           CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
6041                                Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
6042       if (isSigned && !signBitIsZero) {
6043         // Sign extend the low part into the high part.
6044         InGlue =
6045             SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0);
6046       } else {
6047         // Zero out the high part, effectively zero extending the input.
6048         SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
6049         SDValue ClrNode =
6050             SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
        // MOV32r0 produces an i32; adjust it to the divide width with a
        // subregister extract (i16) or SUBREG_TO_REG (i64).
6051         switch (NVT.SimpleTy) {
6052         case MVT::i16:
6053           ClrNode =
6054               SDValue(CurDAG->getMachineNode(
6055                           Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode,
6056                           Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl,
6057                                                     VT: MVT::i32)),
6058                       0);
6059           break;
6060         case MVT::i32:
6061           break;
6062         case MVT::i64:
6063           ClrNode = SDValue(
6064               CurDAG->getMachineNode(
6065                   Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, Op1: ClrNode,
6066                   Op2: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
6067               0);
6068           break;
6069         default:
6070           llvm_unreachable("Unexpected division source");
6071         }
6072
6073         InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
6074                                       N: ClrNode, Glue: InGlue).getValue(R: 1);
6075       }
6076     }
6077
    // Emit the divide itself, folding the divisor load when possible. The
    // implicit register inputs are threaded through via InGlue.
6078     if (foldedLoad) {
6079       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
6080                         InGlue };
6081       MachineSDNode *CNode =
6082           CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops);
6083       InGlue = SDValue(CNode, 1);
6084       // Update the chain.
6085       ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
6086       // Record the mem-refs
6087       CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
6088     } else {
6089       InGlue =
6090           SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0);
6091     }
6092
6093     // Prevent use of AH in a REX instruction by explicitly copying it to
6094     // an ABCD_L register.
6095     //
6096     // The current assumption of the register allocator is that isel
6097     // won't generate explicit references to the GR8_ABCD_H registers. If
6098     // the allocator and/or the backend get enhanced to be more robust in
6099     // that regard, this can be, and should be, removed.
6100     if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6101       SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8);
6102       unsigned AHExtOpcode =
6103           isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6104
6105       SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32,
6106                                              VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue);
6107       SDValue Result(RNode, 0);
6108       InGlue = SDValue(RNode, 1);
6109
6110       Result =
6111           CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result);
6112
6113       ReplaceUses(F: SDValue(Node, 1), T: Result);
6114       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6115                  dbgs() << '\n');
6116     }
6117     // Copy the division (low) result, if it is needed.
6118     if (!SDValue(Node, 0).use_empty()) {
6119       SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
6120                                               Reg: LoReg, VT: NVT, Glue: InGlue);
6121       InGlue = Result.getValue(R: 2);
6122       ReplaceUses(F: SDValue(Node, 0), T: Result);
6123       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6124                  dbgs() << '\n');
6125     }
6126     // Copy the remainder (high) result, if it is needed.
6127     if (!SDValue(Node, 1).use_empty()) {
6128       SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
6129                                               Reg: HiReg, VT: NVT, Glue: InGlue);
6130       InGlue = Result.getValue(R: 2);
6131       ReplaceUses(F: SDValue(Node, 1), T: Result);
6132       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6133                  dbgs() << '\n');
6134     }
6135     CurDAG->RemoveDeadNode(N: Node);
6136     return;
6137   }
6138
6139   case X86ISD::FCMP:
6140   case X86ISD::STRICT_FCMP:
6141   case X86ISD::STRICT_FCMPS: {
    // x87 floating-point compare for targets without CMOV/FCOMI: emit
    // (U)COM, read the FPU status word with FNSTSW, and transfer the
    // condition bits into EFLAGS via SAHF. Strict variants carry a chain
    // as operand 0, shifting the compare operands to 1/2.
6142     bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6143                        Node->getOpcode() == X86ISD::STRICT_FCMPS;
6144     SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
6145     SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);
6146
6147     // Save the original VT of the compare.
6148     MVT CmpVT = N0.getSimpleValueType();
6149
6150     // Floating point needs special handling if we don't have FCOMI.
6151     if (Subtarget->canUseCMOV())
6152       break;
6153
    // The signaling variant (STRICT_FCMPS) uses COM, which raises invalid
    // on quiet NaNs; the others use the quiet UCOM.
6154     bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6155
6156     unsigned Opc;
6157     switch (CmpVT.SimpleTy) {
6158     default: llvm_unreachable("Unexpected type!");
6159     case MVT::f32:
6160       Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6161       break;
6162     case MVT::f64:
6163       Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6164       break;
6165     case MVT::f80:
6166       Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6167       break;
6168     }
6169
    // Strict compares thread the incoming chain through the compare node so
    // ordering with other FP operations is preserved.
6170     SDValue Chain =
6171         IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
6172     SDValue Glue;
6173     if (IsStrictCmp) {
6174       SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
6175       Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
6176       Glue = Chain.getValue(R: 1);
6177     } else {
6178       Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0);
6179     }
6180
6181     // Move FPSW to AX.
6182     SDValue FNSTSW =
6183         SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0);
6184
6185     // Extract upper 8-bits of AX.
6186     SDValue Extract =
6187         CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW);
6188
6189     // Move AH into flags.
6190     // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6191     assert(Subtarget->canUseLAHFSAHF() &&
6192            "Target doesn't support SAHF or FCOMI?");
6193     SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue());
6194     Chain = AH;
6195     SDValue SAHF = SDValue(
6196         CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0);
6197
    // Strict compares also produce a chain result (result 1).
6198     if (IsStrictCmp)
6199       ReplaceUses(F: SDValue(Node, 1), T: Chain);
6200
6201     ReplaceUses(F: SDValue(Node, 0), T: SAHF);
6202     CurDAG->RemoveDeadNode(N: Node);
6203     return;
6204   }
6205
6206   case X86ISD::CMP: {
    // Optimize (X86cmp X, 0): convert to TEST, shrink the TEST immediate
    // width where flag usage permits, and fold AND/shifted-mask patterns
    // into BEXTR or shift+TEST sequences.
6207     SDValue N0 = Node->getOperand(Num: 0);
6208     SDValue N1 = Node->getOperand(Num: 1);
6209
6210     // Optimizations for TEST compares.
6211     if (!isNullConstant(V: N1))
6212       break;
6213
6214     // Save the original VT of the compare.
6215     MVT CmpVT = N0.getSimpleValueType();
6216
6217     // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
6218     // by a test instruction. The test should be removed later by
6219     // analyzeCompare if we are using only the zero flag.
6220     // TODO: Should we check the users and use the BEXTR flags directly?
6221     if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6222       if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
6223         unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6224                                              : X86::TEST32rr;
6225         SDValue BEXTR = SDValue(NewNode, 0);
6226         NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR);
6227         ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6228         CurDAG->RemoveDeadNode(N: Node);
6229         return;
6230       }
6231     }
6232
6233     // We can peek through truncates, but we need to be careful below.
6234     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6235       N0 = N0.getOperand(i: 0);
6236
6237     // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6238     // use a smaller encoding.
6239     // Look past the truncate if CMP is the only use of it.
6240     if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6241         N0.getValueType() != MVT::i8) {
6242       auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
6243       if (!MaskC)
6244         break;
6245
6246       // We may have looked through a truncate so mask off any bits that
6247       // shouldn't be part of the compare.
6248       uint64_t Mask = MaskC->getZExtValue();
6249       Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());
6250
6251       // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6252       // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6253       // zero flag.
6254       if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) &&
6255           onlyUsesZeroFlag(Flags: SDValue(Node, 0))) {
        // ISD::DELETED_NODE doubles as a "no transform chosen" sentinel here.
6256         unsigned ShiftOpcode = ISD::DELETED_NODE;
6257         unsigned ShiftAmt;
6258         unsigned SubRegIdx;
6259         MVT SubRegVT;
6260         unsigned TestOpcode;
6261         unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
6262         unsigned TrailingZeros = llvm::countr_zero(Val: Mask);
6263
6264         // With leading/trailing zeros, the transform is profitable if we can
6265         // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6266         // incurring any extra register moves.
6267         bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
6268         if (LeadingZeros == 0 && SavesBytes) {
6269           // If the mask covers the most significant bit, then we can replace
6270           // TEST+AND with a SHR and check eflags.
6271           // This emits a redundant TEST which is subsequently eliminated.
6272           ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6273           ShiftAmt = TrailingZeros;
6274           SubRegIdx = 0;
6275           TestOpcode = X86::TEST64rr;
6276         } else if (TrailingZeros == 0 && SavesBytes) {
6277           // If the mask covers the least significant bit, then we can replace
6278           // TEST+AND with a SHL and check eflags.
6279           // This emits a redundant TEST which is subsequently eliminated.
6280           ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6281           ShiftAmt = LeadingZeros;
6282           SubRegIdx = 0;
6283           TestOpcode = X86::TEST64rr;
6284         } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
6285           // If the shifted mask extends into the high half and is 8/16/32 bits
6286           // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6287           unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6288           if (PopCount == 8) {
6289             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6290             ShiftAmt = TrailingZeros;
6291             SubRegIdx = X86::sub_8bit;
6292             SubRegVT = MVT::i8;
6293             TestOpcode = X86::TEST8rr;
6294           } else if (PopCount == 16) {
6295             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6296             ShiftAmt = TrailingZeros;
6297             SubRegIdx = X86::sub_16bit;
6298             SubRegVT = MVT::i16;
6299             TestOpcode = X86::TEST16rr;
6300           } else if (PopCount == 32) {
6301             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6302             ShiftAmt = TrailingZeros;
6303             SubRegIdx = X86::sub_32bit;
6304             SubRegVT = MVT::i32;
6305             TestOpcode = X86::TEST32rr;
6306           }
6307         }
6308         if (ShiftOpcode != ISD::DELETED_NODE) {
          // Emit shift + (optionally narrowed) TEST of the result with itself.
6309           SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64);
6310           SDValue Shift = SDValue(
6311               CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32,
6312                                      Op1: N0.getOperand(i: 0), Op2: ShiftC),
6313               0);
6314           if (SubRegIdx != 0) {
6315             Shift =
6316                 CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
6317           }
6318           MachineSDNode *Test =
6319               CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift);
6320           ReplaceNode(F: Node, T: Test);
6321           return;
6322         }
6323       }
6324
6325       MVT VT;
6326       int SubRegOp;
6327       unsigned ROpc, MOpc;
6328
6329       // For each of these checks we need to be careful if the sign flag is
6330       // being used. It is only safe to use the sign flag in two conditions,
6331       // either the sign bit in the shrunken mask is zero or the final test
6332       // size is equal to the original compare size.
6333
6334       if (isUInt<8>(x: Mask) &&
6335           (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6336            hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6337         // For example, convert "testl %eax, $8" to "testb %al, $8"
6338         VT = MVT::i8;
6339         SubRegOp = X86::sub_8bit;
6340         ROpc = X86::TEST8ri;
6341         MOpc = X86::TEST8mi;
6342       } else if (OptForMinSize && isUInt<16>(x: Mask) &&
6343                  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6344                   hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6345         // For example, "testl %eax, $32776" to "testw %ax, $32776".
6346         // NOTE: We only want to form TESTW instructions if optimizing for
6347         // min size. Otherwise we only save one byte and possibly get a length
6348         // changing prefix penalty in the decoders.
6349         VT = MVT::i16;
6350         SubRegOp = X86::sub_16bit;
6351         ROpc = X86::TEST16ri;
6352         MOpc = X86::TEST16mi;
6353       } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 &&
6354                  ((!(Mask & 0x80000000) &&
6355                    // Without minsize 16-bit Cmps can get here so we need to
6356                    // be sure we calculate the correct sign flag if needed.
6357                    (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6358                   CmpVT == MVT::i32 ||
6359                   hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6360         // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6361         // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6362         // Otherwize, we find ourselves in a position where we have to do
6363         // promotion. If previous passes did not promote the and, we assume
6364         // they had a good reason not to and do not promote here.
6365         VT = MVT::i32;
6366         SubRegOp = X86::sub_32bit;
6367         ROpc = X86::TEST32ri;
6368         MOpc = X86::TEST32mi;
6369       } else {
6370         // No eligible transformation was found.
6371         break;
6372       }
6373
6374       SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
6375       SDValue Reg = N0.getOperand(i: 0);
6376
6377       // Emit a testl or testw.
6378       MachineSDNode *NewNode;
6379       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6380       if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        // Don't narrow a non-simple (e.g. volatile/atomic) load: the memory
        // access width must match the original load's width exactly.
6381         if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
6382           if (!LoadN->isSimple()) {
6383             unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
6384             if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6385                 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6386                 (MOpc == X86::TEST32mi && NumVolBits != 32))
6387               break;
6388           }
6389         }
6390         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6391                           Reg.getOperand(i: 0) };
6392         NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops);
6393         // Update the chain.
6394         ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
6395         // Record the mem-refs
6396         CurDAG->setNodeMemRefs(N: NewNode,
6397                                NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
6398       } else {
6399         // Extract the subregister if necessary.
6400         if (N0.getValueType() != VT)
6401           Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);
6402
6403         NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm);
6404       }
6405       // Replace CMP with TEST.
6406       ReplaceNode(F: Node, T: NewNode);
6407       return;
6408     }
6409     break;
6410   }
6411   case X86ISD::PCMPISTR: {
    // SSE4.2 implicit-length string compare. The node has up to three used
    // results: index (0), mask (1) and flags (2). When both index and mask
    // are needed, two instructions are emitted (PCMPISTRI and PCMPISTRM).
6412     if (!Subtarget->hasSSE42())
6413       break;
6414
6415     bool NeedIndex = !SDValue(Node, 0).use_empty();
6416     bool NeedMask = !SDValue(Node, 1).use_empty();
6417     // We can't fold a load if we are going to make two instructions.
6418     bool MayFoldLoad = !NeedIndex || !NeedMask;
6419
6420     MachineSDNode *CNode;
6421     if (NeedMask) {
6422       unsigned ROpc =
6423           Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6424       unsigned MOpc =
6425           Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6426       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node);
6427       ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6428     }
    // Emit PCMPISTRI when the index is used, or as the sole instruction when
    // neither result is used (flags still need a producer).
6429     if (NeedIndex || !NeedMask) {
6430       unsigned ROpc =
6431           Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6432       unsigned MOpc =
6433           Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6434       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node);
6435       ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6436     }
6437
6438     // Connect the flag usage to the last instruction created.
6439     ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6440     CurDAG->RemoveDeadNode(N: Node);
6441     return;
6442   }
6443   case X86ISD::PCMPESTR: {
    // SSE4.2 explicit-length string compare. Unlike PCMPISTR, the operand
    // lengths are passed implicitly in EAX and EDX, so those copies are
    // emitted first and glued into the compare instruction(s).
6444     if (!Subtarget->hasSSE42())
6445       break;
6446
6447     // Copy the two implicit register inputs.
6448     SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX,
6449                                           N: Node->getOperand(Num: 1),
6450                                           Glue: SDValue()).getValue(R: 1);
6451     InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX,
6452                                   N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1);
6453
6454     bool NeedIndex = !SDValue(Node, 0).use_empty();
6455     bool NeedMask = !SDValue(Node, 1).use_empty();
6456     // We can't fold a load if we are going to make two instructions.
6457     bool MayFoldLoad = !NeedIndex || !NeedMask;
6458
6459     MachineSDNode *CNode;
6460     if (NeedMask) {
6461       unsigned ROpc =
6462           Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6463       unsigned MOpc =
6464           Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6465       CNode =
6466           emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue);
6467       ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6468     }
    // Emit PCMPESTRI when the index is used, or as the sole instruction when
    // neither result is used (flags still need a producer).
6469     if (NeedIndex || !NeedMask) {
6470       unsigned ROpc =
6471           Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6472       unsigned MOpc =
6473           Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6474       CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue);
6475       ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6476     }
6477     // Connect the flag usage to the last instruction created.
6478     ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6479     CurDAG->RemoveDeadNode(N: Node);
6480     return;
6481   }
6482
6483   case ISD::SETCC: {
    // Try to select a vector compare into an AVX-512 VPTESTM; otherwise fall
    // through to table-generated selection.
6484     if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue()))
6485       return;
6486
6487     break;
6488   }
6489
6490   case ISD::STORE:
    // Try to merge a load-op-store sequence into a single RMW memory
    // instruction; otherwise fall through to table-generated selection.
6491     if (foldLoadStoreIntoMemOperand(Node))
6492       return;
6493     break;
6494
6495   case X86ISD::SETCC_CARRY: {
    // Materialize all-ones/all-zeros from the carry flag. Uses SETB_C*r
    // (sbb reg,reg pseudo) when the target breaks the SBB dependency,
    // otherwise an explicitly zeroed SBB via getSBBZero.
6496     MVT VT = Node->getSimpleValueType(ResNo: 0);
6497     SDValue Result;
6498     if (Subtarget->hasSBBDepBreaking()) {
6499       // We have to do this manually because tblgen will put the eflags copy in
6500       // the wrong place if we use an extract_subreg in the pattern.
6501       // Copy flags to the EFLAGS register and glue it to next node.
6502       SDValue EFLAGS =
6503           CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
6504                                N: Node->getOperand(Num: 1), Glue: SDValue());
6505
6506       // Create a 64-bit instruction if the result is 64-bits otherwise use the
6507       // 32-bit version.
6508       unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6509       MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6510       Result = SDValue(
6511           CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)),
6512           0);
6513     } else {
6514       // The target does not recognize sbb with the same reg operand as a
6515       // no-source idiom, so we explicitly zero the input values.
6516       Result = getSBBZero(N: Node);
6517     }
6518
6519     // For less than 32-bits we need to extract from the 32-bit node.
6520     if (VT == MVT::i8 || VT == MVT::i16) {
6521       int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6522       Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6523     }
6524
6525     ReplaceUses(F: SDValue(Node, 0), T: Result);
6526     CurDAG->RemoveDeadNode(N: Node);
6527     return;
6528   }
6529   case X86ISD::SBB: {
    // Special-case (sbb 0, 0, carry): select via getSBBZero, which emits the
    // zeroed-input SBB form. Other SBB shapes fall through to tablegen.
6530     if (isNullConstant(V: Node->getOperand(Num: 0)) &&
6531         isNullConstant(V: Node->getOperand(Num: 1))) {
6532       SDValue Result = getSBBZero(N: Node);
6533
6534       // Replace the flag use.
6535       ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1));
6536
6537       // Replace the result use.
6538       if (!SDValue(Node, 0).use_empty()) {
6539         // For less than 32-bits we need to extract from the 32-bit node.
6540         MVT VT = Node->getSimpleValueType(ResNo: 0);
6541         if (VT == MVT::i8 || VT == MVT::i16) {
6542           int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6543           Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6544         }
6545         ReplaceUses(F: SDValue(Node, 0), T: Result);
6546       }
6547
6548       CurDAG->RemoveDeadNode(N: Node);
6549       return;
6550     }
6551     break;
6552   }
6553   case X86ISD::MGATHER: {
    // Select a masked gather. Picks an AVX-512 (k-mask, i1 mask elements) or
    // AVX2 (vector mask) gather opcode by index VT, element count and element
    // size, then builds the machine node with the selected vector address.
6554     auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node);
6555     SDValue IndexOp = Mgt->getIndex();
6556     SDValue Mask = Mgt->getMask();
6557     MVT IndexVT = IndexOp.getSimpleValueType();
6558     MVT ValueVT = Node->getSimpleValueType(ResNo: 0);
6559     MVT MaskVT = Mask.getSimpleValueType();
6560
6561     // This is just to prevent crashes if the nodes are malformed somehow. We're
6562     // otherwise only doing loose type checking in here based on type what
6563     // a type constraint would say just like table based isel.
6564     if (!ValueVT.isVector() || !MaskVT.isVector())
6565       break;
6566
6567     unsigned NumElts = ValueVT.getVectorNumElements();
6568     MVT ValueSVT = ValueVT.getVectorElementType();
6569
6570     bool IsFP = ValueSVT.isFloatingPoint();
6571     unsigned EltSize = ValueSVT.getSizeInBits();
6572
    // Opc stays 0 if no (IndexVT, NumElts, EltSize) combination matches,
    // in which case we bail out below.
6573     unsigned Opc = 0;
6574     bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6575     if (AVX512Gather) {
6576       if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6577         Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6578       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6579         Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6580       else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6581         Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6582       else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6583         Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6584       else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6585         Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6586       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6587         Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6588       else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6589         Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6590       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6591         Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6592       else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6593         Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6594       else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6595         Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6596       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6597         Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6598       else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6599         Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6600     } else {
6601       assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6602              "Unexpected mask VT!");
6603       if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6604         Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6605       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6606         Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6607       else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6608         Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6609       else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6610         Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6611       else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6612         Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6613       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6614         Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6615       else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6616         Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6617       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6618         Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6619     }
6620
6621     if (!Opc)
6622       break;
6623
6624     SDValue Base, Scale, Index, Disp, Segment;
6625     if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(),
6626                           Base, Scale, Index, Disp, Segment))
6627       break;
6628
6629     SDValue PassThru = Mgt->getPassThru();
6630     SDValue Chain = Mgt->getChain();
6631     // Gather instructions have a mask output not in the ISD node.
6632     SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other);
6633
    // The AVX-512 and AVX2 forms take their operands in different orders
    // (mask position differs).
6634     MachineSDNode *NewNode;
6635     if (AVX512Gather) {
6636       SDValue Ops[] = {PassThru, Mask, Base, Scale,
6637                        Index, Disp, Segment, Chain};
6638       NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6639     } else {
6640       SDValue Ops[] = {PassThru, Base, Scale, Index,
6641                        Disp, Segment, Mask, Chain};
6642       NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6643     }
6644     CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()});
    // Result 0 is the gathered value; result 2 is the chain. The machine
    // node's extra mask output (result 1) has no ISD counterpart.
6645     ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6646     ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2));
6647     CurDAG->RemoveDeadNode(N: Node);
6648     return;
6649   }
6650   case X86ISD::MSCATTER: {
    // Select an AVX-512 masked scatter. Opcode is chosen by index VT, element
    // count and element size; falls through to default selection if no
    // combination matches or the vector address can't be formed.
6651     auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node);
6652     SDValue Value = Sc->getValue();
6653     SDValue IndexOp = Sc->getIndex();
6654     MVT IndexVT = IndexOp.getSimpleValueType();
6655     MVT ValueVT = Value.getSimpleValueType();
6656
6657     // This is just to prevent crashes if the nodes are malformed somehow. We're
6658     // otherwise only doing loose type checking in here based on type what
6659     // a type constraint would say just like table based isel.
6660     if (!ValueVT.isVector())
6661       break;
6662
6663     unsigned NumElts = ValueVT.getVectorNumElements();
6664     MVT ValueSVT = ValueVT.getVectorElementType();
6665
6666     bool IsFP = ValueSVT.isFloatingPoint();
6667     unsigned EltSize = ValueSVT.getSizeInBits();
6668
6669     unsigned Opc;
6670     if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6671       Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6672     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6673       Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6674     else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6675       Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6676     else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6677       Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6678     else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6679       Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6680     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6681       Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6682     else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6683       Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6684     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6685       Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6686     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6687       Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6688     else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6689       Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6690     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6691       Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6692     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6693       Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6694     else
6695       break;
6696
6697     SDValue Base, Scale, Index, Disp, Segment;
6698     if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(),
6699                           Base, Scale, Index, Disp, Segment))
6700       break;
6701
6702     SDValue Mask = Sc->getMask();
6703     SDValue Chain = Sc->getChain();
6704     // Scatter instructions have a mask output not in the ISD node.
6705     SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other);
6706     SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6707
6708     MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6709     CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()});
    // The ISD node's only result is the chain, which is the machine node's
    // result 1 (result 0 is the extra mask output).
6710     ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1));
6711     CurDAG->RemoveDeadNode(N: Node);
6712     return;
6713   }
6714 case ISD::PREALLOCATED_SETUP: {
6715 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6716 auto CallId = MFI->getPreallocatedIdForCallSite(
6717 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6718 SDValue Chain = Node->getOperand(Num: 0);
6719 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6720 MachineSDNode *New = CurDAG->getMachineNode(
6721 Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain);
6722 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain
6723 CurDAG->RemoveDeadNode(N: Node);
6724 return;
6725 }
6726 case ISD::PREALLOCATED_ARG: {
6727 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6728 auto CallId = MFI->getPreallocatedIdForCallSite(
6729 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6730 SDValue Chain = Node->getOperand(Num: 0);
6731 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6732 SDValue ArgIndex = Node->getOperand(Num: 2);
6733 SDValue Ops[3];
6734 Ops[0] = CallIdValue;
6735 Ops[1] = ArgIndex;
6736 Ops[2] = Chain;
6737 MachineSDNode *New = CurDAG->getMachineNode(
6738 Opcode: TargetOpcode::PREALLOCATED_ARG, dl,
6739 VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()),
6740 VT2: MVT::Other),
6741 Ops);
6742 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer
6743 ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain
6744 CurDAG->RemoveDeadNode(N: Node);
6745 return;
6746 }
6747 case X86ISD::AESENCWIDE128KL:
6748 case X86ISD::AESDECWIDE128KL:
6749 case X86ISD::AESENCWIDE256KL:
6750 case X86ISD::AESDECWIDE256KL: {
6751 if (!Subtarget->hasWIDEKL())
6752 break;
6753
6754 unsigned Opcode;
6755 switch (Node->getOpcode()) {
6756 default:
6757 llvm_unreachable("Unexpected opcode!");
6758 case X86ISD::AESENCWIDE128KL:
6759 Opcode = X86::AESENCWIDE128KL;
6760 break;
6761 case X86ISD::AESDECWIDE128KL:
6762 Opcode = X86::AESDECWIDE128KL;
6763 break;
6764 case X86ISD::AESENCWIDE256KL:
6765 Opcode = X86::AESENCWIDE256KL;
6766 break;
6767 case X86ISD::AESDECWIDE256KL:
6768 Opcode = X86::AESDECWIDE256KL;
6769 break;
6770 }
6771
6772 SDValue Chain = Node->getOperand(Num: 0);
6773 SDValue Addr = Node->getOperand(Num: 1);
6774
6775 SDValue Base, Scale, Index, Disp, Segment;
6776 if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment))
6777 break;
6778
6779 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2),
6780 Glue: SDValue());
6781 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3),
6782 Glue: Chain.getValue(R: 1));
6783 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4),
6784 Glue: Chain.getValue(R: 1));
6785 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5),
6786 Glue: Chain.getValue(R: 1));
6787 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6),
6788 Glue: Chain.getValue(R: 1));
6789 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7),
6790 Glue: Chain.getValue(R: 1));
6791 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8),
6792 Glue: Chain.getValue(R: 1));
6793 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9),
6794 Glue: Chain.getValue(R: 1));
6795
6796 MachineSDNode *Res = CurDAG->getMachineNode(
6797 Opcode, dl, VTs: Node->getVTList(),
6798 Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)});
6799 CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand());
6800 ReplaceNode(F: Node, T: Res);
6801 return;
6802 }
6803 case X86ISD::POP_FROM_X87_REG: {
6804 SDValue Chain = Node->getOperand(Num: 0);
6805 Register Reg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1))->getReg();
6806 SDValue Glue;
6807 if (Node->getNumValues() == 3)
6808 Glue = Node->getOperand(Num: 2);
6809 SDValue Copy =
6810 CurDAG->getCopyFromReg(Chain, dl, Reg, VT: Node->getValueType(ResNo: 0), Glue);
6811 ReplaceNode(F: Node, T: Copy.getNode());
6812 return;
6813 }
6814 }
6815
6816 SelectCode(N: Node);
6817}
6818
6819bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6820 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6821 std::vector<SDValue> &OutOps) {
6822 SDValue Op0, Op1, Op2, Op3, Op4;
6823 switch (ConstraintID) {
6824 default:
6825 llvm_unreachable("Unexpected asm memory constraint");
6826 case InlineAsm::ConstraintCode::o: // offsetable ??
6827 case InlineAsm::ConstraintCode::v: // not offsetable ??
6828 case InlineAsm::ConstraintCode::m: // memory
6829 case InlineAsm::ConstraintCode::X:
6830 case InlineAsm::ConstraintCode::p: // address
6831 if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4))
6832 return true;
6833 break;
6834 }
6835
6836 OutOps.push_back(x: Op0);
6837 OutOps.push_back(x: Op1);
6838 OutOps.push_back(x: Op2);
6839 OutOps.push_back(x: Op3);
6840 OutOps.push_back(x: Op4);
6841 return false;
6842}
6843
// New-pass-manager entry point: wrap the DAG ISel implementation in the
// generic SelectionDAGISelPass, using the target machine's configured
// optimization level.
X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
6847
6848/// This pass converts a legalized DAG into a X86-specific DAG,
6849/// ready for instruction scheduling.
6850FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6851 CodeGenOptLevel OptLevel) {
6852 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6853}
6854