1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
16#include "X86MachineFunctionInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/KnownBits.h"
33#include "llvm/Support/MathExtras.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(Val: true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
46
47static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(Val: true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
51extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
  /// This corresponds to X86AddressMode, but uses SDValues instead of register
  /// numbers for the leaves of the matched tree.
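  /// An x86 memory operand has the general form
  ///   Segment:[Base_Reg + Scale * IndexReg + Disp]
  /// plus an optional symbolic displacement (GV, CP, ES, JT, ...); e.g.
  /// "movl 8(%rdi,%rcx,4), %eax" has Base=%rdi, Index=%rcx, Scale=4, Disp=8.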
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 Kind: "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
188 }
189
190 void emitFunctionEntryCode() override;
191
192 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
193
194 void PreprocessISelDAG() override;
195 void PostprocessISelDAG() override;
196
197// Include the pieces autogenerated from the target description.
198#include "X86GenDAGISel.inc"
199
200 private:
201 void Select(SDNode *N) override;
202
203 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
204 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
205 bool AllowSegmentRegForX32 = false);
206 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
207 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
208 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
209 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
210 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
211 unsigned Depth);
212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
217 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
218 SDValue &Scale, SDValue &Index, SDValue &Disp,
219 SDValue &Segment);
220 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
221 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
222 SDValue &Index, SDValue &Disp, SDValue &Segment);
223 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
224 bool selectLEAAddr(SDValue N, SDValue &Base,
225 SDValue &Scale, SDValue &Index, SDValue &Disp,
226 SDValue &Segment);
227 bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
228 SDValue &Index, SDValue &Disp, SDValue &Segment);
229 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectRelocImm(SDValue N, SDValue &Op);
233
234 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
235 SDValue &Base, SDValue &Scale,
236 SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238
239 // Convenience method where P is also root.
240 bool tryFoldLoad(SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment) {
244 return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment);
245 }
246
247 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
248 SDValue &Base, SDValue &Scale,
249 SDValue &Index, SDValue &Disp,
250 SDValue &Segment);
251
252 bool isProfitableToFormMaskedOp(SDNode *N) const;
253
254 /// Implement addressing mode selection for inline asm expressions.
255 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
256 InlineAsm::ConstraintCode ConstraintID,
257 std::vector<SDValue> &OutOps) override;
258
259 void emitSpecialCodeForMain();
260
261 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
262 MVT VT, SDValue &Base, SDValue &Scale,
263 SDValue &Index, SDValue &Disp,
264 SDValue &Segment) {
265 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
266 Base = CurDAG->getTargetFrameIndex(
267 FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout()));
268 else if (AM.Base_Reg.getNode())
269 Base = AM.Base_Reg;
270 else
271 Base = CurDAG->getRegister(Reg: 0, VT);
272
273 Scale = getI8Imm(Imm: AM.Scale, DL);
274
275#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
276 // Negate the index if needed.
277 if (AM.NegateIndex) {
278 unsigned NegOpc;
279 switch (VT.SimpleTy) {
280 default:
281 llvm_unreachable("Unsupported VT!");
282 case MVT::i64:
283 NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
284 break;
285 case MVT::i32:
286 NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
287 break;
288 case MVT::i16:
289 NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
290 break;
291 case MVT::i8:
292 NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
293 break;
294 }
295 SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32,
296 Ops: AM.IndexReg), 0);
297 AM.IndexReg = Neg;
298 }
299
300 if (AM.IndexReg.getNode())
301 Index = AM.IndexReg;
302 else
303 Index = CurDAG->getRegister(Reg: 0, VT);
304
305 // These are 32-bit even in 64-bit mode since RIP-relative offset
306 // is 32-bit.
307 if (AM.GV)
308 Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(),
309 VT: MVT::i32, offset: AM.Disp,
310 TargetFlags: AM.SymbolFlags);
311 else if (AM.CP)
312 Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment,
313 Offset: AM.Disp, TargetFlags: AM.SymbolFlags);
314 else if (AM.ES) {
315 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
316 Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
317 } else if (AM.MCSym) {
318 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
319 assert(AM.SymbolFlags == 0 && "oo");
320 Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32);
321 } else if (AM.JT != -1) {
322 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
323 Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
324 } else if (AM.BlockAddr)
325 Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp,
326 TargetFlags: AM.SymbolFlags);
327 else
328 Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32);
329
330 if (AM.Segment.getNode())
331 Segment = AM.Segment;
332 else
333 Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
334 }
335
    // Utility function to determine whether N is an AMX SDNode right after
    // lowering but before ISel.
338 bool isAMXSDNode(SDNode *N) const {
339 // Check if N is AMX SDNode:
340 // 1. check result type;
341 // 2. check operand type;
342 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
343 if (N->getValueType(ResNo: Idx) == MVT::x86amx)
344 return true;
345 }
346 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
347 SDValue Op = N->getOperand(Num: Idx);
348 if (Op.getValueType() == MVT::x86amx)
349 return true;
350 }
351 return false;
352 }
353
    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
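    // For example, if the same 32-bit immediate is both stored to memory and
    // added to a value within one block, materializing it once in a register
    // and reusing that register can be smaller than encoding the immediate in
    // both instructions.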
360 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
361 uint32_t UseCount = 0;
362
363 // Do not want to hoist if we're not optimizing for size.
364 // TODO: We'd like to remove this restriction.
365 // See the comment in X86InstrInfo.td for more info.
366 if (!CurDAG->shouldOptForSize())
367 return false;
368
369 // Walk all the users of the immediate.
370 for (const SDNode *User : N->users()) {
371 if (UseCount >= 2)
372 break;
373
374 // This user is already selected. Count it as a legitimate use and
375 // move on.
376 if (User->isMachineOpcode()) {
377 UseCount++;
378 continue;
379 }
380
381 // We want to count stores of immediates as real uses.
382 if (User->getOpcode() == ISD::STORE &&
383 User->getOperand(Num: 1).getNode() == N) {
384 UseCount++;
385 continue;
386 }
387
      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above).
      // Those instructions won't match in ISel, for now, and would
      // be counted incorrectly.
392 // This may change in the future as we add additional instruction
393 // types.
394 if (User->getNumOperands() != 2)
395 continue;
396
397 // If this is a sign-extended 8-bit integer immediate used in an ALU
398 // instruction, there is probably an opcode encoding to save space.
399 auto *C = dyn_cast<ConstantSDNode>(Val: N);
400 if (C && isInt<8>(x: C->getSExtValue()))
401 continue;
402
403 // Immediates that are used for offsets as part of stack
404 // manipulation should be left alone. These are typically
405 // used to indicate SP offsets for argument passing and
406 // will get pulled into stores/pushes (implicitly).
407 if (User->getOpcode() == X86ISD::ADD ||
408 User->getOpcode() == ISD::ADD ||
409 User->getOpcode() == X86ISD::SUB ||
410 User->getOpcode() == ISD::SUB) {
411
412 // Find the other operand of the add/sub.
413 SDValue OtherOp = User->getOperand(Num: 0);
414 if (OtherOp.getNode() == N)
415 OtherOp = User->getOperand(Num: 1);
416
417 // Don't count if the other operand is SP.
418 RegisterSDNode *RegNode;
419 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
420 (RegNode = dyn_cast_or_null<RegisterSDNode>(
421 Val: OtherOp->getOperand(Num: 1).getNode())))
422 if ((RegNode->getReg() == X86::ESP) ||
423 (RegNode->getReg() == X86::RSP))
424 continue;
425 }
426
427 // ... otherwise, count this and move on.
428 UseCount++;
429 }
430
431 // If we have more than 1 use, then recommend for hoisting.
432 return (UseCount > 1);
433 }
434
435 /// Return a target constant with the specified value of type i8.
436 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
437 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
438 }
439
440 /// Return a target constant with the specified value, of type i32.
441 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
442 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32);
443 }
444
445 /// Return a target constant with the specified value, of type i64.
446 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
447 return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64);
448 }
449
450 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
451 const SDLoc &DL) {
452 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
453 uint64_t Index = N->getConstantOperandVal(Num: 1);
454 MVT VecVT = N->getOperand(Num: 0).getSimpleValueType();
455 return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
456 }
457
458 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
459 const SDLoc &DL) {
460 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
461 uint64_t Index = N->getConstantOperandVal(Num: 2);
462 MVT VecVT = N->getSimpleValueType(ResNo: 0);
463 return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
464 }
465
466 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
467 const SDLoc &DL) {
468 assert(VecWidth == 128 && "Unexpected vector width");
469 uint64_t Index = N->getConstantOperandVal(Num: 2);
470 MVT VecVT = N->getSimpleValueType(ResNo: 0);
471 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
472 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
473 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
474 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
475 return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL);
476 }
477
478 SDValue getSBBZero(SDNode *N) {
479 SDLoc dl(N);
480 MVT VT = N->getSimpleValueType(ResNo: 0);
481
482 // Create zero.
483 SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
484 SDValue Zero =
485 SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
486 if (VT == MVT::i64) {
487 Zero = SDValue(
488 CurDAG->getMachineNode(
489 Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64,
490 Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero,
491 Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
492 0);
493 }
494
      // Copy flags to the EFLAGS register and glue it to the next node.
496 unsigned Opcode = N->getOpcode();
497 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
498 "Unexpected opcode for SBB materialization");
499 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
500 SDValue EFLAGS =
501 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
502 N: N->getOperand(Num: FlagOpIndex), Glue: SDValue());
503
      // Create a 64-bit instruction if the result is 64 bits, otherwise use
      // the 32-bit version.
506 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
507 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
508 VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32);
509 return SDValue(
510 CurDAG->getMachineNode(Opcode: Opc, dl, VTs,
511 Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}),
512 0);
513 }
514
    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
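    // e.g. a 32-bit shift only uses the low 5 bits of its amount, so an
    // "(and X, 31)" on the amount (Width == 5) can be removed.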
517 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
518 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
519 const APInt &Val = N->getConstantOperandAPInt(Num: 1);
520
521 if (Val.countr_one() >= Width)
522 return true;
523
524 APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
525 return Mask.countr_one() >= Width;
526 }
527
528 /// Return an SDNode that returns the value of the global base register.
529 /// Output instructions required to initialize the global base register,
530 /// if necessary.
531 SDNode *getGlobalBaseReg();
532
533 /// Return a reference to the TargetMachine, casted to the target-specific
534 /// type.
535 const X86TargetMachine &getTargetMachine() const {
536 return static_cast<const X86TargetMachine &>(TM);
537 }
538
539 /// Return a reference to the TargetInstrInfo, casted to the target-specific
540 /// type.
541 const X86InstrInfo *getInstrInfo() const {
542 return Subtarget->getInstrInfo();
543 }
544
    /// Return the condition code of the given SDNode.
546 X86::CondCode getCondFromNode(SDNode *N) const;
547
548 /// Address-mode matching performs shift-of-and to and-of-shift
549 /// reassociation in order to expose more scaled addressing
550 /// opportunities.
551 bool ComplexPatternFuncMutatesDAG() const override {
552 return true;
553 }
554
555 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
556
557 // Indicates we should prefer to use a non-temporal load for this load.
558 bool useNonTemporalLoad(LoadSDNode *N) const {
559 if (!N->isNonTemporal())
560 return false;
561
562 unsigned StoreSize = N->getMemoryVT().getStoreSize();
563
564 if (N->getAlign().value() < StoreSize)
565 return false;
566
567 switch (StoreSize) {
568 default: llvm_unreachable("Unsupported store size");
569 case 4:
570 case 8:
571 return false;
572 case 16:
573 return Subtarget->hasSSE41();
574 case 32:
575 return Subtarget->hasAVX2();
576 case 64:
577 return Subtarget->hasAVX512();
578 }
579 }
580
581 bool foldLoadStoreIntoMemOperand(SDNode *Node);
582 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
583 bool matchBitExtract(SDNode *Node);
584 bool shrinkAndImmediate(SDNode *N);
585 bool isMaskZeroExtended(SDNode *N) const;
586 bool tryShiftAmountMod(SDNode *N);
587 bool tryShrinkShlLogicImm(SDNode *N);
588 bool tryVPTERNLOG(SDNode *N);
589 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
590 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
591 uint8_t Imm);
592 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
593 bool tryMatchBitSelect(SDNode *N);
594
595 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
596 const SDLoc &dl, MVT VT, SDNode *Node);
597 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
598 const SDLoc &dl, MVT VT, SDNode *Node,
599 SDValue &InGlue);
600
601 bool tryOptimizeRem8Extend(SDNode *N);
602
603 bool onlyUsesZeroFlag(SDValue Flags) const;
604 bool hasNoSignFlagUses(SDValue Flags) const;
605 bool hasNoCarryFlagUses(SDValue Flags) const;
606 };
607
608 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
609 public:
610 static char ID;
611 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
612 CodeGenOptLevel OptLevel)
613 : SelectionDAGISelLegacy(
614 ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {}
615 };
616}
617
618char X86DAGToDAGISelLegacy::ID = 0;
619
620INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
621
622// Returns true if this masked compare can be implemented legally with this
623// type.
624static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
625 unsigned Opcode = N->getOpcode();
626 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
627 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
628 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
629 // We can get 256-bit 8 element types here without VLX being enabled. When
630 // this happens we will use 512-bit operations and the mask will not be
631 // zero extended.
632 EVT OpVT = N->getOperand(Num: 0).getValueType();
633 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
634 // second operand.
635 if (Opcode == X86ISD::STRICT_CMPM)
636 OpVT = N->getOperand(Num: 1).getValueType();
637 if (OpVT.is256BitVector() || OpVT.is128BitVector())
638 return Subtarget->hasVLX();
639
640 return true;
641 }
642 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
643 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
644 Opcode == X86ISD::FSETCCM_SAE)
645 return true;
646
647 return false;
648}
649
650// Returns true if we can assume the writer of the mask has zero extended it
651// for us.
652bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
653 // If this is an AND, check if we have a compare on either side. As long as
654 // one side guarantees the mask is zero extended, the AND will preserve those
655 // zeros.
656 if (N->getOpcode() == ISD::AND)
657 return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) ||
658 isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget);
659
660 return isLegalMaskCompare(N, Subtarget);
661}
662
663bool
664X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
665 if (OptLevel == CodeGenOptLevel::None)
666 return false;
667
668 if (!N.hasOneUse())
669 return false;
670
671 if (N.getOpcode() != ISD::LOAD)
672 return true;
673
674 // Don't fold non-temporal loads if we have an instruction for them.
675 if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N)))
676 return false;
677
678 // If N is a load, do additional profitability checks.
679 if (U == Root) {
680 switch (U->getOpcode()) {
681 default: break;
682 case X86ISD::ADD:
683 case X86ISD::ADC:
684 case X86ISD::SUB:
685 case X86ISD::SBB:
686 case X86ISD::AND:
687 case X86ISD::XOR:
688 case X86ISD::OR:
689 case ISD::ADD:
690 case ISD::UADDO_CARRY:
691 case ISD::AND:
692 case ISD::OR:
693 case ISD::XOR: {
694 SDValue Op1 = U->getOperand(Num: 1);
695
      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      //   movl 4(%esp), %eax
      //   addl $4, %eax
      // vs.
      //   movl $4, %eax
      //   addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
706 if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) {
707 if (Imm->getAPIntValue().isSignedIntN(N: 8))
708 return false;
709
      // If this is a 64-bit AND with an immediate that fits in 32 bits,
      // prefer using the smaller AND over folding the load. This is needed to
      // make sure immediates created by shrinkAndImmediate are always folded.
713 // Ideally we would narrow the load during DAG combine and get the
714 // best of both worlds.
715 if (U->getOpcode() == ISD::AND &&
716 Imm->getAPIntValue().getBitWidth() == 64 &&
717 Imm->getAPIntValue().isIntN(N: 32))
718 return false;
719
      // If this is really a zext_inreg that can be represented with a movzx
      // instruction, prefer that.
722 // TODO: We could shrink the load and fold if it is non-volatile.
723 if (U->getOpcode() == ISD::AND &&
724 (Imm->getAPIntValue() == UINT8_MAX ||
725 Imm->getAPIntValue() == UINT16_MAX ||
726 Imm->getAPIntValue() == UINT32_MAX))
727 return false;
728
      // ADD/SUB can negate the immediate and use the opposite operation
      // to fit 128 into a sign-extended 8-bit immediate.
731 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
732 (-Imm->getAPIntValue()).isSignedIntN(N: 8))
733 return false;
734
735 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
736 (-Imm->getAPIntValue()).isSignedIntN(N: 8) &&
737 hasNoCarryFlagUses(Flags: SDValue(U, 1)))
738 return false;
739 }
740
741 // If the other operand is a TLS address, we should fold it instead.
742 // This produces
743 // movl %gs:0, %eax
744 // leal i@NTPOFF(%eax), %eax
745 // instead of
746 // movl $i@NTPOFF, %eax
747 // addl %gs:0, %eax
      // If the block also has an access to a second TLS address, this will
      // save a load.
750 // FIXME: This is probably also true for non-TLS addresses.
751 if (Op1.getOpcode() == X86ISD::Wrapper) {
752 SDValue Val = Op1.getOperand(i: 0);
753 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
754 return false;
755 }
756
757 // Don't fold load if this matches the BTS/BTR/BTC patterns.
758 // BTS: (or X, (shl 1, n))
759 // BTR: (and X, (rotl -2, n))
760 // BTC: (xor X, (shl 1, n))
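      // ((rotl -2, n) produces an all-ones value with only bit n clear,
      // i.e. the ~(1 << n) mask that BTR applies.)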
761 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
762 if (U->getOperand(Num: 0).getOpcode() == ISD::SHL &&
763 isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0)))
764 return false;
765
766 if (U->getOperand(Num: 1).getOpcode() == ISD::SHL &&
767 isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0)))
768 return false;
769 }
770 if (U->getOpcode() == ISD::AND) {
771 SDValue U0 = U->getOperand(Num: 0);
772 SDValue U1 = U->getOperand(Num: 1);
773 if (U0.getOpcode() == ISD::ROTL) {
774 auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0));
775 if (C && C->getSExtValue() == -2)
776 return false;
777 }
778
779 if (U1.getOpcode() == ISD::ROTL) {
780 auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0));
781 if (C && C->getSExtValue() == -2)
782 return false;
783 }
784 }
785
786 break;
787 }
788 case ISD::SHL:
789 case ISD::SRA:
790 case ISD::SRL:
791 // Don't fold a load into a shift by immediate. The BMI2 instructions
792 // support folding a load, but not an immediate. The legacy instructions
793 // support folding an immediate, but can't fold a load. Folding an
794 // immediate is preferable to folding a load.
795 if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
796 return false;
797
798 break;
799 }
800 }
801
  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
804 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
805 isNullConstant(V: Root->getOperand(Num: 2)) &&
806 (Root->getOperand(Num: 0).isUndef() ||
807 ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode())))
808 return false;
809
810 return true;
811}
812
// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a masked register-to-register move or vblendm, and the
// operation will be selected separately.
816bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
817 assert(
818 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
819 "Unexpected opcode!");
820
821 // If the operation has additional users, the operation will be duplicated.
822 // Check the use count to prevent that.
823 // FIXME: Are there cheap opcodes we might want to duplicate?
824 return N->getOperand(Num: 1).hasOneUse();
825}
826
/// Replace the original chain operand of the call with the load's chain
/// operand and move the load below the call's chain operand.
829static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
830 SDValue Call, SDValue OrigChain) {
831 SmallVector<SDValue, 8> Ops;
832 SDValue Chain = OrigChain.getOperand(i: 0);
833 if (Chain.getNode() == Load.getNode())
834 Ops.push_back(Elt: Load.getOperand(i: 0));
835 else {
836 assert(Chain.getOpcode() == ISD::TokenFactor &&
837 "Unexpected chain operand");
838 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
839 if (Chain.getOperand(i).getNode() == Load.getNode())
840 Ops.push_back(Elt: Load.getOperand(i: 0));
841 else
842 Ops.push_back(Elt: Chain.getOperand(i));
843 SDValue NewChain =
844 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops);
845 Ops.clear();
846 Ops.push_back(Elt: NewChain);
847 }
848 Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
849 CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
850 CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
851 Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));
852
853 Ops.clear();
854 Ops.push_back(Elt: SDValue(Load.getNode(), 1));
855 Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
856 CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
857}
858
/// Return true if the call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
861/// Return the CALLSEQ_START by reference as a second output.
862/// In the case of a tail call, there isn't a callseq node between the call
863/// chain and the load.
864static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After moveBelowOrigChain the load is moved between the call and
  // the chain; this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
869 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
870 return false;
871 auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
872 if (!LD ||
873 !LD->isSimple() ||
874 LD->getAddressingMode() != ISD::UNINDEXED ||
875 LD->getExtensionType() != ISD::NON_EXTLOAD)
876 return false;
877
878 // Now let's find the callseq_start.
879 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
880 if (!Chain.hasOneUse())
881 return false;
882 Chain = Chain.getOperand(i: 0);
883 }
884
885 if (!Chain.getNumOperands())
886 return false;
887 // Since we are not checking for AA here, conservatively abort if the chain
888 // writes to memory. It's not safe to move the callee (a load) across a store.
889 if (isa<MemSDNode>(Val: Chain.getNode()) &&
890 cast<MemSDNode>(Val: Chain.getNode())->writeMem())
891 return false;
892 if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
893 return true;
894 if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
895 Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
896 Callee.getValue(R: 1).hasOneUse())
897 return true;
898 return false;
899}
900
901static bool isEndbrImm64(uint64_t Imm) {
// There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
// e.g. 0xF3660F1EFA, 0xF3670F1EFA.
904 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
905 return false;
906
907 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
908 0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // The low 24 bits (0x0F1EFA) have already been matched.
910 while (i < 64) {
911 uint8_t Byte = (Imm >> i) & 0xFF;
912 if (Byte == 0xF3)
913 return true;
914 if (!llvm::is_contained(Range&: OptionalPrefixBytes, Element: Byte))
915 return false;
916 i += 8;
917 }
918
919 return false;
920}
921
922static bool needBWI(MVT VT) {
923 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
924}
925
926void X86DAGToDAGISel::PreprocessISelDAG() {
927 bool MadeChange = false;
928 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
929 E = CurDAG->allnodes_end(); I != E; ) {
930 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
931
    // This is a CET (Control-flow Enforcement Technology) hardening measure.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // We want to prevent attackers from finding unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    //   a = 0xF30F1EFA
    // it could, for example, generate:
    //   mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such constant
    // generation into multiple operations so the pattern never appears
    // in the binary.
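    // The rewrite below materializes the bitwise complement of the constant
    // (as an opaque constant) and re-inverts it with an explicit NOT node, so
    // the ENDBR byte pattern itself is never encoded as an immediate.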
947 if (N->getOpcode() == ISD::Constant) {
948 MVT VT = N->getSimpleValueType(ResNo: 0);
949 int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
950 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
951 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
952 // Check that the cf-protection-branch is enabled.
953 Metadata *CFProtectionBranch =
954 MF->getFunction().getParent()->getModuleFlag(
955 Key: "cf-protection-branch");
956 if (CFProtectionBranch || IndirectBranchTracking) {
957 SDLoc dl(N);
958 SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true);
959 Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT);
960 --I;
961 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement);
962 ++I;
963 MadeChange = true;
964 continue;
965 }
966 }
967 }
968
969 // If this is a target specific AND node with no flag usages, turn it back
970 // into ISD::AND to enable test instruction matching.
971 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) {
972 SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
973 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
974 --I;
975 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
976 ++I;
977 MadeChange = true;
978 continue;
979 }
980
981 // Convert vector increment or decrement to sub/add with an all-ones
982 // constant:
983 // add X, <1, 1...> --> sub X, <-1, -1...>
984 // sub X, <1, 1...> --> add X, <-1, -1...>
985 // The all-ones vector constant can be materialized using a pcmpeq
986 // instruction that is commonly recognized as an idiom (has no register
987 // dependency), so that's better/smaller than loading a splat 1 constant.
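    // (For a v4i32 add, for example, "add X, <1,1,1,1>" becomes a psubd whose
    // all-ones operand can come from "pcmpeqd %xmm, %xmm".)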
988 //
989 // But don't do this if it would inhibit a potentially profitable load
990 // folding opportunity for the other operand. That only occurs with the
991 // intersection of:
992 // (1) The other operand (op0) is load foldable.
993 // (2) The op is an add (otherwise, we are *creating* an add and can still
994 // load fold the other op).
995 // (3) The target has AVX (otherwise, we have a destructive add and can't
996 // load fold the other op without killing the constant op).
997 // (4) The constant 1 vector has multiple uses (so it is profitable to load
998 // into a register anyway).
999 auto mayPreventLoadFold = [&]() {
1000 return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) &&
1001 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1002 !N->getOperand(Num: 1).hasOneUse();
1003 };
1004 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1005 N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) {
1006 APInt SplatVal;
1007 if (!ISD::isBuildVectorOfConstantSDNodes(
1008 N: peekThroughBitcasts(V: N->getOperand(Num: 0)).getNode()) &&
1009 X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) &&
1010 SplatVal.isOne()) {
1011 SDLoc DL(N);
1012
1013 MVT VT = N->getSimpleValueType(ResNo: 0);
1014 unsigned NumElts = VT.getSizeInBits() / 32;
1015 SDValue AllOnes =
1016 CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts));
1017 AllOnes = CurDAG->getBitcast(VT, V: AllOnes);
1018
1019 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1020 SDValue Res =
1021 CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes);
1022 --I;
1023 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1024 ++I;
1025 MadeChange = true;
1026 continue;
1027 }
1028 }
1029
1030 switch (N->getOpcode()) {
1031 case X86ISD::VBROADCAST: {
1032 MVT VT = N->getSimpleValueType(ResNo: 0);
1033 // Emulate v32i16/v64i8 broadcast without BWI.
1034 if (!Subtarget->hasBWI() && needBWI(VT)) {
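        // Broadcast into the half-width type (e.g. v16i16 for v32i16) and
        // insert that value into both halves of the full-width vector.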
1035 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1036 SDLoc dl(N);
1037 SDValue NarrowBCast =
1038 CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0));
1039 SDValue Res =
1040 CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
1041 N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1042 unsigned Index = NarrowVT.getVectorMinNumElements();
1043 Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
1044 N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));
1045
1046 --I;
1047 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1048 ++I;
1049 MadeChange = true;
1050 continue;
1051 }
1052
1053 break;
1054 }
1055 case X86ISD::VBROADCAST_LOAD: {
1056 MVT VT = N->getSimpleValueType(ResNo: 0);
1057 // Emulate v32i16/v64i8 broadcast without BWI.
1058 if (!Subtarget->hasBWI() && needBWI(VT)) {
1059 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1060 auto *MemNode = cast<MemSDNode>(Val: N);
1061 SDLoc dl(N);
1062 SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other);
1063 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1064 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1065 Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(),
1066 MMO: MemNode->getMemOperand());
1067 SDValue Res =
1068 CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
1069 N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1070 unsigned Index = NarrowVT.getVectorMinNumElements();
1071 Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
1072 N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));
1073
1074 --I;
1075 SDValue To[] = {Res, NarrowBCast.getValue(R: 1)};
1076 CurDAG->ReplaceAllUsesWith(From: N, To);
1077 ++I;
1078 MadeChange = true;
1079 continue;
1080 }
1081
1082 break;
1083 }
1084 case ISD::LOAD: {
1085 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1086 // load, then just extract the lower subvector and avoid the second load.
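      // e.g. a v4i32 load whose pointer also feeds an existing v16i32 load on
      // the same chain can reuse the wider load via EXTRACT_SUBVECTOR.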
1087 auto *Ld = cast<LoadSDNode>(Val: N);
1088 MVT VT = N->getSimpleValueType(ResNo: 0);
1089 if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() ||
1090 !(VT.is128BitVector() || VT.is256BitVector()))
1091 break;
1092
1093 MVT MaxVT = VT;
1094 SDNode *MaxLd = nullptr;
1095 SDValue Ptr = Ld->getBasePtr();
1096 SDValue Chain = Ld->getChain();
1097 for (SDNode *User : Ptr->users()) {
1098 auto *UserLd = dyn_cast<LoadSDNode>(Val: User);
1099 MVT UserVT = User->getSimpleValueType(ResNo: 0);
1100 if (User != N && UserLd && ISD::isNormalLoad(N: User) &&
1101 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1102 !User->hasAnyUseOfValue(Value: 1) &&
1103 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1104 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1105 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1106 MaxLd = User;
1107 MaxVT = UserVT;
1108 }
1109 }
1110 if (MaxLd) {
1111 SDLoc dl(N);
1112 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1113 MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts);
1114 SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
1115 N1: SDValue(MaxLd, 0),
1116 N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1117 SDValue Res = CurDAG->getBitcast(VT, V: Extract);
1118
1119 --I;
1120 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1121 CurDAG->ReplaceAllUsesWith(From: N, To);
1122 ++I;
1123 MadeChange = true;
1124 continue;
1125 }
1126 break;
1127 }
1128 case ISD::VSELECT: {
      // Lower VSELECT nodes with non-mask conditions to BLENDV/VPTERNLOG.
1130 EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType();
1131 if (EleVT == MVT::i1)
1132 break;
1133
1134 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1135 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1136 "We can't replace VSELECT with BLENDV in vXi16!");
1137 SDValue R;
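      // If every condition element is known to be all-ones or all-zeros (i.e.
      // all of its bits are sign bits), VPTERNLOG with immediate 0xCA gives a
      // bitwise select "Cond ? Op1 : Op2"; otherwise fall back to BLENDV.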
1138 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) ==
1139 EleVT.getSizeInBits()) {
1140 R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1141 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2),
1142 N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8));
1143 } else {
1144 R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1145 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1),
1146 N3: N->getOperand(Num: 2));
1147 }
1148 --I;
1149 CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode());
1150 ++I;
1151 MadeChange = true;
1152 continue;
1153 }
1154 case ISD::FP_ROUND:
1155 case ISD::STRICT_FP_ROUND:
1156 case ISD::FP_TO_SINT:
1157 case ISD::FP_TO_UINT:
1158 case ISD::STRICT_FP_TO_SINT:
1159 case ISD::STRICT_FP_TO_UINT: {
1160 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1161 // don't need 2 sets of patterns.
1162 if (!N->getSimpleValueType(ResNo: 0).isVector())
1163 break;
1164
1165 unsigned NewOpc;
1166 switch (N->getOpcode()) {
1167 default: llvm_unreachable("Unexpected opcode!");
1168 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1169 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1170 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1171 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1172 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1173 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1174 }
1175 SDValue Res;
1176 if (N->isStrictFPOpcode())
1177 Res =
1178 CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
1179 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
1180 else
1181 Res =
1182 CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1183 Operand: N->getOperand(Num: 0));
1184 --I;
1185 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1186 ++I;
1187 MadeChange = true;
1188 continue;
1189 }
1190 case ISD::SHL:
1191 case ISD::SRA:
1192 case ISD::SRL: {
1193 // Replace vector shifts with their X86 specific equivalent so we don't
1194 // need 2 sets of patterns.
1195 if (!N->getValueType(ResNo: 0).isVector())
1196 break;
1197
1198 unsigned NewOpc;
1199 switch (N->getOpcode()) {
1200 default: llvm_unreachable("Unexpected opcode!");
1201 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1202 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1203 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1204 }
1205 SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1206 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
1207 --I;
1208 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1209 ++I;
1210 MadeChange = true;
1211 continue;
1212 }
1213 case ISD::ANY_EXTEND:
1214 case ISD::ANY_EXTEND_VECTOR_INREG: {
1215 // Replace vector any extend with the zero extend equivalents so we don't
1216 // need 2 sets of patterns. Ignore vXi1 extensions.
1217 if (!N->getValueType(ResNo: 0).isVector())
1218 break;
1219
1220 unsigned NewOpc;
1221 if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) {
1222 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1223 "Unexpected opcode for mask vector!");
1224 NewOpc = ISD::SIGN_EXTEND;
1225 } else {
1226 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1227 ? ISD::ZERO_EXTEND
1228 : ISD::ZERO_EXTEND_VECTOR_INREG;
1229 }
1230
1231 SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
1232 Operand: N->getOperand(Num: 0));
1233 --I;
1234 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1235 ++I;
1236 MadeChange = true;
1237 continue;
1238 }
1239 case ISD::FCEIL:
1240 case ISD::STRICT_FCEIL:
1241 case ISD::FFLOOR:
1242 case ISD::STRICT_FFLOOR:
1243 case ISD::FTRUNC:
1244 case ISD::STRICT_FTRUNC:
1245 case ISD::FROUNDEVEN:
1246 case ISD::STRICT_FROUNDEVEN:
1247 case ISD::FNEARBYINT:
1248 case ISD::STRICT_FNEARBYINT:
1249 case ISD::FRINT:
1250 case ISD::STRICT_FRINT: {
1251 // Replace fp rounding with their X86 specific equivalent so we don't
1252 // need 2 sets of patterns.
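      // The immediate is the SSE4.1/AVX-512 rounding control byte: bits 1:0
      // select the rounding mode (0 = nearest-even, 1 = floor, 2 = ceil,
      // 3 = truncate), bit 2 selects the MXCSR rounding mode instead, and
      // bit 3 suppresses precision (inexact) exceptions.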
1253 unsigned Imm;
1254 switch (N->getOpcode()) {
1255 default: llvm_unreachable("Unexpected opcode!");
1256 case ISD::STRICT_FCEIL:
1257 case ISD::FCEIL: Imm = 0xA; break;
1258 case ISD::STRICT_FFLOOR:
1259 case ISD::FFLOOR: Imm = 0x9; break;
1260 case ISD::STRICT_FTRUNC:
1261 case ISD::FTRUNC: Imm = 0xB; break;
1262 case ISD::STRICT_FROUNDEVEN:
1263 case ISD::FROUNDEVEN: Imm = 0x8; break;
1264 case ISD::STRICT_FNEARBYINT:
1265 case ISD::FNEARBYINT: Imm = 0xC; break;
1266 case ISD::STRICT_FRINT:
1267 case ISD::FRINT: Imm = 0x4; break;
1268 }
1269 SDLoc dl(N);
1270 bool IsStrict = N->isStrictFPOpcode();
1271 SDValue Res;
1272 if (IsStrict)
1273 Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl,
1274 ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
1275 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
1276 CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)});
1277 else
1278 Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0),
1279 N1: N->getOperand(Num: 0),
1280 N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32));
1281 --I;
1282 CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
1283 ++I;
1284 MadeChange = true;
1285 continue;
1286 }
1287 case X86ISD::FANDN:
1288 case X86ISD::FAND:
1289 case X86ISD::FOR:
1290 case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine?
1293 MVT VT = N->getSimpleValueType(ResNo: 0);
1294 if (VT.isVector() || VT == MVT::f128)
1295 break;
1296
1297 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1298 : VT == MVT::f32 ? MVT::v4f32
1299 : MVT::v8f16;
1300
1301 SDLoc dl(N);
1302 SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
1303 Operand: N->getOperand(Num: 0));
1304 SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
1305 Operand: N->getOperand(Num: 1));
1306
1307 SDValue Res;
1308 if (Subtarget->hasSSE2()) {
1309 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1310 Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0);
1311 Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1);
1312 unsigned Opc;
1313 switch (N->getOpcode()) {
1314 default: llvm_unreachable("Unexpected opcode!");
1315 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1316 case X86ISD::FAND: Opc = ISD::AND; break;
1317 case X86ISD::FOR: Opc = ISD::OR; break;
1318 case X86ISD::FXOR: Opc = ISD::XOR; break;
1319 }
1320 Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1);
1321 Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res);
1322 } else {
1323 Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1);
1324 }
1325 Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res,
1326 N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1327 --I;
1328 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
1329 ++I;
1330 MadeChange = true;
1331 continue;
1332 }
1333 }
1334
1335 if (OptLevel != CodeGenOptLevel::None &&
1336 // Only do this when the target can fold the load into the call or
1337 // jmp.
1338 !Subtarget->useIndirectThunkCalls() &&
1339 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1340 (N->getOpcode() == X86ISD::TC_RETURN &&
1341 (Subtarget->is64Bit() ||
1342 !getTargetMachine().isPositionIndependent())))) {
1343 /// Also try moving call address load from outside callseq_start to just
1344 /// before the call to allow it to be folded.
1345 ///
1346 /// [Load chain]
1347 /// ^
1348 /// |
1349 /// [Load]
1350 /// ^ ^
1351 /// | |
1352 /// / \--
1353 /// / |
1354 ///[CALLSEQ_START] |
1355 /// ^ |
1356 /// | |
1357 /// [LOAD/C2Reg] |
1358 /// | |
1359 /// \ /
1360 /// \ /
1361 /// [CALL]
1362 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1363 SDValue Chain = N->getOperand(Num: 0);
1364 SDValue Load = N->getOperand(Num: 1);
1365 if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq))
1366 continue;
1367 moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain);
1368 ++NumLoadMoved;
1369 MadeChange = true;
1370 continue;
1371 }
1372
    // Lower fpround and fpextend nodes that target the FP stack to be a store
    // and load to/from the stack. This is a gross hack. We would like to simply mark
1375 // these as being illegal, but when we do that, legalize produces these when
1376 // it expands calls, then expands these in the same legalize pass. We would
1377 // like dag combine to be able to hack on these between the call expansion
1378 // and the node legalization. As such this pass basically does "really
1379 // late" legalization of these inline with the X86 isel pass.
1380 // FIXME: This should only happen when not compiled with -O0.
1381 switch (N->getOpcode()) {
1382 default: continue;
1383 case ISD::FP_ROUND:
1384 case ISD::FP_EXTEND:
1385 {
1386 MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType();
1387 MVT DstVT = N->getSimpleValueType(ResNo: 0);
1388
1389 // If any of the sources are vectors, no fp stack involved.
1390 if (SrcVT.isVector() || DstVT.isVector())
1391 continue;
1392
1393 // If the source and destination are SSE registers, then this is a legal
1394 // conversion that should not be lowered.
1395 const X86TargetLowering *X86Lowering =
1396 static_cast<const X86TargetLowering *>(TLI);
1397 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
1398 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
1399 if (SrcIsSSE && DstIsSSE)
1400 continue;
1401
1402 if (!SrcIsSSE && !DstIsSSE) {
1403 // If this is an FPStack extension, it is a noop.
1404 if (N->getOpcode() == ISD::FP_EXTEND)
1405 continue;
1406 // If this is a value-preserving FPStack truncation, it is a noop.
1407 if (N->getConstantOperandVal(Num: 1))
1408 continue;
1409 }
1410
1411 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1412 // FPStack has extload and truncstore. SSE can fold direct loads into other
1413 // operations. Based on this, decide what we want to do.
1414 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1415 SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
1416 int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
1417 MachinePointerInfo MPI =
1418 MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
1419 SDLoc dl(N);
1420
1421 // FIXME: optimize the case where the src/dest is a load or store?
1422
1423 SDValue Store = CurDAG->getTruncStore(
1424 Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT);
1425 SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store,
1426 Ptr: MemTmp, PtrInfo: MPI, MemVT);
1427
1428 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the DAG because
1430 // anything below the conversion could be folded into other existing nodes.
1431 // To avoid invalidating 'I', back it up to the convert node.
1432 --I;
1433 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result);
1434 break;
1435 }
1436
    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
1439 case ISD::STRICT_FP_ROUND:
1440 case ISD::STRICT_FP_EXTEND:
1441 {
1442 MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType();
1443 MVT DstVT = N->getSimpleValueType(ResNo: 0);
1444
1445 // If any of the sources are vectors, no fp stack involved.
1446 if (SrcVT.isVector() || DstVT.isVector())
1447 continue;
1448
1449 // If the source and destination are SSE registers, then this is a legal
1450 // conversion that should not be lowered.
1451 const X86TargetLowering *X86Lowering =
1452 static_cast<const X86TargetLowering *>(TLI);
1453 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
1454 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
1455 if (SrcIsSSE && DstIsSSE)
1456 continue;
1457
1458 if (!SrcIsSSE && !DstIsSSE) {
1459 // If this is an FPStack extension, it is a noop.
1460 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1461 continue;
1462 // If this is a value-preserving FPStack truncation, it is a noop.
1463 if (N->getConstantOperandVal(Num: 2))
1464 continue;
1465 }
1466
1467 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1468 // FPStack has extload and truncstore. SSE can fold direct loads into other
1469 // operations. Based on this, decide what we want to do.
1470 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1471 SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
1472 int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
1473 MachinePointerInfo MPI =
1474 MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
1475 SDLoc dl(N);
1476
1477 // FIXME: optimize the case where the src/dest is a load or store?
1478
      // Since the operation is StrictFP, use the preexisting chain.
1480 SDValue Store, Result;
1481 if (!SrcIsSSE) {
1482 SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
1483 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp};
1484 Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT,
1485 PtrInfo: MPI, /*Align*/ Alignment: std::nullopt,
1486 Flags: MachineMemOperand::MOStore);
1487 if (N->getFlags().hasNoFPExcept()) {
1488 SDNodeFlags Flags = Store->getFlags();
1489 Flags.setNoFPExcept(true);
1490 Store->setFlags(Flags);
1491 }
1492 } else {
1493 assert(SrcVT == MemVT && "Unexpected VT!");
1494 Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp,
1495 PtrInfo: MPI);
1496 }
1497
1498 if (!DstIsSSE) {
1499 SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other);
1500 SDValue Ops[] = {Store, MemTmp};
1501 Result = CurDAG->getMemIntrinsicNode(
1502 Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI,
1503 /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad);
1504 if (N->getFlags().hasNoFPExcept()) {
1505 SDNodeFlags Flags = Result->getFlags();
1506 Flags.setNoFPExcept(true);
1507 Result->setFlags(Flags);
1508 }
1509 } else {
1510 assert(DstVT == MemVT && "Unexpected VT!");
1511 Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI);
1512 }
1513
1514 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the DAG because
1516 // anything below the conversion could be folded into other existing nodes.
1517 // To avoid invalidating 'I', back it up to the convert node.
1518 --I;
1519 CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode());
1520 break;
1521 }
1522 }
1523
1524
1525 // Now that we did that, the node is dead. Increment the iterator to the
1526 // next node to process, then delete N.
1527 ++I;
1528 MadeChange = true;
1529 }
1530
1531 // Remove any dead nodes that may have been left behind.
1532 if (MadeChange)
1533 CurDAG->RemoveDeadNodes();
1534}
1535
1536// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
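// Illustrative pattern (a sketch; t0-t2 are placeholder values):
//   t0 = MOVZX32rr8_NOREX ...          // extend emitted for the 8-bit divrem
//   t1 = EXTRACT_SUBREG t0, sub_8bit
//   t2 = MOVZX32rr8 t1                 // redundant, t0 already holds the value
// All uses of t2 can simply be replaced with t0.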
1537bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1538 unsigned Opc = N->getMachineOpcode();
1539 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1540 Opc != X86::MOVSX64rr8)
1541 return false;
1542
1543 SDValue N0 = N->getOperand(Num: 0);
1544
1545 // We need to be extracting the lower bit of an extend.
1546 if (!N0.isMachineOpcode() ||
1547 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1548 N0.getConstantOperandVal(i: 1) != X86::sub_8bit)
1549 return false;
1550
1551 // We're looking for either a movsx or movzx to match the original opcode.
1552 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1553 : X86::MOVSX32rr8_NOREX;
1554 SDValue N00 = N0.getOperand(i: 0);
1555 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1556 return false;
1557
1558 if (Opc == X86::MOVSX64rr8) {
    // We had a sign extend from 8 to 64 bits, so we still need to go from 32
    // to 64 bits.
1561 MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N),
1562 VT: MVT::i64, Op1: N00);
1563 ReplaceUses(F: N, T: Extend);
1564 } else {
    // OK, we can drop this extend and just use the original extend.
1566 ReplaceUses(F: N, T: N00.getNode());
1567 }
1568
1569 return true;
1570}
1571
1572void X86DAGToDAGISel::PostprocessISelDAG() {
1573 // Skip peepholes at -O0.
1574 if (TM.getOptLevel() == CodeGenOptLevel::None)
1575 return;
1576
1577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1578
1579 bool MadeChange = false;
1580 while (Position != CurDAG->allnodes_begin()) {
1581 SDNode *N = &*--Position;
1582 // Skip dead nodes and any non-machine opcodes.
1583 if (N->use_empty() || !N->isMachineOpcode())
1584 continue;
1585
1586 if (tryOptimizeRem8Extend(N)) {
1587 MadeChange = true;
1588 continue;
1589 }
1590
1591 unsigned Opc = N->getMachineOpcode();
1592 switch (Opc) {
1593 default:
1594 continue;
    // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1596 case X86::TEST8rr:
1597 case X86::TEST16rr:
1598 case X86::TEST32rr:
1599 case X86::TEST64rr:
1600 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1601 case X86::CTEST8rr:
1602 case X86::CTEST16rr:
1603 case X86::CTEST32rr:
1604 case X86::CTEST64rr: {
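      // Illustrative pattern (a sketch; t0/t1 are placeholder values):
      //   t0 = AND32rr a, b            // EFLAGS result of the AND is unused
      //   t1 = TEST32rr t0, t0
      // becomes t1 = TEST32rr a, b. The memory form folds the load instead,
      // e.g. AND32rm + TEST32rr -> TEST32mr.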
1605 auto &Op0 = N->getOperand(Num: 0);
1606 if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) ||
1607 !Op0.isMachineOpcode())
1608 continue;
1609 SDValue And = N->getOperand(Num: 0);
1610#define CASE_ND(OP) \
1611 case X86::OP: \
1612 case X86::OP##_ND:
1613 switch (And.getMachineOpcode()) {
1614 default:
1615 continue;
1616 CASE_ND(AND8rr)
1617 CASE_ND(AND16rr)
1618 CASE_ND(AND32rr)
1619 CASE_ND(AND64rr) {
1620 if (And->hasAnyUseOfValue(Value: 1))
1621 continue;
1622 SmallVector<SDValue> Ops(N->op_values());
1623 Ops[0] = And.getOperand(i: 0);
1624 Ops[1] = And.getOperand(i: 1);
1625 MachineSDNode *Test =
1626 CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops);
1627 ReplaceUses(F: N, T: Test);
1628 MadeChange = true;
1629 continue;
1630 }
1631 CASE_ND(AND8rm)
1632 CASE_ND(AND16rm)
1633 CASE_ND(AND32rm)
1634 CASE_ND(AND64rm) {
1635 if (And->hasAnyUseOfValue(Value: 1))
1636 continue;
1637 unsigned NewOpc;
1638 bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc);
1639#define FROM_TO(A, B) \
1640 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1641 break;
1642 switch (And.getMachineOpcode()) {
1643 FROM_TO(AND8rm, TEST8mr);
1644 FROM_TO(AND16rm, TEST16mr);
1645 FROM_TO(AND32rm, TEST32mr);
1646 FROM_TO(AND64rm, TEST64mr);
1647 }
1648#undef FROM_TO
1649#undef CASE_ND
1650 // Need to swap the memory and register operand.
1651 SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2),
1652 And.getOperand(i: 3), And.getOperand(i: 4),
1653 And.getOperand(i: 5), And.getOperand(i: 0)};
1654 // CC, Cflags.
1655 if (IsCTESTCC) {
1656 Ops.push_back(Elt: N->getOperand(Num: 2));
1657 Ops.push_back(Elt: N->getOperand(Num: 3));
1658 }
1659 // Chain of memory load
1660 Ops.push_back(Elt: And.getOperand(i: 6));
1661 // Glue
1662 if (IsCTESTCC)
1663 Ops.push_back(Elt: N->getOperand(Num: 4));
1664
1665 MachineSDNode *Test = CurDAG->getMachineNode(
1666 Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops);
1667 CurDAG->setNodeMemRefs(
1668 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
1669 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1670 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1671 MadeChange = true;
1672 continue;
1673 }
1674 }
1675 }
1676 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1677 // used. We're doing this late so we can prefer to fold the AND into masked
1678 // comparisons. Doing that can be better for the live range of the mask
1679 // register.
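    // Illustrative pattern (a sketch; t0/t1 are placeholder values):
    //   t0 = KANDWkk a, b
    //   t1 = KORTESTWkk t0, t0         // only ZF is consumed
    // becomes t1 = KTESTWkk a, b.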
1680 case X86::KORTESTBkk:
1681 case X86::KORTESTWkk:
1682 case X86::KORTESTDkk:
1683 case X86::KORTESTQkk: {
1684 SDValue Op0 = N->getOperand(Num: 0);
1685 if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) ||
1686 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0)))
1687 continue;
1688#define CASE(A) \
1689 case X86::A: \
1690 break;
1691 switch (Op0.getMachineOpcode()) {
1692 default:
1693 continue;
1694 CASE(KANDBkk)
1695 CASE(KANDWkk)
1696 CASE(KANDDkk)
1697 CASE(KANDQkk)
1698 }
1699 unsigned NewOpc;
1700#define FROM_TO(A, B) \
1701 case X86::A: \
1702 NewOpc = X86::B; \
1703 break;
1704 switch (Opc) {
1705 FROM_TO(KORTESTBkk, KTESTBkk)
1706 FROM_TO(KORTESTWkk, KTESTWkk)
1707 FROM_TO(KORTESTDkk, KTESTDkk)
1708 FROM_TO(KORTESTQkk, KTESTQkk)
1709 }
1710 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1711 // KAND instructions and KTEST use the same ISA feature.
1712 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1713 continue;
1714#undef FROM_TO
1715 MachineSDNode *KTest = CurDAG->getMachineNode(
1716 Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1));
1717 ReplaceUses(F: N, T: KTest);
1718 MadeChange = true;
1719 continue;
1720 }
    // Attempt to remove vector moves that were inserted to zero upper bits.
1722 case TargetOpcode::SUBREG_TO_REG: {
1723 unsigned SubRegIdx = N->getConstantOperandVal(Num: 2);
1724 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1725 continue;
1726
1727 SDValue Move = N->getOperand(Num: 1);
1728 if (!Move.isMachineOpcode())
1729 continue;
1730
      // Make sure it's one of the move opcodes we recognize.
1732 switch (Move.getMachineOpcode()) {
1733 default:
1734 continue;
1735 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1736 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1737 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1738 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1739 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1740 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1741 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1742 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1743 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1744 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1745 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1746 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1747 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1748 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1749 }
1750#undef CASE
1751
1752 SDValue In = Move.getOperand(i: 0);
1753 if (!In.isMachineOpcode() ||
1754 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1755 continue;
1756
      // Make sure the instruction has a VEX, XOP, or EVEX prefix. This
      // excludes legacy-encoded instructions such as SHA, which do not zero
      // the upper bits.
1759 uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags;
1760 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1761 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1763 continue;
1764
      // The producing instruction is another vector instruction, so we can
      // drop the move.
1767 CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2));
1768 MadeChange = true;
1769 }
1770 }
1771 }
1772
1773 if (MadeChange)
1774 CurDAG->RemoveDeadNodes();
1775}
1776
1777
1778/// Emit any code that needs to be executed only in the main function.
1779void X86DAGToDAGISel::emitSpecialCodeForMain() {
1780 if (Subtarget->isTargetCygMing()) {
1781 TargetLowering::ArgListTy Args;
1782 auto &DL = CurDAG->getDataLayout();
1783
1784 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1785 CLI.setChain(CurDAG->getRoot())
1786 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1787 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1788 ArgsList: std::move(Args));
1789 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1790 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1791 CurDAG->setRoot(Result.second);
1792 }
1793}
1794
1795void X86DAGToDAGISel::emitFunctionEntryCode() {
1796 // If this is main, emit special code for main.
1797 const Function &F = MF->getFunction();
1798 if (F.hasExternalLinkage() && F.getName() == "main")
1799 emitSpecialCodeForMain();
1800}
1801
1802static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1803 // We can run into an issue where a frame index or a register base
1804 // includes a displacement that, when added to the explicit displacement,
1805 // will overflow the displacement field. Assuming that the
1806 // displacement fits into a 31-bit integer (which is only slightly more
1807 // aggressive than the current fundamental assumption that it fits into
1808 // a 32-bit integer), a 31-bit disp should always be safe.
1809 return isInt<31>(x: Val);
1810}
1811
1812bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1813 X86ISelAddressMode &AM) {
1814 // We may have already matched a displacement and the caller just added the
1815 // symbolic displacement. So we still need to do the checks even if Offset
1816 // is zero.
1817
1818 int64_t Val = AM.Disp + Offset;
1819
1820 // Cannot combine ExternalSymbol displacements with integer offsets.
1821 if (Val != 0 && (AM.ES || AM.MCSym))
1822 return true;
1823
1824 CodeModel::Model M = TM.getCodeModel();
1825 if (Subtarget->is64Bit()) {
1826 if (Val != 0 &&
1827 !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
1828 hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
1829 return true;
1830 // In addition to the checks required for a register base, check that
1831 // we do not try to use an unsafe Disp with a frame index.
1832 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1833 !isDispSafeForFrameIndexOrRegBase(Val))
1834 return true;
1835 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1836 // 64 bits. Instructions with 32-bit register addresses perform this zero
1837 // extension for us and we can safely ignore the high bits of Offset.
1838 // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only the low 2GB of the address space
    // is directly addressable; we need indirect addressing for the high 2GB
    // of the address space.
1842 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1843 // implicit zero extension of instructions would cover up any problem.
1844 // However, we have asserts elsewhere that get triggered if we do, so keep
1845 // the checks for now.
1846 // TODO: We would actually be able to accept these, as well as the same
1847 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1848 // to get an address size override to be emitted. However, this
1849 // pseudo-register is not part of any register class and therefore causes
1850 // MIR verification to fail.
1851 if (Subtarget->isTarget64BitILP32() &&
1852 !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1855 } else if (Subtarget->is16Bit()) {
1856 // In 16-bit mode, displacements are limited to [-65535,65535] for FK_Data_2
1857 // fixups of unknown signedness. See X86AsmBackend::applyFixup.
1858 if (Val < -(int64_t)UINT16_MAX || Val > (int64_t)UINT16_MAX)
1859 return true;
1860 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1861 // For 32-bit X86, make sure the displacement still isn't close to the
1862 // expressible limit.
1863 return true;
1864 AM.Disp = Val;
1865 return false;
1866}
1867
1868bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1869 bool AllowSegmentRegForX32) {
1870 SDValue Address = N->getOperand(Num: 1);
1871
1872 // load gs:0 -> GS segment register.
1873 // load fs:0 -> FS segment register.
1874 //
1875 // This optimization is generally valid because the GNU TLS model defines that
1876 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1877 // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added to the base address, which gives
1879 // unwanted results when the register holds a negative value.
1880 // For more information see http://people.redhat.com/drepper/tls.pdf
1881 if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
1882 !IndirectTlsSegRefs &&
1883 (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
1884 Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
1885 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1886 return true;
1887 switch (N->getPointerInfo().getAddrSpace()) {
1888 case X86AS::GS:
1889 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
1890 return false;
1891 case X86AS::FS:
1892 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
1893 return false;
1894 // Address space X86AS::SS is not handled here, because it is not used to
1895 // address TLS areas.
1896 }
1897 }
1898
1899 return true;
1900}
1901
1902/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1903/// mode. These wrap things that will resolve down into a symbol reference.
1904/// If no match is possible, this returns true, otherwise it returns false.
1905bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1906 // If the addressing mode already has a symbol as the displacement, we can
1907 // never match another symbol.
1908 if (AM.hasSymbolicDisplacement())
1909 return true;
1910
1911 bool IsRIPRelTLS = false;
1912 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1913 if (IsRIPRel) {
1914 SDValue Val = N.getOperand(i: 0);
1915 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1916 IsRIPRelTLS = true;
1917 }
1918
1919 // We can't use an addressing mode in the 64-bit large code model.
1920 // Global TLS addressing is an exception. In the medium code model,
  // we can use such a mode when RIP wrappers are present.
1922 // That signifies access to globals that are known to be "near",
1923 // such as the GOT itself.
1924 CodeModel::Model M = TM.getCodeModel();
1925 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1926 return true;
1927
1928 // Base and index reg must be 0 in order to use %rip as base.
1929 if (IsRIPRel && AM.hasBaseOrIndexReg())
1930 return true;
1931
1932 // Make a local copy in case we can't do this fold.
1933 X86ISelAddressMode Backup = AM;
1934
1935 int64_t Offset = 0;
1936 SDValue N0 = N.getOperand(i: 0);
1937 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
1938 AM.GV = G->getGlobal();
1939 AM.SymbolFlags = G->getTargetFlags();
1940 Offset = G->getOffset();
1941 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
1942 AM.CP = CP->getConstVal();
1943 AM.Alignment = CP->getAlign();
1944 AM.SymbolFlags = CP->getTargetFlags();
1945 Offset = CP->getOffset();
1946 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
1947 AM.ES = S->getSymbol();
1948 AM.SymbolFlags = S->getTargetFlags();
1949 } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
1950 AM.MCSym = S->getMCSymbol();
1951 } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
1952 AM.JT = J->getIndex();
1953 AM.SymbolFlags = J->getTargetFlags();
1954 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
1955 AM.BlockAddr = BA->getBlockAddress();
1956 AM.SymbolFlags = BA->getTargetFlags();
1957 Offset = BA->getOffset();
1958 } else
1959 llvm_unreachable("Unhandled symbol reference node.");
1960
1961 // Can't use an addressing mode with large globals.
1962 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1963 TM.isLargeGlobalValue(GV: AM.GV)) {
1964 AM = Backup;
1965 return true;
1966 }
1967
1968 if (foldOffsetIntoAddress(Offset, AM)) {
1969 AM = Backup;
1970 return true;
1971 }
1972
1973 if (IsRIPRel)
1974 AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64));
1975
1976 // Commit the changes now that we know this fold is safe.
1977 return false;
1978}
1979
1980/// Add the specified node to the specified addressing mode, returning true if
1981/// it cannot be done. This just pattern matches for the addressing mode.
1982bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1983 if (matchAddressRecursively(N, AM, Depth: 0))
1984 return true;
1985
1986 // Post-processing: Make a second attempt to fold a load, if we now know
1987 // that there will not be any other register. This is only performed for
1988 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1989 // any foldable load the first time.
1990 if (Subtarget->isTarget64BitILP32() &&
1991 AM.BaseType == X86ISelAddressMode::RegBase &&
1992 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1993 SDValue Save_Base_Reg = AM.Base_Reg;
1994 if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
1995 AM.Base_Reg = SDValue();
1996 if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
1997 AM.Base_Reg = Save_Base_Reg;
1998 }
1999 }
2000
2001 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
2002 // a smaller encoding and avoids a scaled-index.
2003 if (AM.Scale == 2 &&
2004 AM.BaseType == X86ISelAddressMode::RegBase &&
2005 AM.Base_Reg.getNode() == nullptr) {
2006 AM.Base_Reg = AM.IndexReg;
2007 AM.Scale = 1;
2008 }
2009
2010 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2011 // because it has a smaller encoding.
2012 if (TM.getCodeModel() != CodeModel::Large &&
2013 (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
2014 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2015 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2016 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2017 // However, when GV is a local function symbol and in the same section as
2018 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2019 // referencing GV+Disp generates a relocation referencing the section symbol
2020 // with an even smaller offset, which might underflow. We should bail out if
2021 // the negative offset is too close to INT32_MIN. Actually, we are more
2022 // conservative here, using a smaller magic number also used by
2023 // isOffsetSuitableForCodeModel.
2024 if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024)
2025 return true;
2026
2027 AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64);
2028 }
2029
2030 return false;
2031}
2032
2033bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2034 unsigned Depth) {
2035 // Add an artificial use to this node so that we can keep track of
2036 // it if it gets CSE'd with a different node.
2037 HandleSDNode Handle(N);
2038
2039 X86ISelAddressMode Backup = AM;
2040 if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
2041 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
2042 return false;
2043 AM = Backup;
2044
  // Try again after commuting the operands.
2046 if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2047 Depth: Depth + 1) &&
2048 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
2049 return false;
2050 AM = Backup;
2051
2052 // If we couldn't fold both operands into the address at the same time,
2053 // see if we can just put each operand into a register and fold at least
2054 // the add.
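  // Illustrative fallback (a sketch): for a plain (add x, y) where neither
  // operand folds any further, the match becomes base = x, index = y with
  // scale 1.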
2055 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2056 !AM.Base_Reg.getNode() &&
2057 !AM.IndexReg.getNode()) {
2058 N = Handle.getValue();
2059 AM.Base_Reg = N.getOperand(i: 0);
2060 AM.IndexReg = N.getOperand(i: 1);
2061 AM.Scale = 1;
2062 return false;
2063 }
2064 N = Handle.getValue();
2065 return true;
2066}
2067
2068// Insert a node into the DAG at least before the Pos node's position. This
2069// will reposition the node as needed, and will assign it a node ID that is <=
2070// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2071// IDs! The selection DAG must no longer depend on their uniqueness when this
2072// is used.
2073static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2074 if (N->getNodeId() == -1 ||
2075 (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
2076 SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
2077 DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
    // Mark Node as invalid for pruning, since after this it may be a
    // successor to a selected node but otherwise be in the same position as
    // Pos. Conservatively mark it with the same -abs(Id) to ensure the node
    // id invariant is preserved.
2082 N->setNodeId(Pos->getNodeId());
2083 SelectionDAGISel::InvalidateNodeId(N: N.getNode());
2084 }
2085}
2086
2087// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2088// safe. This allows us to convert the shift and and into an h-register
2089// extract and a scaled index. Returns false if the simplification is
2090// performed.
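// Illustrative example (a sketch): with C1 == 2, (X >> 6) & 0x3fc becomes
// ((X >> 8) & 0xff) << 2, so the zero-extended (X >> 8) & 0xff is used as the
// index register with scale 4.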
2091static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2092 uint64_t Mask,
2093 SDValue Shift, SDValue X,
2094 X86ISelAddressMode &AM) {
2095 if (Shift.getOpcode() != ISD::SRL ||
2096 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2097 !Shift.hasOneUse())
2098 return true;
2099
2100 int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
2101 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2102 Mask != (0xffu << ScaleLog))
2103 return true;
2104
2105 MVT XVT = X.getSimpleValueType();
2106 MVT VT = N.getSimpleValueType();
2107 SDLoc DL(N);
2108 SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8);
2109 SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
2110 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
2111 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
2112 SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
2113 SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8);
2114 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);
2115
2116 // Insert the new nodes into the topological ordering. We must do this in
2117 // a valid topological ordering as nothing is going to go back and re-sort
2118 // these nodes. We continually insert before 'N' in sequence as this is
2119 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2120 // hierarchy left to express.
2121 insertDAGNode(DAG, Pos: N, N: Eight);
2122 insertDAGNode(DAG, Pos: N, N: NewMask);
2123 insertDAGNode(DAG, Pos: N, N: Srl);
2124 insertDAGNode(DAG, Pos: N, N: And);
2125 insertDAGNode(DAG, Pos: N, N: Ext);
2126 insertDAGNode(DAG, Pos: N, N: ShlCount);
2127 insertDAGNode(DAG, Pos: N, N: Shl);
2128 DAG.ReplaceAllUsesWith(From: N, To: Shl);
2129 DAG.RemoveDeadNode(N: N.getNode());
2130 AM.IndexReg = Ext;
2131 AM.Scale = (1 << ScaleLog);
2132 return false;
2133}
2134
2135// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2136// allows us to fold the shift into this addressing mode. Returns false if the
2137// transform succeeded.
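// Illustrative example (a sketch): (X << 3) & 0xf8 becomes (X & 0x1f) << 3,
// so the shift is absorbed as scale 8 and the remaining AND becomes the index
// register.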
2138static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2139 X86ISelAddressMode &AM) {
2140 SDValue Shift = N.getOperand(i: 0);
2141
2142 // Use a signed mask so that shifting right will insert sign bits. These
2143 // bits will be removed when we shift the result left so it doesn't matter
2144 // what we use. This might allow a smaller immediate encoding.
2145 int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();
2146
2147 // If we have an any_extend feeding the AND, look through it to see if there
2148 // is a shift behind it. But only if the AND doesn't use the extended bits.
2149 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2150 bool FoundAnyExtend = false;
2151 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2152 Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
2153 isUInt<32>(x: Mask)) {
2154 FoundAnyExtend = true;
2155 Shift = Shift.getOperand(i: 0);
2156 }
2157
2158 if (Shift.getOpcode() != ISD::SHL ||
2159 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2160 return true;
2161
2162 SDValue X = Shift.getOperand(i: 0);
2163
2164 // Not likely to be profitable if either the AND or SHIFT node has more
2165 // than one use (unless all uses are for address computation). Besides,
2166 // isel mechanism requires their node ids to be reused.
2167 if (!N.hasOneUse() || !Shift.hasOneUse())
2168 return true;
2169
2170 // Verify that the shift amount is something we can fold.
2171 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2172 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2173 return true;
2174
2175 MVT VT = N.getSimpleValueType();
2176 SDLoc DL(N);
2177 if (FoundAnyExtend) {
2178 SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
2179 insertDAGNode(DAG, Pos: N, N: NewX);
2180 X = NewX;
2181 }
2182
2183 SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT);
2184 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
2185 SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));
2186
2187 // Insert the new nodes into the topological ordering. We must do this in
2188 // a valid topological ordering as nothing is going to go back and re-sort
2189 // these nodes. We continually insert before 'N' in sequence as this is
2190 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2191 // hierarchy left to express.
2192 insertDAGNode(DAG, Pos: N, N: NewMask);
2193 insertDAGNode(DAG, Pos: N, N: NewAnd);
2194 insertDAGNode(DAG, Pos: N, N: NewShift);
2195 DAG.ReplaceAllUsesWith(From: N, To: NewShift);
2196 DAG.RemoveDeadNode(N: N.getNode());
2197
2198 AM.Scale = 1 << ShiftAmt;
2199 AM.IndexReg = NewAnd;
2200 return false;
2201}
2202
2203// Implement some heroics to detect shifts of masked values where the mask can
2204// be replaced by extending the shift and undoing that in the addressing mode
2205// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2206// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2207// the addressing mode. This results in code such as:
2208//
2209// int f(short *y, int *lookup_table) {
2210// ...
2211// return *y + lookup_table[*y >> 11];
2212// }
2213//
2214// Turning into:
2215// movzwl (%rdi), %eax
2216// movl %eax, %ecx
2217// shrl $11, %ecx
2218// addl (%rsi,%rcx,4), %eax
2219//
2220// Instead of:
2221// movzwl (%rdi), %eax
2222// movl %eax, %ecx
2223// shrl $9, %ecx
2224// andl $124, %rcx
2225// addl (%rsi,%rcx), %eax
2226//
2227// Note that this function assumes the mask is provided as a mask *after* the
2228// value is shifted. The input chain may or may not match that, but computing
2229// such a mask is trivial.
2230static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2231 uint64_t Mask,
2232 SDValue Shift, SDValue X,
2233 X86ISelAddressMode &AM) {
2234 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2235 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2236 return true;
2237
  // We need to ensure that the mask is a contiguous run of bits.
2239 unsigned MaskIdx, MaskLen;
2240 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2241 return true;
2242 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2243
2244 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2245
2246 // The amount of shift we're trying to fit into the addressing mode is taken
2247 // from the shifted mask index (number of trailing zeros of the mask).
2248 unsigned AMShiftAmt = MaskIdx;
2249
2250 // There is nothing we can do here unless the mask is removing some bits.
2251 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2252 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2253
2254 // Scale the leading zero count down based on the actual size of the value.
2255 // Also scale it down based on the size of the shift.
2256 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2257 if (MaskLZ < ScaleDown)
2258 return true;
2259 MaskLZ -= ScaleDown;
2260
2261 // The final check is to ensure that any masked out high bits of X are
2262 // already known to be zero. Otherwise, the mask has a semantic impact
2263 // other than masking out a couple of low bits. Unfortunately, because of
2264 // the mask, zero extensions will be removed from operands in some cases.
2265 // This code works extra hard to look through extensions because we can
2266 // replace them with zero extensions cheaply if necessary.
2267 bool ReplacingAnyExtend = false;
2268 if (X.getOpcode() == ISD::ANY_EXTEND) {
2269 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2270 X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
2271 // Assume that we'll replace the any-extend with a zero-extend, and
2272 // narrow the search to the extended value.
2273 X = X.getOperand(i: 0);
2274 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2275 ReplacingAnyExtend = true;
2276 }
2277 APInt MaskedHighBits =
2278 APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
2279 if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
2280 return true;
2281
2282 // We've identified a pattern that can be transformed into a single shift
2283 // and an addressing mode. Make it so.
2284 MVT VT = N.getSimpleValueType();
2285 if (ReplacingAnyExtend) {
2286 assert(X.getValueType() != VT);
2287 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2288 SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
2289 insertDAGNode(DAG, Pos: N, N: NewX);
2290 X = NewX;
2291 }
2292
2293 MVT XVT = X.getSimpleValueType();
2294 SDLoc DL(N);
2295 SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
2296 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2297 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
2298 SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
2299 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2300
2301 // Insert the new nodes into the topological ordering. We must do this in
2302 // a valid topological ordering as nothing is going to go back and re-sort
2303 // these nodes. We continually insert before 'N' in sequence as this is
2304 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2305 // hierarchy left to express.
2306 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2307 insertDAGNode(DAG, Pos: N, N: NewSRL);
2308 insertDAGNode(DAG, Pos: N, N: NewExt);
2309 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2310 insertDAGNode(DAG, Pos: N, N: NewSHL);
2311 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2312 DAG.RemoveDeadNode(N: N.getNode());
2313
2314 AM.Scale = 1 << AMShiftAmt;
2315 AM.IndexReg = NewExt;
2316 return false;
2317}
2318
2319// Transform "(X >> SHIFT) & (MASK << C1)" to
2320// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2321// matched to a BEXTR later. Returns false if the simplification is performed.
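// Illustrative example (a sketch, assuming a subtarget where BEXTR is
// profitable): (X >> 5) & (0xff << 2) becomes ((X >> 7) & 0xff) << 2, so the
// SRL+AND can later be matched to BEXTR and the SHL is absorbed as scale 4.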
2322static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2323 uint64_t Mask,
2324 SDValue Shift, SDValue X,
2325 X86ISelAddressMode &AM,
2326 const X86Subtarget &Subtarget) {
2327 if (Shift.getOpcode() != ISD::SRL ||
2328 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2329 !Shift.hasOneUse() || !N.hasOneUse())
2330 return true;
2331
2332 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2333 if (!Subtarget.hasTBM() &&
2334 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2335 return true;
2336
  // We need to ensure that the mask is a contiguous run of bits.
2338 unsigned MaskIdx, MaskLen;
2339 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2340 return true;
2341
2342 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2343
2344 // The amount of shift we're trying to fit into the addressing mode is taken
2345 // from the shifted mask index (number of trailing zeros of the mask).
2346 unsigned AMShiftAmt = MaskIdx;
2347
2348 // There is nothing we can do here unless the mask is removing some bits.
2349 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2350 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2351
2352 MVT XVT = X.getSimpleValueType();
2353 MVT VT = N.getSimpleValueType();
2354 SDLoc DL(N);
2355 SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
2356 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2357 SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
2358 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
2359 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
2360 SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
2361 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2362
2363 // Insert the new nodes into the topological ordering. We must do this in
2364 // a valid topological ordering as nothing is going to go back and re-sort
2365 // these nodes. We continually insert before 'N' in sequence as this is
2366 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2367 // hierarchy left to express.
2368 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2369 insertDAGNode(DAG, Pos: N, N: NewSRL);
2370 insertDAGNode(DAG, Pos: N, N: NewMask);
2371 insertDAGNode(DAG, Pos: N, N: NewAnd);
2372 insertDAGNode(DAG, Pos: N, N: NewExt);
2373 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2374 insertDAGNode(DAG, Pos: N, N: NewSHL);
2375 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2376 DAG.RemoveDeadNode(N: N.getNode());
2377
2378 AM.Scale = 1 << AMShiftAmt;
2379 AM.IndexReg = NewExt;
2380 return false;
2381}
2382
2383// Attempt to peek further into a scaled index register, collecting additional
// extensions / offsets / etc. Returns \p N if we can't peek any further.
2385SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2386 X86ISelAddressMode &AM,
2387 unsigned Depth) {
2388 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2389 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2390 "Illegal index scale");
2391
2392 // Limit recursion.
2393 if (Depth >= SelectionDAG::MaxRecursionDepth)
2394 return N;
2395
2396 EVT VT = N.getValueType();
2397 unsigned Opc = N.getOpcode();
2398
2399 // index: add(x,c) -> index: x, disp + c
2400 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2401 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2402 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2403 if (!foldOffsetIntoAddress(Offset, AM))
2404 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2405 }
2406
2407 // index: add(x,x) -> index: x, scale * 2
2408 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2409 if (AM.Scale <= 4) {
2410 AM.Scale *= 2;
2411 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2412 }
2413 }
2414
2415 // index: shl(x,i) -> index: x, scale * (1 << i)
2416 if (Opc == X86ISD::VSHLI) {
2417 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2418 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2419 if ((AM.Scale * ScaleAmt) <= 8) {
2420 AM.Scale *= ScaleAmt;
2421 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2422 }
2423 }
2424
2425 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2426 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2427 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2428 SDValue Src = N.getOperand(i: 0);
2429 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2430 Src.hasOneUse()) {
2431 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2432 SDValue AddSrc = Src.getOperand(i: 0);
2433 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2434 int64_t Offset = AddVal->getSExtValue();
2435 if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) {
2436 SDLoc DL(N);
2437 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2438 SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT);
2439 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2440 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2441 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2442 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2443 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2444 CurDAG->RemoveDeadNode(N: N.getNode());
2445 return ExtSrc;
2446 }
2447 }
2448 }
2449 }
2450
2451 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2452 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2454 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2455 SDValue Src = N.getOperand(i: 0);
2456 unsigned SrcOpc = Src.getOpcode();
2457 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2458 CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) &&
2459 Src.hasOneUse()) {
2460 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2461 SDValue AddSrc = Src.getOperand(i: 0);
2462 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2463 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2464 SDLoc DL(N);
2465 SDValue Res;
2466 // If we're also scaling, see if we can use that as well.
2467 if (AddSrc.getOpcode() == ISD::SHL &&
2468 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2469 SDValue ShVal = AddSrc.getOperand(i: 0);
2470 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2471 APInt HiBits =
2472 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2473 uint64_t ScaleAmt = 1ULL << ShAmt;
2474 if ((AM.Scale * ScaleAmt) <= 8 &&
2475 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2476 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2477 AM.Scale *= ScaleAmt;
2478 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2479 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2480 N2: AddSrc.getOperand(i: 1));
2481 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2482 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2483 AddSrc = ExtShift;
2484 Res = ExtShVal;
2485 }
2486 }
2487 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2488 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2489 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2490 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2491 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2492 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2493 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2494 CurDAG->RemoveDeadNode(N: N.getNode());
2495 return Res ? Res : ExtSrc;
2496 }
2497 }
2498 }
2499 }
2500
2501 // TODO: Handle extensions, shifted masks etc.
2502 return N;
2503}
2504
2505bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2506 unsigned Depth) {
2507 LLVM_DEBUG({
2508 dbgs() << "MatchAddress: ";
2509 AM.dump(CurDAG);
2510 });
2511 // Limit recursion.
2512 if (Depth >= SelectionDAG::MaxRecursionDepth)
2513 return matchAddressBase(N, AM);
2514
2515 // If this is already a %rip relative address, we can only merge immediates
2516 // into it. Instead of handling this in every case, we handle it here.
2517 // RIP relative addressing: %rip + 32-bit displacement!
2518 if (AM.isRIPRelative()) {
2519 // FIXME: JumpTable and ExternalSymbol address currently don't like
2520 // displacements. It isn't very important, but this should be fixed for
2521 // consistency.
2522 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2523 return true;
2524
2525 if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
2526 if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
2527 return false;
2528 return true;
2529 }
2530
2531 switch (N.getOpcode()) {
2532 default: break;
2533 case ISD::LOCAL_RECOVER: {
2534 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2535 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
2536 // Use the symbol and don't prefix it.
2537 AM.MCSym = ESNode->getMCSymbol();
2538 return false;
2539 }
2540 break;
2541 }
2542 case ISD::Constant: {
2543 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2544 if (!foldOffsetIntoAddress(Offset: Val, AM))
2545 return false;
2546 break;
2547 }
2548
2549 case X86ISD::Wrapper:
2550 case X86ISD::WrapperRIP:
2551 if (!matchWrapper(N, AM))
2552 return false;
2553 break;
2554
2555 case ISD::LOAD:
2556 if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
2557 return false;
2558 break;
2559
2560 case ISD::FrameIndex:
2561 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2562 AM.Base_Reg.getNode() == nullptr &&
2563 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) {
2564 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2565 AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
2566 return false;
2567 }
2568 break;
2569
2570 case ISD::SHL:
2571 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2572 break;
2573
2574 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
2575 unsigned Val = CN->getZExtValue();
2576 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2577 // that the base operand remains free for further matching. If
2578 // the base doesn't end up getting used, a post-processing step
2579 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2580 if (Val == 1 || Val == 2 || Val == 3) {
2581 SDValue ShVal = N.getOperand(i: 0);
2582 AM.Scale = 1 << Val;
2583 AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
2584 return false;
2585 }
2586 }
2587 break;
2588
2589 case ISD::SRL: {
2590 // Scale must not be used already.
2591 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2592
2593 // We only handle up to 64-bit values here as those are what matter for
2594 // addressing mode optimizations.
2595 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2596 "Unexpected value size!");
2597
2598 SDValue And = N.getOperand(i: 0);
2599 if (And.getOpcode() != ISD::AND) break;
2600 SDValue X = And.getOperand(i: 0);
2601
2602 // The mask used for the transform is expected to be post-shift, but we
2603 // found the shift first so just apply the shift to the mask before passing
2604 // it down.
2605 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
2606 !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
2607 break;
2608 uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);
2609
2610 // Try to fold the mask and shift into the scale, and return false if we
2611 // succeed.
2612 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
2613 return false;
2614 break;
2615 }
2616
2617 case ISD::SMUL_LOHI:
2618 case ISD::UMUL_LOHI:
2619 // A mul_lohi where we need the low part can be folded as a plain multiply.
2620 if (N.getResNo() != 0) break;
2621 [[fallthrough]];
2622 case ISD::MUL:
2623 case X86ISD::MUL_IMM:
2624 // X*[3,5,9] -> X+X*[2,4,8]
2625 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2626 AM.Base_Reg.getNode() == nullptr &&
2627 AM.IndexReg.getNode() == nullptr) {
2628 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
2629 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2630 CN->getZExtValue() == 9) {
2631 AM.Scale = unsigned(CN->getZExtValue())-1;
2632
2633 SDValue MulVal = N.getOperand(i: 0);
2634 SDValue Reg;
2635
2636 // Okay, we know that we have a scale by now. However, if the scaled
2637 // value is an add of something and a constant, we can fold the
2638 // constant into the disp field here.
2639 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2640 isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
2641 Reg = MulVal.getOperand(i: 0);
2642 auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
2643 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2644 if (foldOffsetIntoAddress(Offset: Disp, AM))
2645 Reg = N.getOperand(i: 0);
2646 } else {
2647 Reg = N.getOperand(i: 0);
2648 }
2649
2650 AM.IndexReg = AM.Base_Reg = Reg;
2651 return false;
2652 }
2653 }
2654 break;
2655
2656 case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address (leaving the
    // index field unused), use -B as the index. This is a win if A has
    // multiple parts that can be folded into the address. It also saves a mov
    // if the base register has other uses, since it avoids a two-address sub
    // instruction; however, it costs an additional mov if the index register
    // has other uses.
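    // Illustrative outcome (a sketch): when the fold is taken for A-B, AM
    // keeps whatever A folded into (base, displacement, etc.), IndexReg is
    // set to B with NegateIndex, and the NEG of B is materialized later.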
2663
2664 // Add an artificial use to this node so that we can keep track of
2665 // it if it gets CSE'd with a different node.
2666 HandleSDNode Handle(N);
2667
2668 // Test if the LHS of the sub can be folded.
2669 X86ISelAddressMode Backup = AM;
2670 if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
2671 N = Handle.getValue();
2672 AM = Backup;
2673 break;
2674 }
2675 N = Handle.getValue();
2676 // Test if the index field is free for use.
2677 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2678 AM = Backup;
2679 break;
2680 }
2681
2682 int Cost = 0;
2683 SDValue RHS = N.getOperand(i: 1);
2684 // If the RHS involves a register with multiple uses, this
2685 // transformation incurs an extra mov, due to the neg instruction
2686 // clobbering its operand.
2687 if (!RHS.getNode()->hasOneUse() ||
2688 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2689 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2690 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2691 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2692 RHS.getOperand(i: 0).getValueType() == MVT::i32))
2693 ++Cost;
2694 // If the base is a register with multiple uses, this
2695 // transformation may save a mov.
2696 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2697 !AM.Base_Reg.getNode()->hasOneUse()) ||
2698 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2699 --Cost;
2700 // If the folded LHS was interesting, this transformation saves
2701 // address arithmetic.
2702 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2703 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2704 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2705 --Cost;
2706 // If it doesn't look like it may be an overall win, don't do it.
2707 if (Cost >= 0) {
2708 AM = Backup;
2709 break;
2710 }
2711
2712 // Ok, the transformation is legal and appears profitable. Go for it.
2713 // Negation will be emitted later to avoid creating dangling nodes if this
2714 // was an unprofitable LEA.
2715 AM.IndexReg = RHS;
2716 AM.NegateIndex = true;
2717 AM.Scale = 1;
2718 return false;
2719 }
2720
2721 case ISD::OR:
2722 case ISD::XOR:
2723 // See if we can treat the OR/XOR node as an ADD node.
2724 if (!CurDAG->isADDLike(Op: N))
2725 break;
2726 [[fallthrough]];
2727 case ISD::ADD:
2728 if (!matchAdd(N, AM, Depth))
2729 return false;
2730 break;
2731
2732 case ISD::AND: {
2733 // Perform some heroic transforms on an and of a constant-count shift
2734 // with a constant to enable use of the scaled offset field.
2735
2736 // Scale must not be used already.
2737 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2738
2739 // We only handle up to 64-bit values here as those are what matter for
2740 // addressing mode optimizations.
2741 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2742 "Unexpected value size!");
2743
2744 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
2745 break;
2746
2747 if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
2748 SDValue Shift = N.getOperand(i: 0);
2749 SDValue X = Shift.getOperand(i: 0);
2750
2751 uint64_t Mask = N.getConstantOperandVal(i: 1);
2752
2753 // Try to fold the mask and shift into an extract and scale.
2754 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2755 return false;
2756
2757 // Try to fold the mask and shift directly into the scale.
2758 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2759 return false;
2760
2761 // Try to fold the mask and shift into BEXTR and scale.
2762 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
2763 return false;
2764 }
2765
2766 // Try to swap the mask and shift to place shifts which can be done as
2767 // a scale on the outside of the mask.
2768 if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
2769 return false;
2770
2771 break;
2772 }
2773 case ISD::ZERO_EXTEND: {
2774 // Try to widen a zexted shift left to the same size as its use, so we can
2775 // match the shift as a scale factor.
2776 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2777 break;
2778
2779 SDValue Src = N.getOperand(i: 0);
2780
2781 // See if we can match a zext(addlike(x,c)).
2782 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2783 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2784 if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
2785 if (Index != N) {
2786 AM.IndexReg = Index;
2787 return false;
2788 }
2789
2790 // Peek through mask: zext(and(shl(x,c1),c2))
2791 APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
2792 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2793 if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
2794 Mask = MaskC->getAPIntValue();
2795 Src = Src.getOperand(i: 0);
2796 }
2797
2798 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2799 // Give up if the shift is not a valid scale factor [1,2,3].
2800 SDValue ShlSrc = Src.getOperand(i: 0);
2801 SDValue ShlAmt = Src.getOperand(i: 1);
2802 auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
2803 if (!ShAmtC)
2804 break;
2805 unsigned ShAmtV = ShAmtC->getZExtValue();
2806 if (ShAmtV > 3)
2807 break;
2808
2809 // The narrow shift must only shift out zero bits (it must be 'nuw').
2810 // That makes it safe to widen to the destination type.
2811 APInt HighZeros =
2812 APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
2813 if (!Src->getFlags().hasNoUnsignedWrap() &&
2814 !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
2815 break;
2816
2817 // zext (shl nuw i8 %x, C1) to i32
2818 // --> shl (zext i8 %x to i32), (zext C1)
2819 // zext (and (shl nuw i8 %x, C1), C2) to i32
2820 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2821 MVT SrcVT = ShlSrc.getSimpleValueType();
2822 MVT VT = N.getSimpleValueType();
2823 SDLoc DL(N);
2824
2825 SDValue Res = ShlSrc;
2826 if (!Mask.isAllOnes()) {
2827 Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
2828 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2829 Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
2830 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2831 }
2832 SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
2833 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
2834 SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
2835 insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
2836 CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
2837 CurDAG->RemoveDeadNode(N: N.getNode());
2838
2839 // Convert the shift to scale factor.
2840 AM.Scale = 1 << ShAmtV;
      // If matchIndexRecursively is not called here, Zext may be replaced by
      // other nodes but still be passed to a builder method later.
2844 AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
2845 return false;
2846 }
2847
2848 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2849 // Try to fold the mask and shift into an extract and scale.
2850 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2851 X: Src.getOperand(i: 0), AM))
2852 return false;
2853
2854 // Try to fold the mask and shift directly into the scale.
2855 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2856 X: Src.getOperand(i: 0), AM))
2857 return false;
2858
2859 // Try to fold the mask and shift into BEXTR and scale.
2860 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2861 X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
2862 return false;
2863 }
2864
2865 break;
2866 }
2867 }
2868
2869 return matchAddressBase(N, AM);
2870}
2871
2872/// Helper for MatchAddress. Add the specified node to the
2873/// specified addressing mode without any further recursion.
2874bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2875 // Is the base register already occupied?
2876 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2877 // If so, check to see if the scale index register is set.
2878 if (!AM.IndexReg.getNode()) {
2879 AM.IndexReg = N;
2880 AM.Scale = 1;
2881 return false;
2882 }
2883
2884 // Otherwise, we cannot select it.
2885 return true;
2886 }
2887
2888 // Default, generate it as a register.
2889 AM.BaseType = X86ISelAddressMode::RegBase;
2890 AM.Base_Reg = N;
2891 return false;
2892}
2893
2894bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2895 X86ISelAddressMode &AM,
2896 unsigned Depth) {
2897 LLVM_DEBUG({
2898 dbgs() << "MatchVectorAddress: ";
2899 AM.dump(CurDAG);
2900 });
2901 // Limit recursion.
2902 if (Depth >= SelectionDAG::MaxRecursionDepth)
2903 return matchAddressBase(N, AM);
2904
2905 // TODO: Support other operations.
2906 switch (N.getOpcode()) {
2907 case ISD::Constant: {
2908 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2909 if (!foldOffsetIntoAddress(Offset: Val, AM))
2910 return false;
2911 break;
2912 }
2913 case X86ISD::Wrapper:
2914 if (!matchWrapper(N, AM))
2915 return false;
2916 break;
2917 case ISD::ADD: {
2918 // Add an artificial use to this node so that we can keep track of
2919 // it if it gets CSE'd with a different node.
2920 HandleSDNode Handle(N);
2921
2922 X86ISelAddressMode Backup = AM;
2923 if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
2924 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2925 Depth: Depth + 1))
2926 return false;
2927 AM = Backup;
2928
2929 // Try again after commuting the operands.
2930 if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2931 Depth: Depth + 1) &&
2932 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
2933 Depth: Depth + 1))
2934 return false;
2935 AM = Backup;
2936
2937 N = Handle.getValue();
2938 break;
2939 }
2940 }
2941
2942 return matchAddressBase(N, AM);
2943}
2944
2945/// Helper for selectVectorAddr. Handles things that can be folded into a
2946/// gather/scatter address. The index register and scale should have already
2947/// been handled.
2948bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2949 return matchVectorAddressRecursively(N, AM, Depth: 0);
2950}
2951
2952bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2953 SDValue IndexOp, SDValue ScaleOp,
2954 SDValue &Base, SDValue &Scale,
2955 SDValue &Index, SDValue &Disp,
2956 SDValue &Segment) {
2957 X86ISelAddressMode AM;
2958 AM.Scale = ScaleOp->getAsZExtVal();
2959
2960 // Attempt to match index patterns, as long as we're not relying on implicit
2961 // sign-extension, which is performed BEFORE scale.
2962 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2963 AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
2964 else
2965 AM.IndexReg = IndexOp;
2966
2967 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2968 if (AddrSpace == X86AS::GS)
2969 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
2970 if (AddrSpace == X86AS::FS)
2971 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
2972 if (AddrSpace == X86AS::SS)
2973 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
2974
2975 SDLoc DL(BasePtr);
2976 MVT VT = BasePtr.getSimpleValueType();
2977
2978 // Try to match into the base and displacement fields.
2979 if (matchVectorAddress(N: BasePtr, AM))
2980 return false;
2981
2982 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2983 return true;
2984}
2985
2986/// Returns true if it is able to pattern match an addressing mode.
2987/// It returns the operands which make up the maximal addressing mode it can
2988/// match by reference.
2989///
2990/// Parent is the parent node of the addr operand that is being matched. It
2991/// is always a load, store, atomic node, or null. It is only null when
2992/// checking memory operands for inline asm nodes.
2993bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2994 SDValue &Scale, SDValue &Index,
2995 SDValue &Disp, SDValue &Segment) {
2996 X86ISelAddressMode AM;
2997
2998 if (Parent &&
2999      // These opcodes all take an "addr:$ptr" operand but are not MemSDNodes,
3000      // and thus don't have proper address-space info.
3001 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3002 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3003 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3004 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3005 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3006 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3007 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3008 unsigned AddrSpace =
3009 cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
3010 if (AddrSpace == X86AS::GS)
3011 AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
3012 if (AddrSpace == X86AS::FS)
3013 AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
3014 if (AddrSpace == X86AS::SS)
3015 AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
3016 }
3017
3018 // Save the DL and VT before calling matchAddress, it can invalidate N.
3019 SDLoc DL(N);
3020 MVT VT = N.getSimpleValueType();
3021
3022 if (matchAddress(N, AM))
3023 return false;
3024
3025 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3026 return true;
3027}
3028
3029bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3030  // Cannot use 32-bit constants to reference objects in the kernel/large code
3031  // models.
3032 if (TM.getCodeModel() == CodeModel::Kernel ||
3033 TM.getCodeModel() == CodeModel::Large)
3034 return false;
3035
3036  // In static codegen with the small code model, we can get the address of a
3037  // label into a register with 'movl'.
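  // For example (illustrative symbol name), in the small code model:
  //   movl $sym, %eax      # 5 bytes; writing EAX zero-extends into RAX
  // instead of the 10-byte "movabsq $sym, %rax".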
3038 if (N->getOpcode() != X86ISD::Wrapper)
3039 return false;
3040
3041 N = N.getOperand(i: 0);
3042
3043 // At least GNU as does not accept 'movl' for TPOFF relocations.
3044 // FIXME: We could use 'movl' when we know we are targeting MC.
3045 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3046 return false;
3047
3048 Imm = N;
3049  // The small/medium code models can reference non-TargetGlobalAddress objects
3050  // with 32-bit constants.
3051 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3052 return TM.getCodeModel() == CodeModel::Small ||
3053 TM.getCodeModel() == CodeModel::Medium;
3054 }
3055
3056 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
3057 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3058 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
3059
3060 return !TM.isLargeGlobalValue(GV);
3061}
3062
3063bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3064 SDValue &Index, SDValue &Disp,
3065 SDValue &Segment) {
3066 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3067 SDLoc DL(N);
3068
3069 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3070 return false;
3071
3072 EVT BaseType = Base.getValueType();
3073 unsigned SubReg;
3074 if (BaseType == MVT::i8)
3075 SubReg = X86::sub_8bit;
3076 else if (BaseType == MVT::i16)
3077 SubReg = X86::sub_16bit;
3078 else
3079 SubReg = X86::sub_32bit;
3080
3081 auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
3082 if (RN && RN->getReg() == 0)
3083 Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
3084 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3085 BaseType == MVT::i32) &&
3086 !isa<FrameIndexSDNode>(Val: Base)) {
3087 // Base could already be %rip, particularly in the x32 ABI.
3088 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
3089 VT: MVT::i64), 0);
3090 Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base);
3091 }
3092
3093 [[maybe_unused]] EVT IndexType = Index.getValueType();
3094 RN = dyn_cast<RegisterSDNode>(Val&: Index);
3095 if (RN && RN->getReg() == 0)
3096 Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
3097 else {
3098 assert((IndexType == BaseType) &&
3099 "Expect to be extending 8/16/32-bit registers for use in LEA");
3100 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
3101 VT: MVT::i64), 0);
3102 Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index);
3103 }
3104
3105 return true;
3106}
3107
3108/// Calls matchAddress and determines whether the maximal addressing mode it
3109/// matches can be cost-effectively emitted as an LEA instruction.
3110bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3111 SDValue &Base, SDValue &Scale,
3112 SDValue &Index, SDValue &Disp,
3113 SDValue &Segment) {
3114 X86ISelAddressMode AM;
3115
3116 // Save the DL and VT before calling matchAddress, it can invalidate N.
3117 SDLoc DL(N);
3118 MVT VT = N.getSimpleValueType();
3119
3120 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3121 // segments.
3122 SDValue Copy = AM.Segment;
3123 SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32);
3124 AM.Segment = T;
3125 if (matchAddress(N, AM))
3126 return false;
3127  assert(T == AM.Segment);
3128 AM.Segment = Copy;
3129
3130 unsigned Complexity = 0;
3131 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3132 Complexity = 1;
3133 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3134 Complexity = 4;
3135
3136 if (AM.IndexReg.getNode())
3137 Complexity++;
3138
3139  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or a
3140  // simple shift.
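  // For example, prefer "addl %ecx, %ecx" or "shll $1, %ecx" over
  // "leal (,%ecx,2), %ecx" when nothing else would be folded into the LEA.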
3141 if (AM.Scale > 1)
3142 Complexity++;
3143
3144 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3145 // to a LEA. This is determined with some experimentation but is by no means
3146 // optimal (especially for code size consideration). LEA is nice because of
3147 // its three-address nature. Tweak the cost function again when we can run
3148 // convertToThreeAddress() at register allocation time.
3149 if (AM.hasSymbolicDisplacement()) {
3150 // For X86-64, always use LEA to materialize RIP-relative addresses.
3151 if (Subtarget->is64Bit())
3152 Complexity = 4;
3153 else
3154 Complexity += 2;
3155 }
3156
3157 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3158 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3159 // duplicating flag-producing instructions later in the pipeline.
3160 if (N.getOpcode() == ISD::ADD) {
3161 auto isMathWithFlags = [](SDValue V) {
3162 switch (V.getOpcode()) {
3163 case X86ISD::ADD:
3164 case X86ISD::SUB:
3165 case X86ISD::ADC:
3166 case X86ISD::SBB:
3167 case X86ISD::SMUL:
3168 case X86ISD::UMUL:
3169 /* TODO: These opcodes can be added safely, but we may want to justify
3170 their inclusion for different reasons (better for reg-alloc).
3171 case X86ISD::OR:
3172 case X86ISD::XOR:
3173 case X86ISD::AND:
3174 */
3175 // Value 1 is the flag output of the node - verify it's not dead.
3176 return !SDValue(V.getNode(), 1).use_empty();
3177 default:
3178 return false;
3179 }
3180 };
3181 // TODO: We might want to factor in whether there's a load folding
3182 // opportunity for the math op that disappears with LEA.
3183 if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
3184 Complexity++;
3185 }
3186
3187 if (AM.Disp)
3188 Complexity++;
3189
3190 // If it isn't worth using an LEA, reject it.
3191 if (Complexity <= 2)
3192 return false;
3193
3194 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3195 return true;
3196}
3197
3198/// This is only run on TargetGlobalTLSAddress or TargetExternalSymbol nodes.
3199bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3200 SDValue &Scale, SDValue &Index,
3201 SDValue &Disp, SDValue &Segment) {
3202 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3203 N.getOpcode() == ISD::TargetExternalSymbol);
3204
3205 X86ISelAddressMode AM;
3206 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3207 AM.GV = GA->getGlobal();
3208 AM.Disp += GA->getOffset();
3209 AM.SymbolFlags = GA->getTargetFlags();
3210 } else {
3211 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3212 AM.ES = SA->getSymbol();
3213 AM.SymbolFlags = SA->getTargetFlags();
3214 }
3215
3216 if (Subtarget->is32Bit()) {
3217 AM.Scale = 1;
3218 AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32);
3219 }
3220
3221 MVT VT = N.getSimpleValueType();
3222 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3223 return true;
3224}
3225
3226bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3227 // Keep track of the original value type and whether this value was
3228 // truncated. If we see a truncation from pointer type to VT that truncates
3229 // bits that are known to be zero, we can use a narrow reference.
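  // For example, a global whose !absolute_symbol range is [0, 0x10000) is known
  // to fit in 16 bits, so an i16 truncation of its wrapped address can be
  // selected as a direct i16 symbol reference.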
3230 EVT VT = N.getValueType();
3231 bool WasTruncated = false;
3232 if (N.getOpcode() == ISD::TRUNCATE) {
3233 WasTruncated = true;
3234 N = N.getOperand(i: 0);
3235 }
3236
3237 if (N.getOpcode() != X86ISD::Wrapper)
3238 return false;
3239
3240 // We can only use non-GlobalValues as immediates if they were not truncated,
3241 // as we do not have any range information. If we have a GlobalValue and the
3242 // address was not truncated, we can select it as an operand directly.
3243 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3244 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3245 Op = N.getOperand(i: 0);
3246 // We can only select the operand directly if we didn't have to look past a
3247 // truncate.
3248 return !WasTruncated;
3249 }
3250
3251 // Check that the global's range fits into VT.
3252 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3253 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3254 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3255 return false;
3256
3257 // Okay, we can use a narrow reference.
3258 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3259 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3260 return true;
3261}
3262
3263bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3264 SDValue &Base, SDValue &Scale,
3265 SDValue &Index, SDValue &Disp,
3266 SDValue &Segment) {
3267 assert(Root && P && "Unknown root/parent nodes");
3268 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3269 !IsProfitableToFold(N, U: P, Root) ||
3270 !IsLegalToFold(N, U: P, Root, OptLevel))
3271 return false;
3272
3273 return selectAddr(Parent: N.getNode(),
3274 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3275}
3276
3277bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3278 SDValue &Base, SDValue &Scale,
3279 SDValue &Index, SDValue &Disp,
3280 SDValue &Segment) {
3281 assert(Root && P && "Unknown root/parent nodes");
3282 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3283 !IsProfitableToFold(N, U: P, Root) ||
3284 !IsLegalToFold(N, U: P, Root, OptLevel))
3285 return false;
3286
3287 return selectAddr(Parent: N.getNode(),
3288 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3289}
3290
3291/// Return an SDNode that returns the value of the global base register.
3292/// Output instructions required to initialize the global base register,
3293/// if necessary.
3294SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3295 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3296 auto &DL = MF->getDataLayout();
3297 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3298}
3299
3300bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3301 if (N->getOpcode() == ISD::TRUNCATE)
3302 N = N->getOperand(Num: 0).getNode();
3303 if (N->getOpcode() != X86ISD::Wrapper)
3304 return false;
3305
3306 auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
3307 if (!GA)
3308 return false;
3309
3310 auto *GV = GA->getGlobal();
3311 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3312 if (CR)
3313 return CR->getSignedMin().sge(RHS: -1ull << Width) &&
3314 CR->getSignedMax().slt(RHS: 1ull << Width);
3315 // In the kernel code model, globals are in the negative 2GB of the address
3316 // space, so globals can be a sign extended 32-bit immediate.
3317 // In other code models, small globals are in the low 2GB of the address
3318 // space, so sign extending them is equivalent to zero extending them.
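  // For example, a kernel-model global at 0xFFFFFFFF80001000 is representable
  // as the sign-extended 32-bit immediate 0x80001000.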
3319 return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
3320 !TM.isLargeGlobalValue(GV);
3321}
3322
3323X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3324 assert(N->isMachineOpcode() && "Unexpected node");
3325 unsigned Opc = N->getMachineOpcode();
3326 const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc);
3327 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3328 if (CondNo < 0)
3329 return X86::COND_INVALID;
3330
3331 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3332}
3333
3334/// Test whether the given X86ISD::CMP node has any users that use a flag
3335/// other than ZF.
3336bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3337 // Examine each user of the node.
3338 for (SDUse &Use : Flags->uses()) {
3339 // Only check things that use the flags.
3340 if (Use.getResNo() != Flags.getResNo())
3341 continue;
3342 SDNode *User = Use.getUser();
3343 // Only examine CopyToReg uses that copy to EFLAGS.
3344 if (User->getOpcode() != ISD::CopyToReg ||
3345 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3346 return false;
3347 // Examine each user of the CopyToReg use.
3348 for (SDUse &FlagUse : User->uses()) {
3349 // Only examine the Flag result.
3350 if (FlagUse.getResNo() != 1)
3351 continue;
3352 // Anything unusual: assume conservatively.
3353 if (!FlagUse.getUser()->isMachineOpcode())
3354 return false;
3355 // Examine the condition code of the user.
3356 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3357
3358 switch (CC) {
3359 // Comparisons which only use the zero flag.
3360 case X86::COND_E: case X86::COND_NE:
3361 continue;
3362 // Anything else: assume conservatively.
3363 default:
3364 return false;
3365 }
3366 }
3367 }
3368 return true;
3369}
3370
3371/// Test whether the given X86ISD::CMP node has any uses which require the SF
3372/// flag to be accurate.
3373bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3374 // Examine each user of the node.
3375 for (SDUse &Use : Flags->uses()) {
3376 // Only check things that use the flags.
3377 if (Use.getResNo() != Flags.getResNo())
3378 continue;
3379 SDNode *User = Use.getUser();
3380 // Only examine CopyToReg uses that copy to EFLAGS.
3381 if (User->getOpcode() != ISD::CopyToReg ||
3382 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3383 return false;
3384 // Examine each user of the CopyToReg use.
3385 for (SDUse &FlagUse : User->uses()) {
3386 // Only examine the Flag result.
3387 if (FlagUse.getResNo() != 1)
3388 continue;
3389 // Anything unusual: assume conservatively.
3390 if (!FlagUse.getUser()->isMachineOpcode())
3391 return false;
3392 // Examine the condition code of the user.
3393 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3394
3395 switch (CC) {
3396 // Comparisons which don't examine the SF flag.
3397 case X86::COND_A: case X86::COND_AE:
3398 case X86::COND_B: case X86::COND_BE:
3399 case X86::COND_E: case X86::COND_NE:
3400 case X86::COND_O: case X86::COND_NO:
3401 case X86::COND_P: case X86::COND_NP:
3402 continue;
3403 // Anything else: assume conservatively.
3404 default:
3405 return false;
3406 }
3407 }
3408 }
3409 return true;
3410}
3411
3412static bool mayUseCarryFlag(X86::CondCode CC) {
3413 switch (CC) {
3414 // Comparisons which don't examine the CF flag.
3415 case X86::COND_O: case X86::COND_NO:
3416 case X86::COND_E: case X86::COND_NE:
3417 case X86::COND_S: case X86::COND_NS:
3418 case X86::COND_P: case X86::COND_NP:
3419 case X86::COND_L: case X86::COND_GE:
3420 case X86::COND_G: case X86::COND_LE:
3421 return false;
3422 // Anything else: assume conservatively.
3423 default:
3424 return true;
3425 }
3426}
3427
3428/// Test whether the given node which sets flags has any uses which require the
3429/// CF flag to be accurate.
3430bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3431 // Examine each user of the node.
3432 for (SDUse &Use : Flags->uses()) {
3433 // Only check things that use the flags.
3434 if (Use.getResNo() != Flags.getResNo())
3435 continue;
3436
3437 SDNode *User = Use.getUser();
3438 unsigned UserOpc = User->getOpcode();
3439
3440 if (UserOpc == ISD::CopyToReg) {
3441 // Only examine CopyToReg uses that copy to EFLAGS.
3442 if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3443 return false;
3444 // Examine each user of the CopyToReg use.
3445 for (SDUse &FlagUse : User->uses()) {
3446 // Only examine the Flag result.
3447 if (FlagUse.getResNo() != 1)
3448 continue;
3449 // Anything unusual: assume conservatively.
3450 if (!FlagUse.getUser()->isMachineOpcode())
3451 return false;
3452 // Examine the condition code of the user.
3453 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3454
3455 if (mayUseCarryFlag(CC))
3456 return false;
3457 }
3458
3459 // This CopyToReg is ok. Move on to the next user.
3460 continue;
3461 }
3462
3463 // This might be an unselected node. So look for the pre-isel opcodes that
3464 // use flags.
3465 unsigned CCOpNo;
3466 switch (UserOpc) {
3467 default:
3468 // Something unusual. Be conservative.
3469 return false;
3470 case X86ISD::SETCC: CCOpNo = 0; break;
3471 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3472 case X86ISD::CMOV: CCOpNo = 2; break;
3473 case X86ISD::BRCOND: CCOpNo = 2; break;
3474 }
3475
3476 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo);
3477 if (mayUseCarryFlag(CC))
3478 return false;
3479 }
3480 return true;
3481}
3482
3483/// Check whether the chain ending in StoreNode is suitable for folding the
3484/// {load; op; store} sequence into a single read-modify-write instruction.
3485static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3486 SDValue StoredVal, SelectionDAG *CurDAG,
3487 unsigned LoadOpNo,
3488 LoadSDNode *&LoadNode,
3489 SDValue &InputChain) {
3490 // Is the stored value result 0 of the operation?
3491 if (StoredVal.getResNo() != 0) return false;
3492
3493  // Are there any uses of the operation other than the store?
3494 if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;
3495
3496 // Is the store non-extending and non-indexed?
3497 if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
3498 return false;
3499
3500 SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
3501 // Is the stored value a non-extending and non-indexed load?
3502 if (!ISD::isNormalLoad(N: Load.getNode())) return false;
3503
3504 // Return LoadNode by reference.
3505 LoadNode = cast<LoadSDNode>(Val&: Load);
3506
3507  // Is the store the only read of the loaded value?
3508 if (!Load.hasOneUse())
3509 return false;
3510
3511 // Is the address of the store the same as the load?
3512 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3513 LoadNode->getOffset() != StoreNode->getOffset())
3514 return false;
3515
3516 bool FoundLoad = false;
3517 SmallVector<SDValue, 4> ChainOps;
3518 SmallVector<const SDNode *, 4> LoopWorklist;
3519 SmallPtrSet<const SDNode *, 16> Visited;
3520 const unsigned int Max = 1024;
3521
3522 // Visualization of Load-Op-Store fusion:
3523 // -------------------------
3524 // Legend:
3525 // *-lines = Chain operand dependencies.
3526 // |-lines = Normal operand dependencies.
3527 // Dependencies flow down and right. n-suffix references multiple nodes.
3528 //
3529 // C Xn C
3530 // * * *
3531 // * * *
3532 // Xn A-LD Yn TF Yn
3533 // * * \ | * |
3534 // * * \ | * |
3535 // * * \ | => A--LD_OP_ST
3536 // * * \| \
3537 // TF OP \
3538 // * | \ Zn
3539 // * | \
3540 // A-ST Zn
3541 //
3542
3543 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3544 // #2: Yn -> LD
3545 // #3: ST -> Zn
3546
3547 // Ensure the transform is safe by checking for the dual
3548 // dependencies to make sure we do not induce a loop.
3549
3550 // As LD is a predecessor to both OP and ST we can do this by checking:
3551 // a). if LD is a predecessor to a member of Xn or Yn.
3552 // b). if a Zn is a predecessor to ST.
3553
3554 // However, (b) can only occur through being a chain predecessor to
3555 // ST, which is the same as Zn being a member or predecessor of Xn,
3556 // which is a subset of LD being a predecessor of Xn. So it's
3557 // subsumed by check (a).
3558
3559 SDValue Chain = StoreNode->getChain();
3560
3561 // Gather X elements in ChainOps.
3562 if (Chain == Load.getValue(R: 1)) {
3563 FoundLoad = true;
3564 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3565 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3566 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3567 SDValue Op = Chain.getOperand(i);
3568 if (Op == Load.getValue(R: 1)) {
3569 FoundLoad = true;
3570 // Drop Load, but keep its chain. No cycle check necessary.
3571 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3572 continue;
3573 }
3574 LoopWorklist.push_back(Elt: Op.getNode());
3575 ChainOps.push_back(Elt: Op);
3576 }
3577 }
3578
3579 if (!FoundLoad)
3580 return false;
3581
3582 // Worklist is currently Xn. Add Yn to worklist.
3583 for (SDValue Op : StoredVal->ops())
3584 if (Op.getNode() != LoadNode)
3585 LoopWorklist.push_back(Elt: Op.getNode());
3586
3587 // Check (a) if Load is a predecessor to Xn + Yn
3588 if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
3589 TopologicalPrune: true))
3590 return false;
3591
3592 InputChain =
3593 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps);
3594 return true;
3595}
3596
3597// Change a chain of {load; op; store} of the same value into a simple op
3598// through memory of that value, if the uses of the modified value and its
3599// address are suitable.
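// For example:
//   movl (%rdi), %eax
//   addl %esi, %eax
//   movl %eax, (%rdi)
// becomes a single "addl %esi, (%rdi)".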
3600//
3601// The tablegen memory-operand pattern is currently not able to match the case
3602// where the EFLAGS produced by the original operation are used.
3603//
3604// To move this to tablegen, we'll need to improve tablegen to allow flags to
3605// be transferred from a node in the pattern to the result node, probably with
3606// a new keyword. For example, we have this
3607// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3608// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3609// but maybe need something like this
3610// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3611// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3612// (transferrable EFLAGS)]>;
3613//
3614// Until then, we manually fold these and instruction select the operation
3615// here.
3616bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3617 auto *StoreNode = cast<StoreSDNode>(Val: Node);
3618 SDValue StoredVal = StoreNode->getOperand(Num: 1);
3619 unsigned Opc = StoredVal->getOpcode();
3620
3621  // Before we try to select anything, make sure this is a memory operand size
3622 // and opcode we can handle. Note that this must match the code below that
3623 // actually lowers the opcodes.
3624 EVT MemVT = StoreNode->getMemoryVT();
3625 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3626 MemVT != MVT::i8)
3627 return false;
3628
3629 bool IsCommutable = false;
3630 bool IsNegate = false;
3631 switch (Opc) {
3632 default:
3633 return false;
3634 case X86ISD::SUB:
3635 IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
3636 break;
3637 case X86ISD::SBB:
3638 break;
3639 case X86ISD::ADD:
3640 case X86ISD::ADC:
3641 case X86ISD::AND:
3642 case X86ISD::OR:
3643 case X86ISD::XOR:
3644 IsCommutable = true;
3645 break;
3646 }
3647
3648 unsigned LoadOpNo = IsNegate ? 1 : 0;
3649 LoadSDNode *LoadNode = nullptr;
3650 SDValue InputChain;
3651 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3652 LoadNode, InputChain)) {
3653 if (!IsCommutable)
3654 return false;
3655
3656 // This operation is commutable, try the other operand.
3657 LoadOpNo = 1;
3658 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3659 LoadNode, InputChain))
3660 return false;
3661 }
3662
3663 SDValue Base, Scale, Index, Disp, Segment;
3664 if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3665 Segment))
3666 return false;
3667
3668 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3669 unsigned Opc8) {
3670 switch (MemVT.getSimpleVT().SimpleTy) {
3671 case MVT::i64:
3672 return Opc64;
3673 case MVT::i32:
3674 return Opc32;
3675 case MVT::i16:
3676 return Opc16;
3677 case MVT::i8:
3678 return Opc8;
3679 default:
3680 llvm_unreachable("Invalid size!");
3681 }
3682 };
3683
3684 MachineSDNode *Result;
3685 switch (Opc) {
3686 case X86ISD::SUB:
3687 // Handle negate.
3688 if (IsNegate) {
3689 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3690 X86::NEG8m);
3691 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3692 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
3693 VT2: MVT::Other, Ops);
3694 break;
3695 }
3696 [[fallthrough]];
3697 case X86ISD::ADD:
3698 // Try to match inc/dec.
3699 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3700 bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
3701 bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
3702      // An ADD/SUB by 1/-1 whose carry flag result is unused can use inc/dec.
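      // For example, "addl $1, (%rdi)" becomes "incl (%rdi)" and
      // "addl $-1, (%rdi)" becomes "decl (%rdi)". INC/DEC do not update CF,
      // hence the hasNoCarryFlagUses check below.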
3703 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3704 unsigned NewOpc =
3705 ((Opc == X86ISD::ADD) == IsOne)
3706 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3707 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3708 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3709 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
3710 VT2: MVT::Other, Ops);
3711 break;
3712 }
3713 }
3714 [[fallthrough]];
3715 case X86ISD::ADC:
3716 case X86ISD::SBB:
3717 case X86ISD::AND:
3718 case X86ISD::OR:
3719 case X86ISD::XOR: {
3720 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3721 switch (Opc) {
3722 case X86ISD::ADD:
3723 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3724 X86::ADD8mr);
3725 case X86ISD::ADC:
3726 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3727 X86::ADC8mr);
3728 case X86ISD::SUB:
3729 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3730 X86::SUB8mr);
3731 case X86ISD::SBB:
3732 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3733 X86::SBB8mr);
3734 case X86ISD::AND:
3735 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3736 X86::AND8mr);
3737 case X86ISD::OR:
3738 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3739 case X86ISD::XOR:
3740 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3741 X86::XOR8mr);
3742 default:
3743 llvm_unreachable("Invalid opcode!");
3744 }
3745 };
3746 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3747 switch (Opc) {
3748 case X86ISD::ADD:
3749 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3750 X86::ADD8mi);
3751 case X86ISD::ADC:
3752 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3753 X86::ADC8mi);
3754 case X86ISD::SUB:
3755 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3756 X86::SUB8mi);
3757 case X86ISD::SBB:
3758 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3759 X86::SBB8mi);
3760 case X86ISD::AND:
3761 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3762 X86::AND8mi);
3763 case X86ISD::OR:
3764 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3765 X86::OR8mi);
3766 case X86ISD::XOR:
3767 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3768 X86::XOR8mi);
3769 default:
3770 llvm_unreachable("Invalid opcode!");
3771 }
3772 };
3773
3774 unsigned NewOpc = SelectRegOpcode(Opc);
3775 SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);
3776
3777 // See if the operand is a constant that we can fold into an immediate
3778 // operand.
3779 if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
3780 int64_t OperandV = OperandC->getSExtValue();
3781
3782 // Check if we can shrink the operand enough to fit in an immediate (or
3783 // fit into a smaller immediate) by negating it and switching the
3784 // operation.
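      // For example, "addl $128, (%rdi)" needs a 32-bit immediate, but the
      // equivalent "subl $-128, (%rdi)" fits in a sign-extended 8-bit immediate.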
3785 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3786 ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) ||
3787 (MemVT == MVT::i64 && !isInt<32>(x: OperandV) &&
3788 isInt<32>(x: -OperandV))) &&
3789 hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3790 OperandV = -OperandV;
3791 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3792 }
3793
3794 if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) {
3795 Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
3796 NewOpc = SelectImmOpcode(Opc);
3797 }
3798 }
3799
3800 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3801 SDValue CopyTo =
3802 CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS,
3803 N: StoredVal.getOperand(i: 2), Glue: SDValue());
3804
3805 const SDValue Ops[] = {Base, Scale, Index, Disp,
3806 Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
3807 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
3808 Ops);
3809 } else {
3810 const SDValue Ops[] = {Base, Scale, Index, Disp,
3811 Segment, Operand, InputChain};
3812 Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
3813 Ops);
3814 }
3815 break;
3816 }
3817 default:
3818 llvm_unreachable("Invalid opcode!");
3819 }
3820
3821 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3822 LoadNode->getMemOperand()};
3823 CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);
3824
3825 // Update Load Chain uses as well.
3826 ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
3827 ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
3828 ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
3829 CurDAG->RemoveDeadNode(N: Node);
3830 return true;
3831}
3832
3833// See if this is an X & Mask that we can match to BEXTR/BZHI.
3834// Where Mask is one of the following patterns:
3835// a) x & (1 << nbits) - 1
3836// b) x & ~(-1 << nbits)
3837// c) x & (-1 >> (32 - y))
3838// d) x << (32 - y) >> (32 - y)
3839// e) (1 << nbits) - 1
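// For example, with BMI2, "x & ((1 << n) - 1)" can select to
//   bzhi %esi, %edi, %eax   # clear bits of edi at positions >= esi[7:0]
// and with only BMI1 it becomes a BEXTR whose control word is (n << 8),
// i.e. extract n bits starting at bit 0.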
3840bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3841 assert(
3842 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3843 Node->getOpcode() == ISD::SRL) &&
3844 "Should be either an and-mask, or right-shift after clearing high bits.");
3845
3846 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3847 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3848 return false;
3849
3850 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3851
3852 // Only supported for 32 and 64 bits.
3853 if (NVT != MVT::i32 && NVT != MVT::i64)
3854 return false;
3855
3856 SDValue NBits;
3857 bool NegateNBits;
3858
3859  // If we have BMI2's BZHI, we are ok with multi-use patterns.
3860 // Else, if we only have BMI1's BEXTR, we require one-use.
3861 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3862 auto checkUses = [AllowExtraUsesByDefault](
3863 SDValue Op, unsigned NUses,
3864 std::optional<bool> AllowExtraUses) {
3865 return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) ||
3866 Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo());
3867 };
3868 auto checkOneUse = [checkUses](SDValue Op,
3869 std::optional<bool> AllowExtraUses =
3870 std::nullopt) {
3871 return checkUses(Op, 1, AllowExtraUses);
3872 };
3873 auto checkTwoUse = [checkUses](SDValue Op,
3874 std::optional<bool> AllowExtraUses =
3875 std::nullopt) {
3876 return checkUses(Op, 2, AllowExtraUses);
3877 };
3878
3879 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3880 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3881 assert(V.getSimpleValueType() == MVT::i32 &&
3882 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3883 "Expected i64 -> i32 truncation");
3884 V = V.getOperand(i: 0);
3885 }
3886 return V;
3887 };
3888
3889 // a) x & ((1 << nbits) + (-1))
3890 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3891 &NegateNBits](SDValue Mask) -> bool {
3892 // Match `add`. Must only have one use!
3893 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3894 return false;
3895 // We should be adding all-ones constant (i.e. subtracting one.)
3896 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3897 return false;
3898 // Match `1 << nbits`. Might be truncated. Must only have one use!
3899 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3900 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3901 return false;
3902 if (!isOneConstant(V: M0->getOperand(Num: 0)))
3903 return false;
3904 NBits = M0->getOperand(Num: 1);
3905 NegateNBits = false;
3906 return true;
3907 };
3908
3909 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3910 V = peekThroughOneUseTruncation(V);
3911 return CurDAG->MaskedValueIsAllOnes(
3912 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
3913 loBitsSet: NVT.getSizeInBits()));
3914 };
3915
3916 // b) x & ~(-1 << nbits)
3917 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3918 &NBits, &NegateNBits](SDValue Mask) -> bool {
3919 // Match `~()`. Must only have one use!
3920 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3921 return false;
3922 // The -1 only has to be all-ones for the final Node's NVT.
3923 if (!isAllOnes(Mask->getOperand(Num: 1)))
3924 return false;
3925 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3926 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3927 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3928 return false;
3929 // The -1 only has to be all-ones for the final Node's NVT.
3930 if (!isAllOnes(M0->getOperand(Num: 0)))
3931 return false;
3932 NBits = M0->getOperand(Num: 1);
3933 NegateNBits = false;
3934 return true;
3935 };
3936
3937 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3938 // or leave the shift amount as-is, but then we'll have to negate it.
3939 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3940 unsigned Bitwidth) {
3941 NBits = ShiftAmt;
3942 NegateNBits = true;
3943 // Skip over a truncate of the shift amount, if any.
3944 if (NBits.getOpcode() == ISD::TRUNCATE)
3945 NBits = NBits.getOperand(i: 0);
3946 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3947 // If it doesn't match, that's fine, we'll just negate it ourselves.
3948 if (NBits.getOpcode() != ISD::SUB)
3949 return;
3950 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
3951 if (!V0 || V0->getZExtValue() != Bitwidth)
3952 return;
3953 NBits = NBits.getOperand(i: 1);
3954 NegateNBits = false;
3955 };
3956
3957 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3958 // or
3959 // c) x & (-1 >> (32 - y))
3960 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3961 canonicalizeShiftAmt](SDValue Mask) -> bool {
3962 // The mask itself may be truncated.
3963 Mask = peekThroughOneUseTruncation(Mask);
3964 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3965 // Match `l>>`. Must only have one use!
3966 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3967 return false;
3968 // We should be shifting truly all-ones constant.
3969 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
3970 return false;
3971 SDValue M1 = Mask.getOperand(i: 1);
3972 // The shift amount should not be used externally.
3973 if (!checkOneUse(M1))
3974 return false;
3975 canonicalizeShiftAmt(M1, Bitwidth);
3976 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3977 // is no extra use of the mask. Clearly, there was one since we are here.
3978 // But at the same time, if we need to negate the shift amount,
3979 // then we don't want the mask to stick around, else it's unprofitable.
3980 return !NegateNBits;
3981 };
3982
3983 SDValue X;
3984
3985 // d) x << z >> z but then we'll have to subtract z from bitwidth
3986 // or
3987 // d) x << (32 - y) >> (32 - y)
3988 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3989 AllowExtraUsesByDefault, &NegateNBits,
3990 &X](SDNode *Node) -> bool {
3991 if (Node->getOpcode() != ISD::SRL)
3992 return false;
3993 SDValue N0 = Node->getOperand(Num: 0);
3994 if (N0->getOpcode() != ISD::SHL)
3995 return false;
3996 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3997 SDValue N1 = Node->getOperand(Num: 1);
3998 SDValue N01 = N0->getOperand(Num: 1);
3999 // Both of the shifts must be by the exact same value.
4000 if (N1 != N01)
4001 return false;
4002 canonicalizeShiftAmt(N1, Bitwidth);
4003 // There should not be any external uses of the inner shift / shift amount.
4004 // Note that while we are generally okay with external uses given BMI2,
4005 // iff we need to negate the shift amount, we are not okay with extra uses.
4006 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4007 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4008 return false;
4009 X = N0->getOperand(Num: 0);
4010 return true;
4011 };
4012
4013 auto matchLowBitMask = [matchPatternA, matchPatternB,
4014 matchPatternC](SDValue Mask) -> bool {
4015 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4016 };
4017
4018 if (Node->getOpcode() == ISD::AND) {
4019 X = Node->getOperand(Num: 0);
4020 SDValue Mask = Node->getOperand(Num: 1);
4021
4022 if (matchLowBitMask(Mask)) {
4023 // Great.
4024 } else {
4025 std::swap(a&: X, b&: Mask);
4026 if (!matchLowBitMask(Mask))
4027 return false;
4028 }
4029 } else if (matchLowBitMask(SDValue(Node, 0))) {
4030 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
4031 } else if (!matchPatternD(Node))
4032 return false;
4033
4034 // If we need to negate the shift amount, require BMI2 BZHI support.
4035 // It's just too unprofitable for BMI1 BEXTR.
4036 if (NegateNBits && !Subtarget->hasBMI2())
4037 return false;
4038
4039 SDLoc DL(Node);
4040
4041 if (NBits.getSimpleValueType() != MVT::i8) {
4042 // Truncate the shift amount.
4043 NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits);
4044 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4045 }
4046
4047 // Turn (i32)(x & imm8) into (i32)x & imm32.
4048 ConstantSDNode *Imm = nullptr;
4049 if (NBits->getOpcode() == ISD::AND)
4050 if ((Imm = dyn_cast<ConstantSDNode>(Val: NBits->getOperand(Num: 1))))
4051 NBits = NBits->getOperand(Num: 0);
4052
4053 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4054 // All the other bits are undefined, we do not care about them.
4055 SDValue ImplDef = SDValue(
4056 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0);
4057 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
4058
4059 SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32);
4060 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
4061 NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL,
4062 VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal),
4063 0);
4064 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4065
4066 if (Imm) {
4067 NBits =
4068 CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: NBits,
4069 N2: CurDAG->getConstant(Val: Imm->getZExtValue(), DL, VT: MVT::i32));
4070 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4071 }
4072
4073 // We might have matched the amount of high bits to be cleared,
4074 // but we want the amount of low bits to be kept, so negate it then.
4075 if (NegateNBits) {
4076 SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32);
4077 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
4078
4079 NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits);
4080 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4081 }
4082
4083 if (Subtarget->hasBMI2()) {
4084    // Great, just emit the BZHI.
4085 if (NVT != MVT::i32) {
4086      // But we have to place the bit count into the wide-enough register first.
4087 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
4088 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4089 }
4090
4091 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4092 ReplaceNode(F: Node, T: Extract.getNode());
4093 SelectCode(N: Extract.getNode());
4094 return true;
4095 }
4096
4097  // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
4098  // shifted (potentially with a one-use trunc in between), and whether the
4099  // truncation was the only use of the shift,
4100  // and if so look past that one-use truncation.
4101 {
4102 SDValue RealX = peekThroughOneUseTruncation(X);
4103 // FIXME: only if the shift is one-use?
4104 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4105 X = RealX;
4106 }
4107
4108 MVT XVT = X.getSimpleValueType();
4109
4110 // Else, emitting BEXTR requires one more step.
4111 // The 'control' of BEXTR has the pattern of:
4112 // [15...8 bit][ 7...0 bit] location
4113 // [ bit count][ shift] name
4114  // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4115
4116 // Shift NBits left by 8 bits, thus producing 'control'.
4117  // This makes the low 8 bits zero.
4118 SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8);
4119 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
4120 SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8);
4121 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4122
4123 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4124 // FIXME: only if the shift is one-use?
4125 if (X.getOpcode() == ISD::SRL) {
4126 SDValue ShiftAmt = X.getOperand(i: 1);
4127 X = X.getOperand(i: 0);
4128
4129 assert(ShiftAmt.getValueType() == MVT::i8 &&
4130 "Expected shift amount to be i8");
4131
4132 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4133 // We could zext to i16 in some form, but we intentionally don't do that.
4134 SDValue OrigShiftAmt = ShiftAmt;
4135 ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt);
4136 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4137
4138 // And now 'or' these low 8 bits of shift amount into the 'control'.
4139 Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt);
4140 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4141 }
4142
4143  // But we have to place the 'control' into the wide-enough register first.
4144 if (XVT != MVT::i32) {
4145 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4146 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4147 }
4148
4149 // And finally, form the BEXTR itself.
4150 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4151
4152 // The 'X' was originally truncated. Do that now.
4153 if (XVT != NVT) {
4154 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4155 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4156 }
4157
4158 ReplaceNode(F: Node, T: Extract.getNode());
4159 SelectCode(N: Extract.getNode());
4160
4161 return true;
4162}
4163
4164// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
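// For example, "(x >> 4) & 0xFFF" extracts 12 bits starting at bit 4; with TBM
// this can be "bextr $0x0C04, %edi, %eax" (control = shift | (length << 8)).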
4165MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4166 MVT NVT = Node->getSimpleValueType(ResNo: 0);
4167 SDLoc dl(Node);
4168
4169 SDValue N0 = Node->getOperand(Num: 0);
4170 SDValue N1 = Node->getOperand(Num: 1);
4171
4172 // If we have TBM we can use an immediate for the control. If we have BMI
4173 // we should only do this if the BEXTR instruction is implemented well.
4174 // Otherwise moving the control into a register makes this more costly.
4175 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4176 // hoisting the move immediate would make it worthwhile with a less optimal
4177 // BEXTR?
4178 bool PreferBEXTR =
4179 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4180 if (!PreferBEXTR && !Subtarget->hasBMI2())
4181 return nullptr;
4182
4183 // Must have a shift right.
4184 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4185 return nullptr;
4186
4187 // Shift can't have additional users.
4188 if (!N0->hasOneUse())
4189 return nullptr;
4190
4191 // Only supported for 32 and 64 bits.
4192 if (NVT != MVT::i32 && NVT != MVT::i64)
4193 return nullptr;
4194
4195 // Shift amount and RHS of and must be constant.
4196 auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
4197 auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
4198 if (!MaskCst || !ShiftCst)
4199 return nullptr;
4200
4201 // And RHS must be a mask.
4202 uint64_t Mask = MaskCst->getZExtValue();
4203 if (!isMask_64(Value: Mask))
4204 return nullptr;
4205
4206 uint64_t Shift = ShiftCst->getZExtValue();
4207 uint64_t MaskSize = llvm::popcount(Value: Mask);
4208
4209 // Don't interfere with something that can be handled by extracting AH.
4210 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4211 if (Shift == 8 && MaskSize == 8)
4212 return nullptr;
4213
4214 // Make sure we are only using bits that were in the original value, not
4215 // shifted in.
4216 if (Shift + MaskSize > NVT.getSizeInBits())
4217 return nullptr;
4218
4219 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4220 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4221 // does not fit into 32 bits. Load folding is not a sufficient reason.
4222 if (!PreferBEXTR && MaskSize <= 32)
4223 return nullptr;
4224
4225 SDValue Control;
4226 unsigned ROpc, MOpc;
4227
4228#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4229 if (!PreferBEXTR) {
4230 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4231 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4232 // Let's perform the mask first, and apply shift later. Note that we need to
4233 // widen the mask to account for the fact that we'll apply shift afterwards!
4234 Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
4235 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4236 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4237 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4238 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4239 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4240 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4241 } else {
4242 // The 'control' of BEXTR has the pattern of:
4243 // [15...8 bit][ 7...0 bit] location
4244 // [ bit count][ shift] name
4245    // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4246 Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
4247 if (Subtarget->hasTBM()) {
4248 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4249 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4250 } else {
4251 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4252      // BMI requires the immediate to be placed in a register.
4253 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4254 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4255 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4256 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4257 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4258 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4259 }
4260 }
4261
4262 MachineSDNode *NewNode;
4263 SDValue Input = N0->getOperand(Num: 0);
4264 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4265 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4266 SDValue Ops[] = {
4267 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
4268 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
4269 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4270 // Update the chain.
4271 ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
4272 // Record the mem-refs
4273 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
4274 } else {
4275 NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control);
4276 }
4277
4278 if (!PreferBEXTR) {
4279 // We still need to apply the shift.
4280 SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
4281 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4282 : GET_ND_IF_ENABLED(X86::SHR32ri);
4283 NewNode =
4284 CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
4285 }
4286
4287 return NewNode;
4288}
4289
4290// Emit a PCMPISTR(I/M) instruction.
4291MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4292 bool MayFoldLoad, const SDLoc &dl,
4293 MVT VT, SDNode *Node) {
4294 SDValue N0 = Node->getOperand(Num: 0);
4295 SDValue N1 = Node->getOperand(Num: 1);
4296 SDValue Imm = Node->getOperand(Num: 2);
4297 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4298 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4299
4300 // Try to fold a load. No need to check alignment.
4301 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4302 if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4303 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4304 N1.getOperand(i: 0) };
4305 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other);
4306 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4307 // Update the chain.
4308 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
4309 // Record the mem-refs
4310 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
4311 return CNode;
4312 }
4313
4314 SDValue Ops[] = { N0, N1, Imm };
4315 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32);
4316 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4317 return CNode;
4318}
4319
4320// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4321// to emit a second instruction after this one. This is needed since we have two
4322// copyToReg nodes glued before this and we need to continue that glue through.
4323MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4324 bool MayFoldLoad, const SDLoc &dl,
4325 MVT VT, SDNode *Node,
4326 SDValue &InGlue) {
4327 SDValue N0 = Node->getOperand(Num: 0);
4328 SDValue N2 = Node->getOperand(Num: 2);
4329 SDValue Imm = Node->getOperand(Num: 4);
4330 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4331 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4332
4333 // Try to fold a load. No need to check alignment.
4334 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4335 if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4336 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4337 N2.getOperand(i: 0), InGlue };
4338 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
4339 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4340 InGlue = SDValue(CNode, 3);
4341 // Update the chain.
4342 ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
4343 // Record the mem-refs
4344 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
4345 return CNode;
4346 }
4347
4348 SDValue Ops[] = { N0, N2, Imm, InGlue };
4349 SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue);
4350 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4351 InGlue = SDValue(CNode, 2);
4352 return CNode;
4353}
4354
4355bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4356 EVT VT = N->getValueType(ResNo: 0);
4357
4358 // Only handle scalar shifts.
4359 if (VT.isVector())
4360 return false;
4361
4362 // Narrower shifts only mask to 5 bits in hardware.
4363 unsigned Size = VT == MVT::i64 ? 64 : 32;
4364
4365 SDValue OrigShiftAmt = N->getOperand(Num: 1);
4366 SDValue ShiftAmt = OrigShiftAmt;
4367 SDLoc DL(N);
4368
4369 // Skip over a truncate of the shift amount.
4370 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4371 ShiftAmt = ShiftAmt->getOperand(Num: 0);
4372
4373 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4374 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4375
4376 SDValue NewShiftAmt;
4377 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4378 ShiftAmt->getOpcode() == ISD::XOR) {
4379 SDValue Add0 = ShiftAmt->getOperand(Num: 0);
4380 SDValue Add1 = ShiftAmt->getOperand(Num: 1);
4381 auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
4382 auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
4383 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4384 // to avoid the ADD/SUB/XOR.
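    // For example, a 64-bit "x << (amt + 64)" can be selected as "x << amt"
    // because the hardware shift only uses the low 6 bits of the count.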
4385 if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
4386 NewShiftAmt = Add0;
4387
4388 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4389 ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
4390 (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
4391 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4392 // we can replace it with a NOT. In the XOR case it may save some code
4393 // size, in the SUB case it also may save a move.
4394 assert(Add0C == nullptr || Add1C == nullptr);
4395
4396 // We can only do N-X, not X-N
4397 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4398 return false;
4399
4400 EVT OpVT = ShiftAmt.getValueType();
4401
4402 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
4403 NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
4404 N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
4405 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
4406 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4407 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4408 // -X to generate a NEG instead of a SUB of a constant.
4409 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4410 Add0C->getZExtValue() != 0) {
4411 EVT SubVT = ShiftAmt.getValueType();
4412 SDValue X;
4413 if (Add0C->getZExtValue() % Size == 0)
4414 X = Add1;
4415 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4416 Add0C->getZExtValue() % 32 == 0) {
4417 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4418 // This is mainly beneficial if we already compute (x+n*32).
4419 if (Add1.getOpcode() == ISD::TRUNCATE) {
4420 Add1 = Add1.getOperand(i: 0);
4421 SubVT = Add1.getValueType();
4422 }
4423 if (Add0.getValueType() != SubVT) {
4424 Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
4425 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
4426 }
4427
4428 X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
4429 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
4430 } else
4431 return false;
4432 // Insert a negate op.
4433    // TODO: This isn't guaranteed to replace the SUB if it also feeds a logic
4434    // cone that isn't a shift.
4435 SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
4436 SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
4437 NewShiftAmt = Neg;
4438
4439 // Insert these operands into a valid topological order so they can
4440 // get selected independently.
4441 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
4442 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
4443 } else
4444 return false;
4445 } else
4446 return false;
4447
4448 if (NewShiftAmt.getValueType() != MVT::i8) {
4449 // Need to truncate the shift amount.
4450 NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt);
4451 // Add to a correct topological ordering.
4452 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4453 }
4454
4455 // Insert a new mask to keep the shift amount legal. This should be removed
4456 // by isel patterns.
4457 NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt,
4458 N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8));
4459 // Place in a correct topological ordering.
4460 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4461
4462 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
4463 Op2: NewShiftAmt);
4464 if (UpdatedNode != N) {
4465 // If we found an existing node, we should replace ourselves with that node
4466 // and wait for it to be selected after its other users.
4467 ReplaceNode(F: N, T: UpdatedNode);
4468 return true;
4469 }
4470
4471 // If the original shift amount is now dead, delete it so that we don't run
4472 // it through isel.
4473 if (OrigShiftAmt.getNode()->use_empty())
4474 CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());
4475
4476 // Now that we've optimized the shift amount, defer to normal isel to get
4477 // load folding and legacy vs BMI2 selection without repeating it here.
4478 SelectCode(N);
4479 return true;
4480}
4481
4482bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4483 MVT NVT = N->getSimpleValueType(ResNo: 0);
4484 unsigned Opcode = N->getOpcode();
4485 SDLoc dl(N);
4486
4487 // For operations of the form (x << C1) op C2, check if we can use a smaller
4488 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
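  // For example, (or i32 (shl x, 8), 0x1200) becomes (shl (or x, 0x12), 8):
  // 0x12 fits a sign-extended 8-bit immediate encoding while 0x1200 does not.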
4489 SDValue Shift = N->getOperand(Num: 0);
4490 SDValue N1 = N->getOperand(Num: 1);
4491
4492 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
4493 if (!Cst)
4494 return false;
4495
4496 int64_t Val = Cst->getSExtValue();
4497
4498 // If we have an any_extend feeding the AND, look through it to see if there
4499 // is a shift behind it. But only if the AND doesn't use the extended bits.
4500  // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
4501 bool FoundAnyExtend = false;
4502 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4503 Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
4504 isUInt<32>(x: Val)) {
4505 FoundAnyExtend = true;
4506 Shift = Shift.getOperand(i: 0);
4507 }
4508
4509 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4510 return false;
4511
4512 // i8 is unshrinkable, i16 should be promoted to i32.
4513 if (NVT != MVT::i32 && NVT != MVT::i64)
4514 return false;
4515
4516 auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
4517 if (!ShlCst)
4518 return false;
4519
4520 uint64_t ShAmt = ShlCst->getZExtValue();
4521
4522 // Make sure that we don't change the operation by removing bits.
4523 // This only matters for OR and XOR, AND is unaffected.
4524 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4525 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4526 return false;
4527
4528 // Check the minimum bitwidth for the new constant.
4529 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4530 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4531 if (Opcode == ISD::AND) {
4532 // AND32ri is the same as AND64ri32 with zext imm.
4533 // Try this before sign extended immediates below.
4534 ShiftedVal = (uint64_t)Val >> ShAmt;
4535 if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
4536 return true;
4537 // Also swap order when the AND can become MOVZX.
4538 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4539 return true;
4540 }
4541 ShiftedVal = Val >> ShAmt;
4542 if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
4543 (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
4544 return true;
4545 if (Opcode != ISD::AND) {
4546 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4547 ShiftedVal = (uint64_t)Val >> ShAmt;
4548 if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
4549 return true;
4550 }
4551 return false;
4552 };
4553
4554 int64_t ShiftedVal;
4555 if (!CanShrinkImmediate(ShiftedVal))
4556 return false;
4557
4558 // Ok, we can reorder to get a smaller immediate.
4559
4560  // But it's possible the original immediate allowed the AND to become MOVZX.
4561  // We do this check late so that the MaskedValueIsZero call happens as late
4562  // as possible.
4563 if (Opcode == ISD::AND) {
4564 // Find the smallest zext this could possibly be.
4565 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4566 ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));
4567
4568 // Figure out which bits need to be zero to achieve that mask.
4569 APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
4570 loBitsSet: ZExtWidth);
4571 NeededMask &= ~Cst->getAPIntValue();
4572
4573 if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
4574 return false;
4575 }
4576
4577 SDValue X = Shift.getOperand(i: 0);
4578 if (FoundAnyExtend) {
4579 SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
4580 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
4581 X = NewX;
4582 }
4583
4584 SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT);
4585 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
4586 SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
4587 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
4588 SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
4589 N2: Shift.getOperand(i: 1));
4590 ReplaceNode(F: N, T: NewSHL.getNode());
4591 SelectCode(N: NewSHL.getNode());
4592 return true;
4593}
4594
4595bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4596 SDNode *ParentB, SDNode *ParentC,
4597 SDValue A, SDValue B, SDValue C,
4598 uint8_t Imm) {
4599 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4600 C.isOperandOf(ParentC) && "Incorrect parent node");
4601
4602 auto tryFoldLoadOrBCast =
4603 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4604 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4605 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4606 return true;
4607
4608 // Not a load, check for broadcast which may be behind a bitcast.
4609 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4610 P = L.getNode();
4611 L = L.getOperand(i: 0);
4612 }
4613
4614 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4615 return false;
4616
4617 // Only 32 and 64 bit broadcasts are supported.
4618 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4619 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4620 if (Size != 32 && Size != 64)
4621 return false;
4622
4623 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
4624 };
4625
4626 bool FoldedLoad = false;
4627 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4628 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4629 FoldedLoad = true;
4630 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4631 Tmp4)) {
4632 FoldedLoad = true;
4633 std::swap(a&: A, b&: C);
4634 // Swap bits 1/4 and 3/6.
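    // The immediate is a truth table indexed by (A << 2) | (B << 1) | C, so
    // exchanging A and C swaps the entries whose A and C index bits differ:
    // bit 1 (0b001) with bit 4 (0b100) and bit 3 (0b011) with bit 6 (0b110).
    // Bits 0, 2, 5 and 7 (the 0xa5 mask) are unchanged.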
4635 uint8_t OldImm = Imm;
4636 Imm = OldImm & 0xa5;
4637 if (OldImm & 0x02) Imm |= 0x10;
4638 if (OldImm & 0x10) Imm |= 0x02;
4639 if (OldImm & 0x08) Imm |= 0x40;
4640 if (OldImm & 0x40) Imm |= 0x08;
4641 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4642 Tmp4)) {
4643 FoldedLoad = true;
4644 std::swap(a&: B, b&: C);
4645 // Swap bits 1/2 and 5/6.
4646 uint8_t OldImm = Imm;
4647 Imm = OldImm & 0x99;
4648 if (OldImm & 0x02) Imm |= 0x04;
4649 if (OldImm & 0x04) Imm |= 0x02;
4650 if (OldImm & 0x20) Imm |= 0x40;
4651 if (OldImm & 0x40) Imm |= 0x20;
4652 }
4653
4654 SDLoc DL(Root);
4655
4656 SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
4657
4658 MVT NVT = Root->getSimpleValueType(ResNo: 0);
4659
4660 MachineSDNode *MNode;
4661 if (FoldedLoad) {
4662 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
4663
4664 unsigned Opc;
4665 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4666 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
4667 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4668 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4669
4670 bool UseD = EltSize == 32;
4671 if (NVT.is128BitVector())
4672 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4673 else if (NVT.is256BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4675 else if (NVT.is512BitVector())
4676 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4677 else
4678 llvm_unreachable("Unexpected vector size!");
4679 } else {
4680 bool UseD = NVT.getVectorElementType() == MVT::i32;
4681 if (NVT.is128BitVector())
4682 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4683 else if (NVT.is256BitVector())
4684 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4685 else if (NVT.is512BitVector())
4686 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4687 else
4688 llvm_unreachable("Unexpected vector size!");
4689 }
4690
4691 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
4692 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);
4693
4694 // Update the chain.
4695 ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
4696 // Record the mem-refs
4697 CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
4698 } else {
4699 bool UseD = NVT.getVectorElementType() == MVT::i32;
4700 unsigned Opc;
4701 if (NVT.is128BitVector())
4702 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4703 else if (NVT.is256BitVector())
4704 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4705 else if (NVT.is512BitVector())
4706 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4707 else
4708 llvm_unreachable("Unexpected vector size!");
4709
4710 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
4711 }
4712
4713 ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
4714 CurDAG->RemoveDeadNode(N: Root);
4715 return true;
4716}
4717
4718// Try to match two logic ops to a VPTERNLOG.
4719// FIXME: Handle more complex patterns that use an operand more than once?
4720bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4721 MVT NVT = N->getSimpleValueType(ResNo: 0);
4722
4723 // Make sure we support VPTERNLOG.
4724 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4725 NVT.getVectorElementType() == MVT::i1)
4726 return false;
4727
4728 // We need VLX for 128/256-bit.
4729 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4730 return false;
4731
4732 auto getFoldableLogicOp = [](SDValue Op) {
4733 // Peek through single use bitcast.
4734 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4735 Op = Op.getOperand(i: 0);
4736
4737 if (!Op.hasOneUse())
4738 return SDValue();
4739
4740 unsigned Opc = Op.getOpcode();
4741 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4742 Opc == X86ISD::ANDNP)
4743 return Op;
4744
4745 return SDValue();
4746 };
4747
4748 SDValue N0, N1, A, FoldableOp;
4749
4750  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree.
4751 auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
4752 if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
4753 ISD::isBuildVectorAllOnes(N: Op->getOperand(Num: 1).getNode())) {
4754 SDValue InnerOp = getFoldableLogicOp(Op->getOperand(Num: 0));
4755
4756 if (!InnerOp)
4757 return SDValue();
4758
4759 N0 = InnerOp.getOperand(i: 0);
4760 N1 = InnerOp.getOperand(i: 1);
4761 if ((FoldableOp = getFoldableLogicOp(N1))) {
4762 A = N0;
4763 return InnerOp;
4764 }
4765 if ((FoldableOp = getFoldableLogicOp(N0))) {
4766 A = N1;
4767 return InnerOp;
4768 }
4769 }
4770 return SDValue();
4771 };
4772
4773 bool PeeledOuterNot = false;
4774 SDNode *OriN = N;
4775 if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
4776 PeeledOuterNot = true;
4777 N = InnerOp.getNode();
4778 } else {
4779 N0 = N->getOperand(Num: 0);
4780 N1 = N->getOperand(Num: 1);
4781
4782 if ((FoldableOp = getFoldableLogicOp(N1)))
4783 A = N0;
4784 else if ((FoldableOp = getFoldableLogicOp(N0)))
4785 A = N1;
4786 else
4787 return false;
4788 }
4789
4790 SDValue B = FoldableOp.getOperand(i: 0);
4791 SDValue C = FoldableOp.getOperand(i: 1);
4792 SDNode *ParentA = N;
4793 SDNode *ParentB = FoldableOp.getNode();
4794 SDNode *ParentC = FoldableOp.getNode();
4795
4796 // We can build the appropriate control immediate by performing the logic
4797 // operation we're matching using these constants for A, B, and C.
4798 uint8_t TernlogMagicA = 0xf0;
4799 uint8_t TernlogMagicB = 0xcc;
4800 uint8_t TernlogMagicC = 0xaa;
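  // Under the (A << 2) | (B << 1) | C indexing of the immediate, these are the
  // truth tables of A, B and C themselves. For example, matching
  // (and A, (xor B, C)) produces Imm = 0xf0 & (0xcc ^ 0xaa) = 0x60.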
4801
4802 // Some of the inputs may be inverted, peek through them and invert the
4803 // magic values accordingly.
4804 // TODO: There may be a bitcast before the xor that we should peek through.
4805 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4806 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4807 ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
4808 Magic = ~Magic;
4809 Parent = Op.getNode();
4810 Op = Op.getOperand(i: 0);
4811 }
4812 };
4813
4814 PeekThroughNot(A, ParentA, TernlogMagicA);
4815 PeekThroughNot(B, ParentB, TernlogMagicB);
4816 PeekThroughNot(C, ParentC, TernlogMagicC);
4817
4818 uint8_t Imm;
4819 switch (FoldableOp.getOpcode()) {
4820 default: llvm_unreachable("Unexpected opcode!");
4821 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4822 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4823 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4824 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4825 }
4826
4827 switch (N->getOpcode()) {
4828 default: llvm_unreachable("Unexpected opcode!");
4829 case X86ISD::ANDNP:
4830 if (A == N0)
4831 Imm &= ~TernlogMagicA;
4832 else
4833 Imm = ~(Imm) & TernlogMagicA;
4834 break;
4835 case ISD::AND: Imm &= TernlogMagicA; break;
4836 case ISD::OR: Imm |= TernlogMagicA; break;
4837 case ISD::XOR: Imm ^= TernlogMagicA; break;
4838 }
4839
4840 if (PeeledOuterNot)
4841 Imm = ~Imm;
4842
4843 return matchVPTERNLOG(Root: OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
4844}
4845
4846/// If the high bits of an 'and' operand are known zero, try setting the
4847/// high bits of an 'and' constant operand to produce a smaller encoding by
4848/// creating a small, sign-extended negative immediate rather than a large
4849/// positive one. This reverses a transform in SimplifyDemandedBits that
4850/// shrinks mask constants by clearing bits. There is also a possibility that
4851/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4852/// case, just replace the 'and'. Return 'true' if the node is replaced.
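/// For example, if the top 4 bits of x are known zero, (and i32 x, 0x0ffffff0)
/// can instead use the mask 0xfffffff0 (-16), which encodes as a sign-extended
/// 8-bit immediate.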
4853bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4854 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4855 // have immediate operands.
4856 MVT VT = And->getSimpleValueType(ResNo: 0);
4857 if (VT != MVT::i32 && VT != MVT::i64)
4858 return false;
4859
4860 auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
4861 if (!And1C)
4862 return false;
4863
4864  // Bail out if the mask constant is already negative; it can't shrink any further.
4865 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4866 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4867 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4868 // are negative too.
4869 APInt MaskVal = And1C->getAPIntValue();
4870 unsigned MaskLZ = MaskVal.countl_zero();
4871 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4872 return false;
4873
4874 // Don't extend into the upper 32 bits of a 64 bit mask.
4875 if (VT == MVT::i64 && MaskLZ >= 32) {
4876 MaskLZ -= 32;
4877 MaskVal = MaskVal.trunc(width: 32);
4878 }
4879
4880 SDValue And0 = And->getOperand(Num: 0);
4881 APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
4882 APInt NegMaskVal = MaskVal | HighZeros;
4883
4884 // If a negative constant would not allow a smaller encoding, there's no need
4885 // to continue. Only change the constant when we know it's a win.
4886 unsigned MinWidth = NegMaskVal.getSignificantBits();
4887 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4888 return false;
4889
4890 // Extend masks if we truncated above.
4891 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4892 NegMaskVal = NegMaskVal.zext(width: 64);
4893 HighZeros = HighZeros.zext(width: 64);
4894 }
4895
4896 // The variable operand must be all zeros in the top bits to allow using the
4897 // new, negative constant as the mask.
4898 // TODO: Handle constant folding?
4899 KnownBits Known0 = CurDAG->computeKnownBits(Op: And0);
4900 if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero))
4901 return false;
4902
4903 // Check if the mask is -1. In that case, this is an unnecessary instruction
4904 // that escaped earlier analysis.
4905 if (NegMaskVal.isAllOnes()) {
4906 ReplaceNode(F: And, T: And0.getNode());
4907 return true;
4908 }
4909
4910 // A negative mask allows a smaller encoding. Create a new 'and' node.
4911 SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
4912 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
4913 SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
4914 ReplaceNode(F: And, T: NewAnd.getNode());
4915 SelectCode(N: NewAnd.getNode());
4916 return true;
4917}
4918
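// Pick the VPTESTM/VPTESTNM opcode for the compare element type and operand
// form: rmb when a broadcast was folded, rm when a load was folded, rr
// otherwise. The k-suffixed variants are used when the result is masked.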
4919static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4920 bool FoldedBCast, bool Masked) {
4921#define VPTESTM_CASE(VT, SUFFIX) \
4922case MVT::VT: \
4923 if (Masked) \
4924 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4925 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4926
4927
4928#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4929default: llvm_unreachable("Unexpected VT!"); \
4930VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4931VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4932VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4933VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4934VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4935VPTESTM_CASE(v8i64, QZ##SUFFIX)
4936
4937#define VPTESTM_FULL_CASES(SUFFIX) \
4938VPTESTM_BROADCAST_CASES(SUFFIX) \
4939VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4940VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4941VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4942VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4943VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4944VPTESTM_CASE(v32i16, WZ##SUFFIX)
4945
4946 if (FoldedBCast) {
4947 switch (TestVT.SimpleTy) {
4948 VPTESTM_BROADCAST_CASES(rmb)
4949 }
4950 }
4951
4952 if (FoldedLoad) {
4953 switch (TestVT.SimpleTy) {
4954 VPTESTM_FULL_CASES(rm)
4955 }
4956 }
4957
4958 switch (TestVT.SimpleTy) {
4959 VPTESTM_FULL_CASES(rr)
4960 }
4961
4962#undef VPTESTM_FULL_CASES
4963#undef VPTESTM_BROADCAST_CASES
4964#undef VPTESTM_CASE
4965}
4966
4967// Try to create VPTESTM instruction. If InMask is not null, it will be used
4968// to form a masked operation.
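// (setcc (and X, Y), 0, ne) sets each mask bit when X & Y is nonzero in that
// element, which is exactly VPTESTM; SETEQ gives the negated test, VPTESTNM.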
4969bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4970 SDValue InMask) {
4971 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4972 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4973 "Unexpected VT!");
4974
4975 // Look for equal and not equal compares.
4976 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
4977 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4978 return false;
4979
4980 SDValue SetccOp0 = Setcc.getOperand(i: 0);
4981 SDValue SetccOp1 = Setcc.getOperand(i: 1);
4982
4983 // Canonicalize the all zero vector to the RHS.
4984 if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
4985 std::swap(a&: SetccOp0, b&: SetccOp1);
4986
4987 // See if we're comparing against zero.
4988 if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
4989 return false;
4990
4991 SDValue N0 = SetccOp0;
4992
4993 MVT CmpVT = N0.getSimpleValueType();
4994 MVT CmpSVT = CmpVT.getVectorElementType();
4995
4996 // Start with both operands the same. We'll try to refine this.
4997 SDValue Src0 = N0;
4998 SDValue Src1 = N0;
4999
5000 {
5001 // Look through single use bitcasts.
5002 SDValue N0Temp = N0;
5003 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
5004 N0Temp = N0.getOperand(i: 0);
5005
5006 // Look for single use AND.
5007 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
5008 Src0 = N0Temp.getOperand(i: 0);
5009 Src1 = N0Temp.getOperand(i: 1);
5010 }
5011 }
5012
5013 // Without VLX we need to widen the operation.
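  // AVX512F only provides the 512-bit forms; narrower compares are widened to
  // 512 bits here and the resulting mask register is shrunk back afterwards.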
5014 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
5015
5016 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
5017 SDValue &Base, SDValue &Scale, SDValue &Index,
5018 SDValue &Disp, SDValue &Segment) {
5019 // If we need to widen, we can't fold the load.
5020 if (!Widen)
5021 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
5022 return true;
5023
5024    // If we didn't fold a load, try to match a broadcast. Widening is not a
5025    // limitation here, but only 32 and 64 bit element types are supported.
5026 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
5027 return false;
5028
5029 // Look through single use bitcasts.
5030 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
5031 P = L.getNode();
5032 L = L.getOperand(i: 0);
5033 }
5034
5035 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
5036 return false;
5037
5038 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
5039 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
5040 return false;
5041
5042 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
5043 };
5044
5045 // We can only fold loads if the sources are unique.
5046 bool CanFoldLoads = Src0 != Src1;
5047
5048 bool FoldedLoad = false;
5049 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5050 if (CanFoldLoads) {
5051 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5052 Tmp3, Tmp4);
5053 if (!FoldedLoad) {
5054 // And is commutative.
5055 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5056 Tmp2, Tmp3, Tmp4);
5057 if (FoldedLoad)
5058 std::swap(a&: Src0, b&: Src1);
5059 }
5060 }
5061
5062 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5063
5064 bool IsMasked = InMask.getNode() != nullptr;
5065
5066 SDLoc dl(Root);
5067
5068 MVT ResVT = Setcc.getSimpleValueType();
5069 MVT MaskVT = ResVT;
5070 if (Widen) {
5071 // Widen the inputs using insert_subreg or copy_to_regclass.
5072 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5073 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5074 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5075 CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
5076 MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts);
5077 SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl,
5078 VT: CmpVT), 0);
5079 Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);
5080
5081 if (!FoldedBCast)
5082 Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);
5083
5084 if (IsMasked) {
5085 // Widen the mask.
5086 unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
5087 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5088 InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5089 dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
5090 }
5091 }
5092
5093 bool IsTestN = CC == ISD::SETEQ;
5094 unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5095 Masked: IsMasked);
5096
5097 MachineSDNode *CNode;
5098 if (FoldedLoad) {
5099 SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other);
5100
5101 if (IsMasked) {
5102 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5103 Src1.getOperand(i: 0) };
5104 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5105 } else {
5106 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5107 Src1.getOperand(i: 0) };
5108 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5109 }
5110
5111 // Update the chain.
5112 ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
5113 // Record the mem-refs
5114 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
5115 } else {
5116 if (IsMasked)
5117 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
5118 else
5119 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
5120 }
5121
5122 // If we widened, we need to shrink the mask VT.
5123 if (Widen) {
5124 unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
5125 SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
5126 CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
5127 dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
5128 }
5129
5130 ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
5131 CurDAG->RemoveDeadNode(N: Root);
5132 return true;
5133}
5134
5135// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5136// into vpternlog.
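// The 0xCA control byte is the truth table of (A & B) | (~A & C):
// (0xf0 & 0xcc) | (~0xf0 & 0xaa) == 0xCA.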
5137bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5138 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5139
5140 MVT NVT = N->getSimpleValueType(ResNo: 0);
5141
5142 // Make sure we support VPTERNLOG.
5143 if (!NVT.isVector() || !Subtarget->hasAVX512())
5144 return false;
5145
5146 // We need VLX for 128/256-bit.
5147 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5148 return false;
5149
5150 SDValue N0 = N->getOperand(Num: 0);
5151 SDValue N1 = N->getOperand(Num: 1);
5152
5153 // Canonicalize AND to LHS.
5154 if (N1.getOpcode() == ISD::AND)
5155 std::swap(a&: N0, b&: N1);
5156
5157 if (N0.getOpcode() != ISD::AND ||
5158 N1.getOpcode() != X86ISD::ANDNP ||
5159 !N0.hasOneUse() || !N1.hasOneUse())
5160 return false;
5161
5162  // ANDN is not commutative, so its operand order pins down A and C.
5163 SDValue A = N1.getOperand(i: 0);
5164 SDValue C = N1.getOperand(i: 1);
5165
5166  // AND is commutative: if one operand matches A, the other operand is B.
5167  // Otherwise this isn't a match.
5168 SDValue B;
5169 if (N0.getOperand(i: 0) == A)
5170 B = N0.getOperand(i: 1);
5171 else if (N0.getOperand(i: 1) == A)
5172 B = N0.getOperand(i: 0);
5173 else
5174 return false;
5175
5176 SDLoc dl(N);
5177 SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8);
5178 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5179 ReplaceNode(F: N, T: Ternlog.getNode());
5180
5181 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5182 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5183}
5184
5185void X86DAGToDAGISel::Select(SDNode *Node) {
5186 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5187 unsigned Opcode = Node->getOpcode();
5188 SDLoc dl(Node);
5189
5190 if (Node->isMachineOpcode()) {
5191 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5192 Node->setNodeId(-1);
5193 return; // Already selected.
5194 }
5195
5196 switch (Opcode) {
5197 default: break;
5198 case ISD::INTRINSIC_W_CHAIN: {
5199 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5200 switch (IntNo) {
5201 default: break;
5202 case Intrinsic::x86_encodekey128:
5203 case Intrinsic::x86_encodekey256: {
5204 if (!Subtarget->hasKL())
5205 break;
5206
5207 unsigned Opcode;
5208 switch (IntNo) {
5209 default: llvm_unreachable("Impossible intrinsic");
5210 case Intrinsic::x86_encodekey128:
5211 Opcode = X86::ENCODEKEY128;
5212 break;
5213 case Intrinsic::x86_encodekey256:
5214 Opcode = X86::ENCODEKEY256;
5215 break;
5216 }
5217
5218 SDValue Chain = Node->getOperand(Num: 0);
5219 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3),
5220 Glue: SDValue());
5221 if (Opcode == X86::ENCODEKEY256)
5222 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4),
5223 Glue: Chain.getValue(R: 1));
5224
5225 MachineSDNode *Res = CurDAG->getMachineNode(
5226 Opcode, dl, VTs: Node->getVTList(),
5227 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5228 ReplaceNode(F: Node, T: Res);
5229 return;
5230 }
5231 case Intrinsic::x86_tileloaddrs64_internal:
5232 case Intrinsic::x86_tileloaddrst164_internal:
5233 if (!Subtarget->hasAMXMOVRS())
5234 break;
5235 [[fallthrough]];
5236 case Intrinsic::x86_tileloadd64_internal:
5237 case Intrinsic::x86_tileloaddt164_internal: {
5238 if (!Subtarget->hasAMXTILE())
5239 break;
5240 auto *MFI =
5241 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5242 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5243 unsigned Opc;
5244 switch (IntNo) {
5245 default:
5246 llvm_unreachable("Unexpected intrinsic!");
5247 case Intrinsic::x86_tileloaddrs64_internal:
5248 Opc = X86::PTILELOADDRSV;
5249 break;
5250 case Intrinsic::x86_tileloaddrst164_internal:
5251 Opc = X86::PTILELOADDRST1V;
5252 break;
5253 case Intrinsic::x86_tileloadd64_internal:
5254 Opc = X86::PTILELOADDV;
5255 break;
5256 case Intrinsic::x86_tileloaddt164_internal:
5257 Opc = X86::PTILELOADDT1V;
5258 break;
5259 }
5260 // _tile_loadd_internal(row, col, buf, STRIDE)
5261 SDValue Base = Node->getOperand(Num: 4);
5262 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5263 SDValue Index = Node->getOperand(Num: 5);
5264 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5265 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5266 SDValue Chain = Node->getOperand(Num: 0);
5267 MachineSDNode *CNode;
5268 SDValue Ops[] = {Node->getOperand(Num: 2),
5269 Node->getOperand(Num: 3),
5270 Base,
5271 Scale,
5272 Index,
5273 Disp,
5274 Segment,
5275 Chain};
5276 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops);
5277 ReplaceNode(F: Node, T: CNode);
5278 return;
5279 }
5280 }
5281 break;
5282 }
5283 case ISD::INTRINSIC_VOID: {
5284 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5285 switch (IntNo) {
5286 default: break;
5287 case Intrinsic::x86_sse3_monitor:
5288 case Intrinsic::x86_monitorx:
5289 case Intrinsic::x86_clzero: {
5290 bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64;
5291
5292 unsigned Opc = 0;
5293 switch (IntNo) {
5294 default: llvm_unreachable("Unexpected intrinsic!");
5295 case Intrinsic::x86_sse3_monitor:
5296 if (!Subtarget->hasSSE3())
5297 break;
5298 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5299 break;
5300 case Intrinsic::x86_monitorx:
5301 if (!Subtarget->hasMWAITX())
5302 break;
5303 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5304 break;
5305 case Intrinsic::x86_clzero:
5306 if (!Subtarget->hasCLZERO())
5307 break;
5308 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5309 break;
5310 }
5311
5312 if (Opc) {
5313 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5314 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5315 N: Node->getOperand(Num: 2), Glue: SDValue());
5316 SDValue InGlue = Chain.getValue(R: 1);
5317
5318 if (IntNo == Intrinsic::x86_sse3_monitor ||
5319 IntNo == Intrinsic::x86_monitorx) {
5320 // Copy the other two operands to ECX and EDX.
5321 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3),
5322 Glue: InGlue);
5323 InGlue = Chain.getValue(R: 1);
5324 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4),
5325 Glue: InGlue);
5326 InGlue = Chain.getValue(R: 1);
5327 }
5328
5329 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other,
5330 Ops: { Chain, InGlue});
5331 ReplaceNode(F: Node, T: CNode);
5332 return;
5333 }
5334
5335 break;
5336 }
5337 case Intrinsic::x86_tilestored64_internal: {
5338 auto *MFI =
5339 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5340 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5341 unsigned Opc = X86::PTILESTOREDV;
5342 // _tile_stored_internal(row, col, buf, STRIDE, c)
5343 SDValue Base = Node->getOperand(Num: 4);
5344 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5345 SDValue Index = Node->getOperand(Num: 5);
5346 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5347 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5348 SDValue Chain = Node->getOperand(Num: 0);
5349 MachineSDNode *CNode;
5350 SDValue Ops[] = {Node->getOperand(Num: 2),
5351 Node->getOperand(Num: 3),
5352 Base,
5353 Scale,
5354 Index,
5355 Disp,
5356 Segment,
5357 Node->getOperand(Num: 6),
5358 Chain};
5359 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5360 ReplaceNode(F: Node, T: CNode);
5361 return;
5362 }
5363 case Intrinsic::x86_tileloaddrs64:
5364 case Intrinsic::x86_tileloaddrst164:
5365 if (!Subtarget->hasAMXMOVRS())
5366 break;
5367 [[fallthrough]];
5368 case Intrinsic::x86_tileloadd64:
5369 case Intrinsic::x86_tileloaddt164:
5370 case Intrinsic::x86_tilestored64: {
5371 if (!Subtarget->hasAMXTILE())
5372 break;
5373 auto *MFI =
5374 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5375 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5376 unsigned Opc;
5377 switch (IntNo) {
5378 default: llvm_unreachable("Unexpected intrinsic!");
5379 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5380 case Intrinsic::x86_tileloaddrs64:
5381 Opc = X86::PTILELOADDRS;
5382 break;
5383 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5384 case Intrinsic::x86_tileloaddrst164:
5385 Opc = X86::PTILELOADDRST1;
5386 break;
5387 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5388 }
5389 // FIXME: Match displacement and scale.
5390 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5391 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5392 SDValue Base = Node->getOperand(Num: 3);
5393 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5394 SDValue Index = Node->getOperand(Num: 4);
5395 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5396 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5397 SDValue Chain = Node->getOperand(Num: 0);
5398 MachineSDNode *CNode;
5399 if (Opc == X86::PTILESTORED) {
5400 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5401 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5402 } else {
5403 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5404 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5405 }
5406 ReplaceNode(F: Node, T: CNode);
5407 return;
5408 }
5409 }
5410 break;
5411 }
5412 case ISD::BRIND:
5413 case X86ISD::NT_BRIND: {
5414 if (Subtarget->isTarget64BitILP32()) {
5415 // Converts a 32-bit register to a 64-bit, zero-extended version of
5416 // it. This is needed because x86-64 can do many things, but jmp %r32
5417 // ain't one of them.
5418 SDValue Target = Node->getOperand(Num: 1);
5419 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5420 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64);
5421 SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other,
5422 N1: Node->getOperand(Num: 0), N2: ZextTarget);
5423 ReplaceNode(F: Node, T: Brind.getNode());
5424 SelectCode(N: ZextTarget.getNode());
5425 SelectCode(N: Brind.getNode());
5426 return;
5427 }
5428 break;
5429 }
5430 case X86ISD::GlobalBaseReg:
5431 ReplaceNode(F: Node, T: getGlobalBaseReg());
5432 return;
5433
5434 case ISD::BITCAST:
5435 // Just drop all 128/256/512-bit bitcasts.
5436 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5437 NVT == MVT::f128) {
5438 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5439 CurDAG->RemoveDeadNode(N: Node);
5440 return;
5441 }
5442 break;
5443
5444 case ISD::SRL:
5445 if (matchBitExtract(Node))
5446 return;
5447 [[fallthrough]];
5448 case ISD::SRA:
5449 case ISD::SHL:
5450 if (tryShiftAmountMod(N: Node))
5451 return;
5452 break;
5453
5454 case X86ISD::VPTERNLOG: {
5455 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5456 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5457 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5458 return;
5459 break;
5460 }
5461
5462 case X86ISD::ANDNP:
5463 if (tryVPTERNLOG(N: Node))
5464 return;
5465 break;
5466
5467 case ISD::AND:
5468 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5469 // Try to form a masked VPTESTM. Operands can be in either order.
5470 SDValue N0 = Node->getOperand(Num: 0);
5471 SDValue N1 = Node->getOperand(Num: 1);
5472 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5473 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5474 return;
5475 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5476 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5477 return;
5478 }
5479
5480 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5481 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5482 CurDAG->RemoveDeadNode(N: Node);
5483 return;
5484 }
5485 if (matchBitExtract(Node))
5486 return;
5487 if (AndImmShrink && shrinkAndImmediate(And: Node))
5488 return;
5489
5490 [[fallthrough]];
5491 case ISD::OR:
5492 case ISD::XOR:
5493 if (tryShrinkShlLogicImm(N: Node))
5494 return;
5495 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5496 return;
5497 if (tryVPTERNLOG(N: Node))
5498 return;
5499
5500 [[fallthrough]];
5501 case ISD::ADD:
5502 if (Opcode == ISD::ADD && matchBitExtract(Node))
5503 return;
5504 [[fallthrough]];
5505 case ISD::SUB: {
5506    // Try to avoid folding immediates with multiple uses for optsize.
5507    // This code tries to select the register form directly to avoid going
5508    // through the isel table, which might fold the immediate. We can't change
5509    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
5510    // check the immediate's use count without making the patterns unavailable
5511    // to the fast-isel table.
5512 if (!CurDAG->shouldOptForSize())
5513 break;
5514
5515 // Only handle i8/i16/i32/i64.
5516 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5517 break;
5518
5519 SDValue N0 = Node->getOperand(Num: 0);
5520 SDValue N1 = Node->getOperand(Num: 1);
5521
5522 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5523 if (!Cst)
5524 break;
5525
5526 int64_t Val = Cst->getSExtValue();
5527
5528    // Make sure it's an immediate that is considered foldable.
5529 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5530 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5531 break;
5532
5533 // If this can match to INC/DEC, let it go.
5534 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5535 break;
5536
5537 // Check if we should avoid folding this immediate.
5538 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5539 break;
5540
5541 // We should not fold the immediate. So we need a register form instead.
5542 unsigned ROpc, MOpc;
5543 switch (NVT.SimpleTy) {
5544 default: llvm_unreachable("Unexpected VT!");
5545 case MVT::i8:
5546 switch (Opcode) {
5547 default: llvm_unreachable("Unexpected opcode!");
5548 case ISD::ADD:
5549 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5550 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5551 break;
5552 case ISD::SUB:
5553 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5554 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5555 break;
5556 case ISD::AND:
5557 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5558 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5559 break;
5560 case ISD::OR:
5561 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5562 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5563 break;
5564 case ISD::XOR:
5565 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5566 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5567 break;
5568 }
5569 break;
5570 case MVT::i16:
5571 switch (Opcode) {
5572 default: llvm_unreachable("Unexpected opcode!");
5573 case ISD::ADD:
5574 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5575 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5576 break;
5577 case ISD::SUB:
5578 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5579 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5580 break;
5581 case ISD::AND:
5582 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5583 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5584 break;
5585 case ISD::OR:
5586 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5587 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5588 break;
5589 case ISD::XOR:
5590 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5591 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5592 break;
5593 }
5594 break;
5595 case MVT::i32:
5596 switch (Opcode) {
5597 default: llvm_unreachable("Unexpected opcode!");
5598 case ISD::ADD:
5599 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5600 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5601 break;
5602 case ISD::SUB:
5603 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5604 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5605 break;
5606 case ISD::AND:
5607 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5608 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5609 break;
5610 case ISD::OR:
5611 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5612 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5613 break;
5614 case ISD::XOR:
5615 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5616 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5617 break;
5618 }
5619 break;
5620 case MVT::i64:
5621 switch (Opcode) {
5622 default: llvm_unreachable("Unexpected opcode!");
5623 case ISD::ADD:
5624 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5625 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5626 break;
5627 case ISD::SUB:
5628 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5629 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5630 break;
5631 case ISD::AND:
5632 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5633 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5634 break;
5635 case ISD::OR:
5636 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5637 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5638 break;
5639 case ISD::XOR:
5640 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5641 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5642 break;
5643 }
5644 break;
5645 }
5646
5647    // Ok, this is an AND/OR/XOR/ADD/SUB with a constant.
5648
5649    // If this is not a subtract, we can still try to fold a load.
5650 if (Opcode != ISD::SUB) {
5651 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5652 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5653 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5654 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5655 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5656 // Update the chain.
5657 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5658 // Record the mem-refs
5659 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5660 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5661 CurDAG->RemoveDeadNode(N: Node);
5662 return;
5663 }
5664 }
5665
5666 CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1);
5667 return;
5668 }
5669
5670 case X86ISD::SMUL:
5671 // i16/i32/i64 are handled with isel patterns.
5672 if (NVT != MVT::i8)
5673 break;
5674 [[fallthrough]];
5675 case X86ISD::UMUL: {
5676 SDValue N0 = Node->getOperand(Num: 0);
5677 SDValue N1 = Node->getOperand(Num: 1);
5678
5679 unsigned LoReg, ROpc, MOpc;
5680 switch (NVT.SimpleTy) {
5681 default: llvm_unreachable("Unsupported VT!");
5682 case MVT::i8:
5683 LoReg = X86::AL;
5684 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5685 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5686 break;
5687 case MVT::i16:
5688 LoReg = X86::AX;
5689 ROpc = X86::MUL16r;
5690 MOpc = X86::MUL16m;
5691 break;
5692 case MVT::i32:
5693 LoReg = X86::EAX;
5694 ROpc = X86::MUL32r;
5695 MOpc = X86::MUL32m;
5696 break;
5697 case MVT::i64:
5698 LoReg = X86::RAX;
5699 ROpc = X86::MUL64r;
5700 MOpc = X86::MUL64m;
5701 break;
5702 }
5703
5704 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5705 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5706 // Multiply is commutative.
5707 if (!FoldedLoad) {
5708 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5709 if (FoldedLoad)
5710 std::swap(a&: N0, b&: N1);
5711 }
5712
5713 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5714 N: N0, Glue: SDValue()).getValue(R: 1);
5715
5716 MachineSDNode *CNode;
5717 if (FoldedLoad) {
5718 // i16/i32/i64 use an instruction that produces a low and high result even
5719 // though only the low result is used.
5720 SDVTList VTs;
5721 if (NVT == MVT::i8)
5722 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5723 else
5724 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other);
5725
5726 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5727 InGlue };
5728 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5729
5730 // Update the chain.
5731 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5732 // Record the mem-refs
5733 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5734 } else {
5735 // i16/i32/i64 use an instruction that produces a low and high result even
5736 // though only the low result is used.
5737 SDVTList VTs;
5738 if (NVT == MVT::i8)
5739 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32);
5740 else
5741 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32);
5742
5743 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5744 }
5745
5746 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5747 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5748 CurDAG->RemoveDeadNode(N: Node);
5749 return;
5750 }
5751
5752 case ISD::SMUL_LOHI:
5753 case ISD::UMUL_LOHI: {
5754 SDValue N0 = Node->getOperand(Num: 0);
5755 SDValue N1 = Node->getOperand(Num: 1);
5756
5757 unsigned Opc, MOpc;
5758 unsigned LoReg, HiReg;
5759 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5760 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5761 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
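    // MULX (BMI2) takes its implicit source in EDX/RDX, writes both product
    // halves to explicit destinations and does not modify EFLAGS. The
    // MULX32H/MULX64H forms are used when only the high half is live.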
5762 switch (NVT.SimpleTy) {
5763 default: llvm_unreachable("Unsupported VT!");
5764 case MVT::i32:
5765 Opc = UseMULXHi ? X86::MULX32Hrr
5766 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5767 : IsSigned ? X86::IMUL32r
5768 : X86::MUL32r;
5769 MOpc = UseMULXHi ? X86::MULX32Hrm
5770 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5771 : IsSigned ? X86::IMUL32m
5772 : X86::MUL32m;
5773 LoReg = UseMULX ? X86::EDX : X86::EAX;
5774 HiReg = X86::EDX;
5775 break;
5776 case MVT::i64:
5777 Opc = UseMULXHi ? X86::MULX64Hrr
5778 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5779 : IsSigned ? X86::IMUL64r
5780 : X86::MUL64r;
5781 MOpc = UseMULXHi ? X86::MULX64Hrm
5782 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5783 : IsSigned ? X86::IMUL64m
5784 : X86::MUL64m;
5785 LoReg = UseMULX ? X86::RDX : X86::RAX;
5786 HiReg = X86::RDX;
5787 break;
5788 }
5789
5790 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5791 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5792 // Multiply is commutative.
5793 if (!foldedLoad) {
5794 foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5795 if (foldedLoad)
5796 std::swap(a&: N0, b&: N1);
5797 }
5798
5799 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5800 N: N0, Glue: SDValue()).getValue(R: 1);
5801 SDValue ResHi, ResLo;
5802 if (foldedLoad) {
5803 SDValue Chain;
5804 MachineSDNode *CNode = nullptr;
5805 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5806 InGlue };
5807 if (UseMULXHi) {
5808 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
5809 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5810 ResHi = SDValue(CNode, 0);
5811 Chain = SDValue(CNode, 1);
5812 } else if (UseMULX) {
5813 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other);
5814 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5815 ResHi = SDValue(CNode, 0);
5816 ResLo = SDValue(CNode, 1);
5817 Chain = SDValue(CNode, 2);
5818 } else {
5819 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
5820 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5821 Chain = SDValue(CNode, 0);
5822 InGlue = SDValue(CNode, 1);
5823 }
5824
5825 // Update the chain.
5826 ReplaceUses(F: N1.getValue(R: 1), T: Chain);
5827 // Record the mem-refs
5828 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5829 } else {
5830 SDValue Ops[] = { N1, InGlue };
5831 if (UseMULXHi) {
5832 SDVTList VTs = CurDAG->getVTList(VT: NVT);
5833 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5834 ResHi = SDValue(CNode, 0);
5835 } else if (UseMULX) {
5836 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
5837 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5838 ResHi = SDValue(CNode, 0);
5839 ResLo = SDValue(CNode, 1);
5840 } else {
5841 SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue);
5842 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5843 InGlue = SDValue(CNode, 0);
5844 }
5845 }
5846
5847 // Copy the low half of the result, if it is needed.
5848 if (!SDValue(Node, 0).use_empty()) {
5849 if (!ResLo) {
5850 assert(LoReg && "Register for low half is not defined!");
5851 ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5852 VT: NVT, Glue: InGlue);
5853 InGlue = ResLo.getValue(R: 2);
5854 }
5855 ReplaceUses(F: SDValue(Node, 0), T: ResLo);
5856 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5857 dbgs() << '\n');
5858 }
5859 // Copy the high half of the result, if it is needed.
5860 if (!SDValue(Node, 1).use_empty()) {
5861 if (!ResHi) {
5862 assert(HiReg && "Register for high half is not defined!");
5863 ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
5864 VT: NVT, Glue: InGlue);
5865 InGlue = ResHi.getValue(R: 2);
5866 }
5867 ReplaceUses(F: SDValue(Node, 1), T: ResHi);
5868 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5869 dbgs() << '\n');
5870 }
5871
5872 CurDAG->RemoveDeadNode(N: Node);
5873 return;
5874 }
5875
5876 case ISD::SDIVREM:
5877 case ISD::UDIVREM: {
5878 SDValue N0 = Node->getOperand(Num: 0);
5879 SDValue N1 = Node->getOperand(Num: 1);
5880
5881 unsigned ROpc, MOpc;
5882 bool isSigned = Opcode == ISD::SDIVREM;
5883 if (!isSigned) {
5884 switch (NVT.SimpleTy) {
5885 default: llvm_unreachable("Unsupported VT!");
5886 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5887 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5888 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5889 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5890 }
5891 } else {
5892 switch (NVT.SimpleTy) {
5893 default: llvm_unreachable("Unsupported VT!");
5894 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5895 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5896 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5897 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5898 }
5899 }
5900
5901 unsigned LoReg, HiReg, ClrReg;
5902 unsigned SExtOpcode;
5903 switch (NVT.SimpleTy) {
5904 default: llvm_unreachable("Unsupported VT!");
5905 case MVT::i8:
5906 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5907 SExtOpcode = 0; // Not used.
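      // For i8 the dividend lives entirely in AX, so there is no separate
      // high-register setup; the special case below widens the operand into
      // AX directly.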
5908 break;
5909 case MVT::i16:
5910 LoReg = X86::AX; HiReg = X86::DX;
5911 ClrReg = X86::DX;
5912 SExtOpcode = X86::CWD;
5913 break;
5914 case MVT::i32:
5915 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5916 SExtOpcode = X86::CDQ;
5917 break;
5918 case MVT::i64:
5919 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5920 SExtOpcode = X86::CQO;
5921 break;
5922 }
5923
5924 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5925 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5926 bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);
5927
5928 SDValue InGlue;
5929 if (NVT == MVT::i8) {
5930      // Special case for div8: widen the dividend into AX with a sign- or
5931      // zero-extending move so the upper 8 bits (AH) are set up correctly.
5932 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5933 MachineSDNode *Move;
5934 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5935 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5936 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5937 : X86::MOVZX16rm8;
5938 Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops);
5939 Chain = SDValue(Move, 1);
5940 ReplaceUses(F: N0.getValue(R: 1), T: Chain);
5941 // Record the mem-refs
5942 CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5943 } else {
5944 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5945 : X86::MOVZX16rr8;
5946 Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0);
5947 Chain = CurDAG->getEntryNode();
5948 }
5949 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0),
5950 Glue: SDValue());
5951 InGlue = Chain.getValue(R: 1);
5952 } else {
5953 InGlue =
5954 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
5955 Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
5956 if (isSigned && !signBitIsZero) {
5957 // Sign extend the low part into the high part.
5958 InGlue =
5959 SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0);
5960 } else {
5961 // Zero out the high part, effectively zero extending the input.
5962 SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
5963 SDValue ClrNode =
5964 SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
5965 switch (NVT.SimpleTy) {
5966 case MVT::i16:
5967 ClrNode =
5968 SDValue(CurDAG->getMachineNode(
5969 Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode,
5970 Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl,
5971 VT: MVT::i32)),
5972 0);
5973 break;
5974 case MVT::i32:
5975 break;
5976 case MVT::i64:
5977 ClrNode =
5978 SDValue(CurDAG->getMachineNode(
5979 Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64,
5980 Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode,
5981 Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl,
5982 VT: MVT::i32)),
5983 0);
5984 break;
5985 default:
5986 llvm_unreachable("Unexpected division source");
5987 }
5988
5989 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
5990 N: ClrNode, Glue: InGlue).getValue(R: 1);
5991 }
5992 }
5993
5994 if (foldedLoad) {
5995 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5996 InGlue };
5997 MachineSDNode *CNode =
5998 CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops);
5999 InGlue = SDValue(CNode, 1);
6000 // Update the chain.
6001 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
6002 // Record the mem-refs
6003 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
6004 } else {
6005 InGlue =
6006 SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0);
6007 }
6008
6009 // Prevent use of AH in a REX instruction by explicitly copying it to
6010 // an ABCD_L register.
6011 //
6012 // The current assumption of the register allocator is that isel
6013 // won't generate explicit references to the GR8_ABCD_H registers. If
6014 // the allocator and/or the backend get enhanced to be more robust in
6015 // that regard, this can be, and should be, removed.
6016 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6017 SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8);
6018 unsigned AHExtOpcode =
6019 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6020
6021 SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32,
6022 VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue);
6023 SDValue Result(RNode, 0);
6024 InGlue = SDValue(RNode, 1);
6025
6026 Result =
6027 CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result);
6028
6029 ReplaceUses(F: SDValue(Node, 1), T: Result);
6030 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6031 dbgs() << '\n');
6032 }
6033 // Copy the division (low) result, if it is needed.
6034 if (!SDValue(Node, 0).use_empty()) {
6035 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
6036 Reg: LoReg, VT: NVT, Glue: InGlue);
6037 InGlue = Result.getValue(R: 2);
6038 ReplaceUses(F: SDValue(Node, 0), T: Result);
6039 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6040 dbgs() << '\n');
6041 }
6042 // Copy the remainder (high) result, if it is needed.
6043 if (!SDValue(Node, 1).use_empty()) {
6044 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
6045 Reg: HiReg, VT: NVT, Glue: InGlue);
6046 InGlue = Result.getValue(R: 2);
6047 ReplaceUses(F: SDValue(Node, 1), T: Result);
6048 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6049 dbgs() << '\n');
6050 }
6051 CurDAG->RemoveDeadNode(N: Node);
6052 return;
6053 }
6054
6055 case X86ISD::FCMP:
6056 case X86ISD::STRICT_FCMP:
6057 case X86ISD::STRICT_FCMPS: {
6058 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6059 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6060 SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
6061 SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);
6062
6063 // Save the original VT of the compare.
6064 MVT CmpVT = N0.getSimpleValueType();
6065
6066 // Floating point needs special handling if we don't have FCOMI.
6067 if (Subtarget->canUseCMOV())
6068 break;
6069
6070 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6071
6072 unsigned Opc;
6073 switch (CmpVT.SimpleTy) {
6074 default: llvm_unreachable("Unexpected type!");
6075 case MVT::f32:
6076 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6077 break;
6078 case MVT::f64:
6079 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6080 break;
6081 case MVT::f80:
6082 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6083 break;
6084 }
6085
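     // Without FCOMI the compare result is only available in the x87 status
     // word (C0/C2/C3). Emit (U)COM, store FPSW to AX with FNSTSW, and use
     // SAHF to move AH into EFLAGS, where the condition bits become CF/PF/ZF.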
6086 SDValue Chain =
6087 IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
6088 SDValue Glue;
6089 if (IsStrictCmp) {
6090 SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
6091 Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
6092 Glue = Chain.getValue(R: 1);
6093 } else {
6094 Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0);
6095 }
6096
6097 // Move FPSW to AX.
6098 SDValue FNSTSW =
6099 SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0);
6100
6101     // Extract the upper 8 bits of AX.
6102 SDValue Extract =
6103 CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW);
6104
6105 // Move AH into flags.
6106 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6107 assert(Subtarget->canUseLAHFSAHF() &&
6108 "Target doesn't support SAHF or FCOMI?");
6109 SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue());
6110 Chain = AH;
6111 SDValue SAHF = SDValue(
6112 CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0);
6113
6114 if (IsStrictCmp)
6115 ReplaceUses(F: SDValue(Node, 1), T: Chain);
6116
6117 ReplaceUses(F: SDValue(Node, 0), T: SAHF);
6118 CurDAG->RemoveDeadNode(N: Node);
6119 return;
6120 }
6121
6122 case X86ISD::CMP: {
6123 SDValue N0 = Node->getOperand(Num: 0);
6124 SDValue N1 = Node->getOperand(Num: 1);
6125
6126 // Optimizations for TEST compares.
6127 if (!isNullConstant(V: N1))
6128 break;
6129
6130 // Save the original VT of the compare.
6131 MVT CmpVT = N0.getSimpleValueType();
6132
6133     // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6134 // by a test instruction. The test should be removed later by
6135 // analyzeCompare if we are using only the zero flag.
6136 // TODO: Should we check the users and use the BEXTR flags directly?
6137 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6138 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
6139 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6140 : X86::TEST32rr;
6141 SDValue BEXTR = SDValue(NewNode, 0);
6142 NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR);
6143 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6144 CurDAG->RemoveDeadNode(N: Node);
6145 return;
6146 }
6147 }
6148
6149 // We can peek through truncates, but we need to be careful below.
6150 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6151 N0 = N0.getOperand(i: 0);
6152
6153 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6154 // use a smaller encoding.
6155 // Look past the truncate if CMP is the only use of it.
6156 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6157 N0.getValueType() != MVT::i8) {
6158 auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
6159 if (!MaskC)
6160 break;
6161
6162 // We may have looked through a truncate so mask off any bits that
6163 // shouldn't be part of the compare.
6164 uint64_t Mask = MaskC->getZExtValue();
6165 Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());
6166
6167 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6168 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6169 // zero flag.
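       // e.g. for (X & 0xFFFFFFFF00000000) == 0 we emit "shrq $32, %rax" plus
       // a (later removable) "testq %rax, %rax" instead of materializing the
       // 64-bit mask with a movabsq.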
6170 if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) &&
6171 onlyUsesZeroFlag(Flags: SDValue(Node, 0))) {
6172 unsigned ShiftOpcode = ISD::DELETED_NODE;
6173 unsigned ShiftAmt;
6174 unsigned SubRegIdx;
6175 MVT SubRegVT;
6176 unsigned TestOpcode;
6177 unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
6178 unsigned TrailingZeros = llvm::countr_zero(Val: Mask);
6179
6180 // With leading/trailing zeros, the transform is profitable if we can
6181 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6182 // incurring any extra register moves.
6183 bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
6184 if (LeadingZeros == 0 && SavesBytes) {
6185 // If the mask covers the most significant bit, then we can replace
6186 // TEST+AND with a SHR and check eflags.
6187 // This emits a redundant TEST which is subsequently eliminated.
6188 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6189 ShiftAmt = TrailingZeros;
6190 SubRegIdx = 0;
6191 TestOpcode = X86::TEST64rr;
6192 } else if (TrailingZeros == 0 && SavesBytes) {
6193 // If the mask covers the least significant bit, then we can replace
6194 // TEST+AND with a SHL and check eflags.
6195 // This emits a redundant TEST which is subsequently eliminated.
6196 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6197 ShiftAmt = LeadingZeros;
6198 SubRegIdx = 0;
6199 TestOpcode = X86::TEST64rr;
6200 } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
6201 // If the shifted mask extends into the high half and is 8/16/32 bits
6202 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6203 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6204 if (PopCount == 8) {
6205 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6206 ShiftAmt = TrailingZeros;
6207 SubRegIdx = X86::sub_8bit;
6208 SubRegVT = MVT::i8;
6209 TestOpcode = X86::TEST8rr;
6210 } else if (PopCount == 16) {
6211 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6212 ShiftAmt = TrailingZeros;
6213 SubRegIdx = X86::sub_16bit;
6214 SubRegVT = MVT::i16;
6215 TestOpcode = X86::TEST16rr;
6216 } else if (PopCount == 32) {
6217 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6218 ShiftAmt = TrailingZeros;
6219 SubRegIdx = X86::sub_32bit;
6220 SubRegVT = MVT::i32;
6221 TestOpcode = X86::TEST32rr;
6222 }
6223 }
6224 if (ShiftOpcode != ISD::DELETED_NODE) {
6225 SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64);
6226 SDValue Shift = SDValue(
6227 CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32,
6228 Op1: N0.getOperand(i: 0), Op2: ShiftC),
6229 0);
6230 if (SubRegIdx != 0) {
6231 Shift =
6232 CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
6233 }
6234 MachineSDNode *Test =
6235 CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift);
6236 ReplaceNode(F: Node, T: Test);
6237 return;
6238 }
6239 }
6240
6241 MVT VT;
6242 int SubRegOp;
6243 unsigned ROpc, MOpc;
6244
6245 // For each of these checks we need to be careful if the sign flag is
6246 // being used. It is only safe to use the sign flag in two conditions,
6247 // either the sign bit in the shrunken mask is zero or the final test
6248 // size is equal to the original compare size.
6249
6250 if (isUInt<8>(x: Mask) &&
6251 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6252 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6253 // For example, convert "testl %eax, $8" to "testb %al, $8"
6254 VT = MVT::i8;
6255 SubRegOp = X86::sub_8bit;
6256 ROpc = X86::TEST8ri;
6257 MOpc = X86::TEST8mi;
6258 } else if (OptForMinSize && isUInt<16>(x: Mask) &&
6259 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6260 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6261 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6262 // NOTE: We only want to form TESTW instructions if optimizing for
6263 // min size. Otherwise we only save one byte and possibly get a length
6264 // changing prefix penalty in the decoders.
6265 VT = MVT::i16;
6266 SubRegOp = X86::sub_16bit;
6267 ROpc = X86::TEST16ri;
6268 MOpc = X86::TEST16mi;
6269 } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 &&
6270 ((!(Mask & 0x80000000) &&
6271                  // Without minsize, 16-bit compares can get here, so make
6272                  // sure we compute the correct sign flag if it is needed.
6273 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6274 CmpVT == MVT::i32 ||
6275 hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
6276 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6277       // NOTE: We only want to run this transform if N0 is 32 or 64 bits.
6278       // Otherwise, we find ourselves in a position where we have to do
6279 // promotion. If previous passes did not promote the and, we assume
6280 // they had a good reason not to and do not promote here.
6281 VT = MVT::i32;
6282 SubRegOp = X86::sub_32bit;
6283 ROpc = X86::TEST32ri;
6284 MOpc = X86::TEST32mi;
6285 } else {
6286 // No eligible transformation was found.
6287 break;
6288 }
6289
6290 SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
6291 SDValue Reg = N0.getOperand(i: 0);
6292
6293 // Emit a testl or testw.
6294 MachineSDNode *NewNode;
6295 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6296 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
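       // A volatile/atomic load must keep its original access width; if the
       // chosen TEST width differs from the load width, give up rather than
       // change the size of the memory access.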
6297 if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
6298 if (!LoadN->isSimple()) {
6299 unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
6300 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6301 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6302 (MOpc == X86::TEST32mi && NumVolBits != 32))
6303 break;
6304 }
6305 }
6306 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6307 Reg.getOperand(i: 0) };
6308 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops);
6309 // Update the chain.
6310 ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
6311 // Record the mem-refs
6312 CurDAG->setNodeMemRefs(N: NewNode,
6313 NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
6314 } else {
6315 // Extract the subregister if necessary.
6316 if (N0.getValueType() != VT)
6317 Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);
6318
6319 NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm);
6320 }
6321 // Replace CMP with TEST.
6322 ReplaceNode(F: Node, T: NewNode);
6323 return;
6324 }
6325 break;
6326 }
6327 case X86ISD::PCMPISTR: {
6328 if (!Subtarget->hasSSE42())
6329 break;
6330
6331 bool NeedIndex = !SDValue(Node, 0).use_empty();
6332 bool NeedMask = !SDValue(Node, 1).use_empty();
6333 // We can't fold a load if we are going to make two instructions.
6334 bool MayFoldLoad = !NeedIndex || !NeedMask;
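     // PCMPISTRI returns the index in ECX and PCMPISTRM returns the mask in
     // XMM0; if both results are live we have to emit both instruction forms,
     // and the memory operand could only be folded into one of them.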
6335
6336 MachineSDNode *CNode;
6337 if (NeedMask) {
6338 unsigned ROpc =
6339 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6340 unsigned MOpc =
6341 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6342 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node);
6343 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6344 }
6345 if (NeedIndex || !NeedMask) {
6346 unsigned ROpc =
6347 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6348 unsigned MOpc =
6349 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6350 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node);
6351 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6352 }
6353
6354 // Connect the flag usage to the last instruction created.
6355 ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6356 CurDAG->RemoveDeadNode(N: Node);
6357 return;
6358 }
6359 case X86ISD::PCMPESTR: {
6360 if (!Subtarget->hasSSE42())
6361 break;
6362
6363     // Copy the two explicit length operands into their implicit EAX/EDX inputs.
6364 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX,
6365 N: Node->getOperand(Num: 1),
6366 Glue: SDValue()).getValue(R: 1);
6367 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX,
6368 N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1);
6369
6370 bool NeedIndex = !SDValue(Node, 0).use_empty();
6371 bool NeedMask = !SDValue(Node, 1).use_empty();
6372 // We can't fold a load if we are going to make two instructions.
6373 bool MayFoldLoad = !NeedIndex || !NeedMask;
6374
6375 MachineSDNode *CNode;
6376 if (NeedMask) {
6377 unsigned ROpc =
6378 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6379 unsigned MOpc =
6380 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6381 CNode =
6382 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue);
6383 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
6384 }
6385 if (NeedIndex || !NeedMask) {
6386 unsigned ROpc =
6387 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6388 unsigned MOpc =
6389 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6390 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue);
6391 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
6392 }
6393 // Connect the flag usage to the last instruction created.
6394 ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
6395 CurDAG->RemoveDeadNode(N: Node);
6396 return;
6397 }
6398
6399 case ISD::SETCC: {
6400 if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue()))
6401 return;
6402
6403 break;
6404 }
6405
6406 case ISD::STORE:
6407 if (foldLoadStoreIntoMemOperand(Node))
6408 return;
6409 break;
6410
6411 case X86ISD::SETCC_CARRY: {
6412 MVT VT = Node->getSimpleValueType(ResNo: 0);
6413 SDValue Result;
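     // SETCC_CARRY materializes 0 or all-ones from the carry flag, i.e. an
     // "sbb reg, reg". If the CPU treats that idiom as dependency-breaking we
     // can emit SETB_C directly; otherwise getSBBZero() feeds the SBB with an
     // explicitly zeroed register.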
6414 if (Subtarget->hasSBBDepBreaking()) {
6415 // We have to do this manually because tblgen will put the eflags copy in
6416 // the wrong place if we use an extract_subreg in the pattern.
6417 // Copy flags to the EFLAGS register and glue it to next node.
6418 SDValue EFLAGS =
6419 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
6420 N: Node->getOperand(Num: 1), Glue: SDValue());
6421
6422 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6423 // 32-bit version.
6424 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6425 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6426 Result = SDValue(
6427 CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)),
6428 0);
6429 } else {
6430 // The target does not recognize sbb with the same reg operand as a
6431 // no-source idiom, so we explicitly zero the input values.
6432 Result = getSBBZero(N: Node);
6433 }
6434
6435 // For less than 32-bits we need to extract from the 32-bit node.
6436 if (VT == MVT::i8 || VT == MVT::i16) {
6437 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6438 Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6439 }
6440
6441 ReplaceUses(F: SDValue(Node, 0), T: Result);
6442 CurDAG->RemoveDeadNode(N: Node);
6443 return;
6444 }
6445 case X86ISD::SBB: {
6446 if (isNullConstant(V: Node->getOperand(Num: 0)) &&
6447 isNullConstant(V: Node->getOperand(Num: 1))) {
6448 SDValue Result = getSBBZero(N: Node);
6449
6450 // Replace the flag use.
6451 ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1));
6452
6453 // Replace the result use.
6454 if (!SDValue(Node, 0).use_empty()) {
6455 // For less than 32-bits we need to extract from the 32-bit node.
6456 MVT VT = Node->getSimpleValueType(ResNo: 0);
6457 if (VT == MVT::i8 || VT == MVT::i16) {
6458 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6459 Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
6460 }
6461 ReplaceUses(F: SDValue(Node, 0), T: Result);
6462 }
6463
6464 CurDAG->RemoveDeadNode(N: Node);
6465 return;
6466 }
6467 break;
6468 }
6469 case X86ISD::MGATHER: {
6470 auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node);
6471 SDValue IndexOp = Mgt->getIndex();
6472 SDValue Mask = Mgt->getMask();
6473 MVT IndexVT = IndexOp.getSimpleValueType();
6474 MVT ValueVT = Node->getSimpleValueType(ResNo: 0);
6475 MVT MaskVT = Mask.getSimpleValueType();
6476
6477     // This is just to prevent crashes if the nodes are malformed somehow. We
6478     // otherwise only do loose type checking here, the same kind of checking a
6479     // type constraint would do in table-based isel.
6480 if (!ValueVT.isVector() || !MaskVT.isVector())
6481 break;
6482
6483 unsigned NumElts = ValueVT.getVectorNumElements();
6484 MVT ValueSVT = ValueVT.getVectorElementType();
6485
6486 bool IsFP = ValueSVT.isFloatingPoint();
6487 unsigned EltSize = ValueSVT.getSizeInBits();
6488
6489 unsigned Opc = 0;
6490 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
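     // The opcode name encodes the index width (D = dword, Q = qword indices),
     // the element type (PS/PD for FP, D/Q for integer) and the vector width
     // (Z128/Z256/Z for the EVEX forms, plain/Y for the VEX-encoded AVX2 forms).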
6491 if (AVX512Gather) {
6492 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6493 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6494 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6495 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6496 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6497 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6498 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6499 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6500 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6501 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6502 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6503 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6504 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6505 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6506 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6507 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6508 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6509 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6510 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6511 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6512 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6513 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6514 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6515 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6516 } else {
6517 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6518 "Unexpected mask VT!");
6519 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6520 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6521 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6522 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6523 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6524 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6525 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6526 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6527 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6528 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6529 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6530 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6531 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6532 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6533 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6534 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6535 }
6536
6537 if (!Opc)
6538 break;
6539
6540 SDValue Base, Scale, Index, Disp, Segment;
6541 if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(),
6542 Base, Scale, Index, Disp, Segment))
6543 break;
6544
6545 SDValue PassThru = Mgt->getPassThru();
6546 SDValue Chain = Mgt->getChain();
6547     // Gather machine instructions also define the mask register, which the ISD node does not model.
6548 SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other);
6549
6550 MachineSDNode *NewNode;
6551 if (AVX512Gather) {
6552 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6553 Index, Disp, Segment, Chain};
6554 NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6555 } else {
6556 SDValue Ops[] = {PassThru, Base, Scale, Index,
6557 Disp, Segment, Mask, Chain};
6558 NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6559 }
6560 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()});
6561 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
6562 ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2));
6563 CurDAG->RemoveDeadNode(N: Node);
6564 return;
6565 }
6566 case X86ISD::MSCATTER: {
6567 auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node);
6568 SDValue Value = Sc->getValue();
6569 SDValue IndexOp = Sc->getIndex();
6570 MVT IndexVT = IndexOp.getSimpleValueType();
6571 MVT ValueVT = Value.getSimpleValueType();
6572
6573     // This is just to prevent crashes if the nodes are malformed somehow. We
6574     // otherwise only do loose type checking here, the same kind of checking a
6575     // type constraint would do in table-based isel.
6576 if (!ValueVT.isVector())
6577 break;
6578
6579 unsigned NumElts = ValueVT.getVectorNumElements();
6580 MVT ValueSVT = ValueVT.getVectorElementType();
6581
6582 bool IsFP = ValueSVT.isFloatingPoint();
6583 unsigned EltSize = ValueSVT.getSizeInBits();
6584
6585 unsigned Opc;
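     // Scatter only exists in EVEX (AVX-512) form; pick the opcode from the
     // index width (D/Q), element type (PS/PD/D/Q) and vector width
     // (Z128/Z256/Z), mirroring the gather selection above.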
6586 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6587 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6588 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6589 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6590 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6591 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6592 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6593 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6594 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6595 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6596 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6597 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6598 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6599 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6600 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6601 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6602 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6603 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6604 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6605 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6606 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6607 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6608 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6609 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6610 else
6611 break;
6612
6613 SDValue Base, Scale, Index, Disp, Segment;
6614 if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(),
6615 Base, Scale, Index, Disp, Segment))
6616 break;
6617
6618 SDValue Mask = Sc->getMask();
6619 SDValue Chain = Sc->getChain();
6620     // Scatter machine instructions also define the mask register, which the ISD node does not model.
6621 SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other);
6622 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6623
6624 MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6625 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()});
6626 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1));
6627 CurDAG->RemoveDeadNode(N: Node);
6628 return;
6629 }
6630 case ISD::PREALLOCATED_SETUP: {
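     // Look up the preallocated id that was recorded for this call site and
     // emit the target-independent PREALLOCATED_SETUP pseudo with it instead
     // of the original call-site value operand.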
6631 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6632 auto CallId = MFI->getPreallocatedIdForCallSite(
6633 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6634 SDValue Chain = Node->getOperand(Num: 0);
6635 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6636 MachineSDNode *New = CurDAG->getMachineNode(
6637 Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain);
6638 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain
6639 CurDAG->RemoveDeadNode(N: Node);
6640 return;
6641 }
6642 case ISD::PREALLOCATED_ARG: {
6643 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6644 auto CallId = MFI->getPreallocatedIdForCallSite(
6645 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6646 SDValue Chain = Node->getOperand(Num: 0);
6647 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6648 SDValue ArgIndex = Node->getOperand(Num: 2);
6649 SDValue Ops[3];
6650 Ops[0] = CallIdValue;
6651 Ops[1] = ArgIndex;
6652 Ops[2] = Chain;
6653 MachineSDNode *New = CurDAG->getMachineNode(
6654 Opcode: TargetOpcode::PREALLOCATED_ARG, dl,
6655 VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()),
6656 VT2: MVT::Other),
6657 Ops);
6658 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer
6659 ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain
6660 CurDAG->RemoveDeadNode(N: Node);
6661 return;
6662 }
6663 case X86ISD::AESENCWIDE128KL:
6664 case X86ISD::AESDECWIDE128KL:
6665 case X86ISD::AESENCWIDE256KL:
6666 case X86ISD::AESDECWIDE256KL: {
6667 if (!Subtarget->hasWIDEKL())
6668 break;
6669
6670 unsigned Opcode;
6671 switch (Node->getOpcode()) {
6672 default:
6673 llvm_unreachable("Unexpected opcode!");
6674 case X86ISD::AESENCWIDE128KL:
6675 Opcode = X86::AESENCWIDE128KL;
6676 break;
6677 case X86ISD::AESDECWIDE128KL:
6678 Opcode = X86::AESDECWIDE128KL;
6679 break;
6680 case X86ISD::AESENCWIDE256KL:
6681 Opcode = X86::AESENCWIDE256KL;
6682 break;
6683 case X86ISD::AESDECWIDE256KL:
6684 Opcode = X86::AESDECWIDE256KL;
6685 break;
6686 }
6687
6688 SDValue Chain = Node->getOperand(Num: 0);
6689 SDValue Addr = Node->getOperand(Num: 1);
6690
6691 SDValue Base, Scale, Index, Disp, Segment;
6692 if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment))
6693 break;
6694
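     // The wide key locker instructions implicitly read and write XMM0-XMM7,
     // so copy the eight data operands into those registers, gluing the copies
     // together so they feed directly into the instruction.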
6695 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2),
6696 Glue: SDValue());
6697 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3),
6698 Glue: Chain.getValue(R: 1));
6699 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4),
6700 Glue: Chain.getValue(R: 1));
6701 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5),
6702 Glue: Chain.getValue(R: 1));
6703 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6),
6704 Glue: Chain.getValue(R: 1));
6705 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7),
6706 Glue: Chain.getValue(R: 1));
6707 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8),
6708 Glue: Chain.getValue(R: 1));
6709 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9),
6710 Glue: Chain.getValue(R: 1));
6711
6712 MachineSDNode *Res = CurDAG->getMachineNode(
6713 Opcode, dl, VTs: Node->getVTList(),
6714 Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)});
6715 CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand());
6716 ReplaceNode(F: Node, T: Res);
6717 return;
6718 }
6719 case X86ISD::POP_FROM_X87_REG: {
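     // This is simply a CopyFromReg from the given x87 register, forwarding
     // the chain and the optional incoming glue operand.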
6720 SDValue Chain = Node->getOperand(Num: 0);
6721 Register Reg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1))->getReg();
6722 SDValue Glue;
6723 if (Node->getNumValues() == 3)
6724 Glue = Node->getOperand(Num: 2);
6725 SDValue Copy =
6726 CurDAG->getCopyFromReg(Chain, dl, Reg, VT: Node->getValueType(ResNo: 0), Glue);
6727 ReplaceNode(F: Node, T: Copy.getNode());
6728 return;
6729 }
6730 }
6731
6732 SelectCode(N: Node);
6733}
6734
6735bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6736 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6737 std::vector<SDValue> &OutOps) {
6738 SDValue Op0, Op1, Op2, Op3, Op4;
6739 switch (ConstraintID) {
6740 default:
6741 llvm_unreachable("Unexpected asm memory constraint");
6742  case InlineAsm::ConstraintCode::o: // offsettable       ??
6743  case InlineAsm::ConstraintCode::v: // not offsettable   ??
6744 case InlineAsm::ConstraintCode::m: // memory
6745 case InlineAsm::ConstraintCode::X:
6746 case InlineAsm::ConstraintCode::p: // address
6747 if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4))
6748 return true;
6749 break;
6750 }
6751
6752 OutOps.push_back(x: Op0);
6753 OutOps.push_back(x: Op1);
6754 OutOps.push_back(x: Op2);
6755 OutOps.push_back(x: Op3);
6756 OutOps.push_back(x: Op4);
6757 return false;
6758}
6759
6760X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6761 : SelectionDAGISelPass(
6762 std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
6763
6764/// This pass converts a legalized DAG into an X86-specific DAG,
6765/// ready for instruction scheduling.
6766FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6767 CodeGenOptLevel OptLevel) {
6768 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6769}
6770