1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
16#include "X86MachineFunctionInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/KnownBits.h"
33#include "llvm/Support/MathExtras.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

// If enabled, AND mask immediates may have extra (known-irrelevant) bits set
// so the constant fits a shorter encoding (e.g. a sign-extended imm8).
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(Val: true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

// If enabled, an aligned any-extending load may be replaced by a wider load
// of the extended type.
static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(Val: true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

// NOTE(review): declared here, defined elsewhere in the backend — shared
// option controlling CET indirect-branch-tracking lowering.
extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;        // Valid when BaseType == RegBase.
    int Base_FrameIndex = 0; // Valid when BaseType == FrameIndexBase.

    unsigned Scale = 1; // Index scale factor (1/2/4/8 on x86).
    SDValue IndexReg;
    int32_t Disp = 0;   // Constant displacement.
    SDValue Segment;    // Optional segment-register override.
    // At most one of the symbolic-displacement fields below may be set;
    // see hasSymbolicDisplacement().
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment; // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
    // If set, the index register value must be negated before use (the
    // matcher saw a subtraction of the index).
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    /// True if any symbolic displacement (global, constant pool, external
    /// symbol, MC symbol, jump table, or block address) has been matched.
    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    /// True if a base (register or frame index) or an index register is set.
    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    /// Set the base to a register (also switches BaseType to RegBase).
    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug dump of all matched fields; fields without a value print "nul".
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
154}
155
156namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             Kind: "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      return SelectionDAGISel::runOnMachineFunction(mf&: MF);
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    // Addressing-mode matching: the match* routines incrementally fill in an
    // X86ISelAddressMode; the select* routines are the ComplexPattern entry
    // points that convert a matched mode into the five memory operands.
    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
                    SDValue &Index, SDValue &Disp, SDValue &Segment,
                    bool HasNDDM = true);
    bool selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
                       SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    /// Convert a fully-matched X86ISelAddressMode into the five operands
    /// (Base, Scale, Index, Disp, Segment) every X86 memory instruction uses,
    /// emitting a NEG of the index register first when the matcher recorded
    /// a negated index.
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(Reg: 0, VT); // Register 0 means "no base".

      Scale = getI8Imm(Imm: AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
#define GET_NDM_IF_ENABLED(OPC) \
  (Subtarget->hasNDD() && Subtarget->hasNDDM() ? OPC##_ND : OPC)
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc;
        switch (VT.SimpleTy) {
        default:
          llvm_unreachable("Unsupported VT!");
        case MVT::i64:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
          break;
        case MVT::i32:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
          break;
        case MVT::i16:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
          break;
        case MVT::i8:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
          break;
        }
        SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32,
                                                     Ops: AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(Reg: 0, VT); // Register 0 means "no index".

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(),
                                              VT: MVT::i32, offset: AM.Disp,
                                              TargetFlags: AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment,
                                             Offset: AM.Disp, TargetFlags: AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp,
                                             TargetFlags: AM.SymbolFlags);
      else
        Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); // No segment override.
    }

    // Utility function to determine whether it is AMX SDNode right after
    // lowering but before ISEL.
    bool isAMXSDNode(SDNode *N) const {
      // Check if N is AMX SDNode:
      // 1. check result type;
      // 2. check operand type;
      for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
        if (N->getValueType(ResNo: Idx) == MVT::x86amx)
          return true;
      }
      for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
        SDValue Op = N->getOperand(Num: Idx);
        if (Op.getValueType() == MVT::x86amx)
          return true;
      }
      return false;
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->users()) {
        // Two qualifying uses are enough to recommend hoisting; stop early.
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(Num: 1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above)
        // Those instruction won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(Val: N);
        if (C && isInt<8>(x: C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(Num: 0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(Num: 1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 Val: OtherOp->getOperand(Num: 1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64);
    }

    /// Compute the lane immediate for VEXTRACTF128/VEXTRACTI128-style nodes
    /// from the element-granular extract index in operand 1.
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 1);
      MVT VecVT = N->getOperand(Num: 0).getSimpleValueType();
      return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Compute the lane immediate for VINSERTF128/VINSERTI128-style nodes
    /// from the element-granular insert index in operand 2.
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 2);
      MVT VecVT = N->getSimpleValueType(ResNo: 0);
      return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Compute the VPERM2x128 immediate that realizes a commuted insertf128.
    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 2);
      MVT VecVT = N->getSimpleValueType(ResNo: 0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL);
    }

    /// Materialize (0 - 0 - carry) for an SBB/SETCC_CARRY node N: moves the
    /// incoming flags into EFLAGS and emits SBB of a zeroed register against
    /// itself, yielding all-ones or all-zeros depending on the carry.
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(ResNo: 0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
      SDValue Zero =
          SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
      if (VT == MVT::i64) {
        // Widen the 32-bit zero to 64 bits via SUBREG_TO_REG.
        Zero = SDValue(
            CurDAG->getMachineNode(
                Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, Op1: Zero,
                Op2: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
                               N: N->getOperand(Num: FlagOpIndex), Glue: SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opcode: Opc, dl, VTs,
                                 Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}),
          0);
    }

    // Helper to detect unneeded and instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = N->getConstantOperandAPInt(Num: 1);

      // Mask already keeps the low Width bits: the AND is a no-op.
      if (Val.countr_one() >= Width)
        return true;

      // Also a no-op if bits the mask clears are already known zero.
      APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
      return Mask.countr_one() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Return a condition code of the given SDNode
    X86::CondCode getCondFromNode(SDNode *N) const;

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      // Non-temporal instructions require natural alignment.
      if (N->getAlign().value() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false; // No scalar non-temporal load instruction.
      case 16:
        return Subtarget->hasSSE41();   // MOVNTDQA
      case 32:
        return Subtarget->hasAVX2();    // VMOVNTDQA ymm
      case 64:
        return Subtarget->hasAVX512();  // VMOVNTDQA zmm
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InGlue);

    bool tryOptimizeRem8Extend(SDNode *N);

    // Queries about how the EFLAGS result of a node is consumed.
    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
    bool checkTCRetEnoughRegs(SDNode *N) const;
  };
611
  /// Legacy PassManager wrapper that owns the actual X86DAGToDAGISel
  /// implementation.
  class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
  public:
    static char ID; // Pass identification.
    explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                   CodeGenOptLevel OptLevel)
        : SelectionDAGISelLegacy(
              ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {}
  };
620}
621
char X86DAGToDAGISelLegacy::ID = 0;

// Register the pass with the legacy pass manager.
INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
625
626// Returns true if this masked compare can be implemented legally with this
627// type.
628static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
629 unsigned Opcode = N->getOpcode();
630 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
631 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
632 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
633 // We can get 256-bit 8 element types here without VLX being enabled. When
634 // this happens we will use 512-bit operations and the mask will not be
635 // zero extended.
636 EVT OpVT = N->getOperand(Num: 0).getValueType();
637 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
638 // second operand.
639 if (Opcode == X86ISD::STRICT_CMPM)
640 OpVT = N->getOperand(Num: 1).getValueType();
641 if (OpVT.is256BitVector() || OpVT.is128BitVector())
642 return Subtarget->hasVLX();
643
644 return true;
645 }
646 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
647 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
648 Opcode == X86ISD::FSETCCM_SAE)
649 return true;
650
651 return false;
652}
653
654// Returns true if we can assume the writer of the mask has zero extended it
655// for us.
656bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
657 // If this is an AND, check if we have a compare on either side. As long as
658 // one side guarantees the mask is zero extended, the AND will preserve those
659 // zeros.
660 if (N->getOpcode() == ISD::AND)
661 return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) ||
662 isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget);
663
664 return isLegalMaskCompare(N, Subtarget);
665}
666
// Decide whether folding operand N (typically a load) into its user U is
// profitable while matching the pattern rooted at Root. Returning false
// keeps N as a separate instruction.
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  // Folding would duplicate work if N has other users.
  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::UADDO_CARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(Num: 1);

      // If the other operand is a 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(N: 8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(N: 32))
          return false;

        // If this really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB with can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(N: 8))
          return false;

        // Same trick for the flag-producing X86 nodes, but only when no user
        // of the flags depends on the carry (negation inverts the carry).
        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(N: 8) &&
            hasNoCarryFlagUses(Flags: SDValue(U, 1)))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(i: 0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(Num: 0).getOpcode() == ISD::SHL &&
            isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0)))
          return false;

        if (U->getOperand(Num: 1).getOpcode() == ISD::SHL &&
            isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        // Check both operands for the BTR pattern (rotl -2, n).
        SDValue U0 = U->getOperand(Num: 0);
        SDValue U1 = U->getOperand(Num: 1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(V: Root->getOperand(Num: 2)) &&
      (Root->getOperand(Num: 0).isUndef() ||
       ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode())))
    return false;

  return true;
}
816
817// Indicates it is profitable to form an AVX512 masked operation. Returning
818// false will favor a masked register-register masked move or vblendm and the
819// operation will be selected separately.
820bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
821 assert(
822 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
823 "Unexpected opcode!");
824
825 // If the operation has additional users, the operation will be duplicated.
826 // Check the use count to prevent that.
827 // FIXME: Are there cheap opcodes we might want to duplicate?
828 return N->getOperand(Num: 1).hasOneUse();
829}
830
831/// Replace the original chain operand of the call with
832/// load's chain operand and move load below the call's chain operand.
833static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
834 SDValue Call, SDValue OrigChain) {
835 SmallVector<SDValue, 8> Ops;
836 SDValue Chain = OrigChain.getOperand(i: 0);
837 if (Chain.getNode() == Load.getNode())
838 Ops.push_back(Elt: Load.getOperand(i: 0));
839 else {
840 assert(Chain.getOpcode() == ISD::TokenFactor &&
841 "Unexpected chain operand");
842 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
843 if (Chain.getOperand(i).getNode() == Load.getNode())
844 Ops.push_back(Elt: Load.getOperand(i: 0));
845 else
846 Ops.push_back(Elt: Chain.getOperand(i));
847 SDValue NewChain =
848 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops);
849 Ops.clear();
850 Ops.push_back(Elt: NewChain);
851 }
852 Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
853 CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
854 CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
855 Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));
856
857 Ops.clear();
858 Ops.push_back(Elt: SDValue(Load.getNode(), 1));
859 Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
860 CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
861}
862
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  // Only simple, unindexed, non-extending loads qualify.
  auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // If the load's outgoing chain has more than one use, we can't (currently)
  // move the load since we'd most likely create a loop. TODO: Maybe it could
  // work if moveBelowOrigChain() updated *all* the chain users.
  if (!Callee.getValue(R: 1).hasOneUse())
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(i: 0);
  }

  // Walk up the chain from the call toward the load, making sure nothing
  // between them could be clobbered by moving the load.
  while (true) {
    if (!Chain.getNumOperands())
      return false;

    // It's not safe to move the callee (a load) across e.g. a store.
    // Conservatively abort if the chain contains a node other than the ones
    // below.
    switch (Chain.getNode()->getOpcode()) {
    case ISD::CALLSEQ_START:
    case ISD::CopyToReg:
    case ISD::LOAD:
      break;
    default:
      return false;
    }

    // Reached the load itself, or a single-use TokenFactor that merges the
    // load's chain: the move is safe.
    if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
      return true;
    if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
        Chain.getOperand(i: 0).getValue(R: 0).hasOneUse() &&
        Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
        Callee.getValue(R: 1).hasOneUse())
      return true;

    // Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
    if (Chain.getOperand(i: 0).getOpcode() == ISD::CopyToReg &&
        Chain.getOperand(i: 0).getValue(R: 0).hasOneUse()) {
      Chain = Chain.getOperand(i: 0);
      continue;
    }

    return false;
  }
}
929
/// Return true if the 64-bit immediate would embed an ENDBR64 byte sequence
/// (F3 [optional legacy prefixes] 0F 1E FA) when emitted verbatim,
/// e.g. 0xF30F1EFA, 0xF3660F1EFA, 0xF3670F1EFA.
static bool isEndbrImm64(uint64_t Imm) {
  // The low three bytes must be the 0F 1E FA opcode tail.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // Legacy prefix bytes that may legally appear between the mandatory 0xF3
  // prefix and the 0F 1E FA opcode bytes.
  const uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                         0x65, 0x66, 0x67, 0xf0, 0xf2};

  // Scan the remaining bytes upward: accept 0xF3 as a match, skip optional
  // prefixes, and reject anything else.
  for (int Shift = 24; Shift < 64; Shift += 8) {
    uint8_t Byte = (Imm >> Shift) & 0xFF;
    if (Byte == 0xF3)
      return true;
    bool IsOptionalPrefix = false;
    for (uint8_t Prefix : OptionalPrefixBytes) {
      if (Byte == Prefix) {
        IsOptionalPrefix = true;
        break;
      }
    }
    if (!IsOptionalPrefix)
      return false;
  }

  return false;
}
950
951static bool needBWI(MVT VT) {
952 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
953}
954
/// Rewrite the DAG before instruction selection proper. Each pass over the
/// node list either canonicalizes a node into a form matched by a single set
/// of isel patterns, hardens CET-sensitive constants, emulates missing-ISA
/// broadcasts, moves a call's callee load so it can be folded, or performs
/// "really late" FP-stack legalization of FP_ROUND/FP_EXTEND.
void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // And we want that attackers won't find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so the immediate never appears verbatim
    // in the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(ResNo: 0);
      int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getFunction().getParent()->getModuleFlag(
                Key: "cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          // Materialize ~Imm as an opaque (non-foldable) constant and invert
          // it back with a NOT, so the ENDBR byte pattern is never emitted.
          SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true);
          Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT);
          // Back the iterator up over N so RAUW can't invalidate it.
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) {
      SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                                    N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }

    // Convert vector increment or decrement to sub/add with an all-ones
    // constant:
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
    //
    // But don't do this if it would inhibit a potentially profitable load
    // folding opportunity for the other operand. That only occurs with the
    // intersection of:
    // (1) The other operand (op0) is load foldable.
    // (2) The op is an add (otherwise, we are *creating* an add and can still
    //     load fold the other op).
    // (3) The target has AVX (otherwise, we have a destructive add and can't
    //     load fold the other op without killing the constant op).
    // (4) The constant 1 vector has multiple uses (so it is profitable to load
    //     into a register anyway).
    auto mayPreventLoadFold = [&]() {
      return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) &&
             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
             !N->getOperand(Num: 1).hasOneUse();
    };
    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
        N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) {
      APInt SplatVal;
      if (!ISD::isBuildVectorOfConstantSDNodes(
              N: peekThroughBitcasts(V: N->getOperand(Num: 0)).getNode()) &&
          X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) &&
          SplatVal.isOne()) {
        SDLoc DL(N);

        // Build the all-ones constant as v(N)i32 so it matches the pcmpeqd
        // idiom, then bitcast to the original vector type.
        MVT VT = N->getSimpleValueType(ResNo: 0);
        unsigned NumElts = VT.getSizeInBits() / 32;
        SDValue AllOnes =
            CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts));
        AllOnes = CurDAG->getBitcast(VT, V: AllOnes);

        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
        SDValue Res =
            CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes);
        --I;
        CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }
    }

    // Opcode-specific canonicalizations. Each case either rewrites N and
    // continues, or breaks out to the call/FP-stack handling below.
    switch (N->getOpcode()) {
    case X86ISD::VBROADCAST: {
      MVT VT = N->getSimpleValueType(ResNo: 0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        // Broadcast at half width, then insert the half-width result into
        // both halves of the full-width vector.
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        SDLoc dl(N);
        SDValue NarrowBCast =
            CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0));
        SDValue Res =
            CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
                            N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
                              N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));

        --I;
        CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case X86ISD::VBROADCAST_LOAD: {
      MVT VT = N->getSimpleValueType(ResNo: 0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        // Same trick as VBROADCAST above, but the narrow broadcast is a
        // memory node: keep its chain/MMO and replace both results.
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        auto *MemNode = cast<MemSDNode>(Val: N);
        SDLoc dl(N);
        SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other);
        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
            Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(),
            MMO: MemNode->getMemOperand());
        SDValue Res =
            CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
                            N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
                              N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));

        --I;
        SDValue To[] = {Res, NarrowBCast.getValue(R: 1)};
        CurDAG->ReplaceAllUsesWith(From: N, To);
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case ISD::LOAD: {
      // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
      // load, then just extract the lower subvector and avoid the second load.
      auto *Ld = cast<LoadSDNode>(Val: N);
      MVT VT = N->getSimpleValueType(ResNo: 0);
      if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() ||
          !(VT.is128BitVector() || VT.is256BitVector()))
        break;

      // Find the widest load from the same pointer/chain that covers this one.
      MVT MaxVT = VT;
      SDNode *MaxLd = nullptr;
      SDValue Ptr = Ld->getBasePtr();
      SDValue Chain = Ld->getChain();
      for (SDNode *User : Ptr->users()) {
        auto *UserLd = dyn_cast<LoadSDNode>(Val: User);
        MVT UserVT = User->getSimpleValueType(ResNo: 0);
        if (User != N && UserLd && ISD::isNormalLoad(N: User) &&
            UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
            !User->hasAnyUseOfValue(Value: 1) &&
            (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
            UserVT.getSizeInBits() > VT.getSizeInBits() &&
            (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
          MaxLd = User;
          MaxVT = UserVT;
        }
      }
      if (MaxLd) {
        SDLoc dl(N);
        // Extract the low subvector of the wide load (in the wide load's
        // element type) and bitcast back to this load's type.
        unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
        MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts);
        SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
                                          N1: SDValue(MaxLd, 0),
                                          N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
        SDValue Res = CurDAG->getBitcast(VT, V: Extract);

        --I;
        SDValue To[] = {Res, SDValue(MaxLd, 1)};
        CurDAG->ReplaceAllUsesWith(From: N, To);
        ++I;
        MadeChange = true;
        continue;
      }
      break;
    }
    case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
      EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType();
      if (EleVT == MVT::i1)
        break;

      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
             "We can't replace VSELECT with BLENDV in vXi16!");
      SDValue R;
      // If every condition lane is known all-ones or all-zeros, a ternlog
      // (imm 0xCA selects op1/op2 per condition bit) avoids BLENDV's fixed
      // register constraint.
      if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) ==
                                     EleVT.getSizeInBits()) {
        R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                            N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2),
                            N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8));
      } else {
        R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                            N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1),
                            N3: N->getOperand(Num: 2));
      }
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FP_ROUND:
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(ResNo: 0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
      }
      SDValue Res;
      // Strict ops carry a chain operand/result that must be preserved.
      if (N->isStrictFPOpcode())
        Res =
            CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
                            Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
      else
        Res =
            CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                            Operand: N->getOperand(Num: 0));
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(ResNo: 0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                                    N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(ResNo: 0).isVector())
        break;

      unsigned NewOpc;
      if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) {
        assert(N->getOpcode() == ISD::ANY_EXTEND &&
               "Unexpected opcode for mask vector!");
        NewOpc = ISD::SIGN_EXTEND;
      } else {
        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                     ? ISD::ZERO_EXTEND
                     : ISD::ZERO_EXTEND_VECTOR_INREG;
      }

      SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                                    Operand: N->getOperand(Num: 0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FCEIL:
    case ISD::STRICT_FCEIL:
    case ISD::FFLOOR:
    case ISD::STRICT_FFLOOR:
    case ISD::FTRUNC:
    case ISD::STRICT_FTRUNC:
    case ISD::FROUNDEVEN:
    case ISD::STRICT_FROUNDEVEN:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::FRINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns. The immediate is the VRNDSCALE control word
      // selecting the rounding mode (and exception suppression).
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::STRICT_FROUNDEVEN:
      case ISD::FROUNDEVEN: Imm = 0x8; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      bool IsStrict = N->isStrictFPOpcode();
      SDValue Res;
      if (IsStrict)
        Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl,
                              ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
                              Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
                               CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)});
      else
        Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0),
                              N1: N->getOperand(Num: 0),
                              N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32));
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(ResNo: 0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64   ? MVT::v2f64
                  : VT == MVT::f32 ? MVT::v4f32
                                   : MVT::v8f16;

      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
                                    Operand: N->getOperand(Num: 0));
      SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
                                    Operand: N->getOperand(Num: 1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        // With SSE2, do the logic in the integer domain via bitcasts.
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0);
        Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1);
        Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res);
      } else {
        Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1);
      }
      Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res,
                            N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }
    }

    if (OptLevel != CodeGenOptLevel::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(Num: 0);
      SDValue Load  = N->getOperand(Num: 1);
      if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq))
        continue;
      if (N->getOpcode() == X86ISD::TC_RETURN && !checkTCRetEnoughRegs(N))
        continue;
      moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain);
      ++NumLoadMoved;
      MadeChange = true;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(ResNo: 0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(Num: 1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
      int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(
          Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT);
      SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store,
                                          Ptr: MemTmp, PtrInfo: MPI, MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result);
      break;
    }

    //The sequence of events for lowering STRICT_FP versions of these nodes requires
    //dealing with the chain differently, as there is already a preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(ResNo: 0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(Num: 2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
      int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      //Since the operation is StrictFP, use the preexisting chain.
      SDValue Store, Result;
      if (!SrcIsSSE) {
        // FP-stack source: store via X86ISD::FST, propagating NoFPExcept.
        SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
        SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp};
        Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT,
                                            PtrInfo: MPI, /*Align*/ Alignment: std::nullopt,
                                            Flags: MachineMemOperand::MOStore);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Store->getFlags();
          Flags.setNoFPExcept(true);
          Store->setFlags(Flags);
        }
      } else {
        assert(SrcVT == MemVT && "Unexpected VT!");
        Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp,
                                 PtrInfo: MPI);
      }

      if (!DstIsSSE) {
        // FP-stack destination: reload via X86ISD::FLD, propagating NoFPExcept.
        SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other);
        SDValue Ops[] = {Store, MemTmp};
        Result = CurDAG->getMemIntrinsicNode(
            Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI,
            /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Result->getFlags();
          Flags.setNoFPExcept(true);
          Result->setFlags(Flags);
        }
      } else {
        assert(DstVT == MemVT && "Unexpected VT!");
        Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI);
      }

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode());
      break;
    }
    }


    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    MadeChange = true;
  }

  // Remove any dead nodes that may have been left behind.
  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}
1566
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
// If this extend's input is an EXTRACT_SUBREG of the low 8 bits of a
// matching *_NOREX extend, the outer extend is redundant and can reuse (or
// re-extend) the inner one. Returns true and rewires uses on success.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(Num: 0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(i: 1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  // Note: MOVSX64rr8 also matches against MOVSX32rr8_NOREX here.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(i: 0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N),
                                                   VT: MVT::i64, Op1: N00);
    ReplaceUses(F: N, T: Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(F: N, T: N00.getNode());
  }

  return true;
}
1602
1603void X86DAGToDAGISel::PostprocessISelDAG() {
1604 // Skip peepholes at -O0.
1605 if (TM.getOptLevel() == CodeGenOptLevel::None)
1606 return;
1607
1608 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1609
1610 bool MadeChange = false;
1611 while (Position != CurDAG->allnodes_begin()) {
1612 SDNode *N = &*--Position;
1613 // Skip dead nodes and any non-machine opcodes.
1614 if (N->use_empty() || !N->isMachineOpcode())
1615 continue;
1616
1617 if (tryOptimizeRem8Extend(N)) {
1618 MadeChange = true;
1619 continue;
1620 }
1621
1622 unsigned Opc = N->getMachineOpcode();
1623 switch (Opc) {
1624 default:
1625 continue;
1626 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
1627 case X86::TEST8rr:
1628 case X86::TEST16rr:
1629 case X86::TEST32rr:
1630 case X86::TEST64rr:
1631 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1632 case X86::CTEST8rr:
1633 case X86::CTEST16rr:
1634 case X86::CTEST32rr:
1635 case X86::CTEST64rr: {
1636 auto &Op0 = N->getOperand(Num: 0);
1637 if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) ||
1638 !Op0.isMachineOpcode())
1639 continue;
1640 SDValue And = N->getOperand(Num: 0);
1641#define CASE_ND(OP) \
1642 case X86::OP: \
1643 case X86::OP##_ND:
1644 switch (And.getMachineOpcode()) {
1645 default:
1646 continue;
1647 CASE_ND(AND8rr)
1648 CASE_ND(AND16rr)
1649 CASE_ND(AND32rr)
1650 CASE_ND(AND64rr) {
1651 if (And->hasAnyUseOfValue(Value: 1))
1652 continue;
1653 SmallVector<SDValue> Ops(N->op_values());
1654 Ops[0] = And.getOperand(i: 0);
1655 Ops[1] = And.getOperand(i: 1);
1656 MachineSDNode *Test =
1657 CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops);
1658 ReplaceUses(F: N, T: Test);
1659 MadeChange = true;
1660 continue;
1661 }
1662 CASE_ND(AND8rm)
1663 CASE_ND(AND16rm)
1664 CASE_ND(AND32rm)
1665 CASE_ND(AND64rm) {
1666 if (And->hasAnyUseOfValue(Value: 1))
1667 continue;
1668 unsigned NewOpc;
1669 bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc);
1670#define FROM_TO(A, B) \
1671 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1672 break;
1673 switch (And.getMachineOpcode()) {
1674 FROM_TO(AND8rm, TEST8mr);
1675 FROM_TO(AND16rm, TEST16mr);
1676 FROM_TO(AND32rm, TEST32mr);
1677 FROM_TO(AND64rm, TEST64mr);
1678 }
1679#undef FROM_TO
1680#undef CASE_ND
1681 // Need to swap the memory and register operand.
1682 SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2),
1683 And.getOperand(i: 3), And.getOperand(i: 4),
1684 And.getOperand(i: 5), And.getOperand(i: 0)};
1685 // CC, Cflags.
1686 if (IsCTESTCC) {
1687 Ops.push_back(Elt: N->getOperand(Num: 2));
1688 Ops.push_back(Elt: N->getOperand(Num: 3));
1689 }
1690 // Chain of memory load
1691 Ops.push_back(Elt: And.getOperand(i: 6));
1692 // Glue
1693 if (IsCTESTCC)
1694 Ops.push_back(Elt: N->getOperand(Num: 4));
1695
1696 MachineSDNode *Test = CurDAG->getMachineNode(
1697 Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops);
1698 CurDAG->setNodeMemRefs(
1699 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
1700 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1701 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1702 MadeChange = true;
1703 continue;
1704 }
1705 }
1706 }
1707 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1708 // used. We're doing this late so we can prefer to fold the AND into masked
1709 // comparisons. Doing that can be better for the live range of the mask
1710 // register.
1711 case X86::KORTESTBkk:
1712 case X86::KORTESTWkk:
1713 case X86::KORTESTDkk:
1714 case X86::KORTESTQkk: {
1715 SDValue Op0 = N->getOperand(Num: 0);
1716 if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) ||
1717 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0)))
1718 continue;
1719#define CASE(A) \
1720 case X86::A: \
1721 break;
1722 switch (Op0.getMachineOpcode()) {
1723 default:
1724 continue;
1725 CASE(KANDBkk)
1726 CASE(KANDWkk)
1727 CASE(KANDDkk)
1728 CASE(KANDQkk)
1729 }
1730 unsigned NewOpc;
1731#define FROM_TO(A, B) \
1732 case X86::A: \
1733 NewOpc = X86::B; \
1734 break;
1735 switch (Opc) {
1736 FROM_TO(KORTESTBkk, KTESTBkk)
1737 FROM_TO(KORTESTWkk, KTESTWkk)
1738 FROM_TO(KORTESTDkk, KTESTDkk)
1739 FROM_TO(KORTESTQkk, KTESTQkk)
1740 }
1741 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1742 // KAND instructions and KTEST use the same ISA feature.
1743 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1744 continue;
1745#undef FROM_TO
1746 MachineSDNode *KTest = CurDAG->getMachineNode(
1747 Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1));
1748 ReplaceUses(F: N, T: KTest);
1749 MadeChange = true;
1750 continue;
1751 }
1752 // Attempt to remove vectors moves that were inserted to zero upper bits.
1753 case TargetOpcode::SUBREG_TO_REG: {
1754 unsigned SubRegIdx = N->getConstantOperandVal(Num: 1);
1755 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1756 continue;
1757
1758 SDValue Move = N->getOperand(Num: 0);
1759 if (!Move.isMachineOpcode())
1760 continue;
1761
1762 // Make sure its one of the move opcodes we recognize.
1763 switch (Move.getMachineOpcode()) {
1764 default:
1765 continue;
1766 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1767 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1768 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1769 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1770 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1771 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1772 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1773 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1774 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1775 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1776 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1777 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1778 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1779 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1780 }
1781#undef CASE
1782
1783 SDValue In = Move.getOperand(i: 0);
1784 if (!In.isMachineOpcode() ||
1785 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1786 continue;
1787
1788 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1789 // the SHA instructions which use a legacy encoding.
1790 uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags;
1791 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1792 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1793 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1794 continue;
1795
1796 // Producing instruction is another vector instruction. We can drop the
1797 // move.
1798 CurDAG->UpdateNodeOperands(N, Op1: In, Op2: N->getOperand(Num: 1));
1799 MadeChange = true;
1800 }
1801 }
1802 }
1803
1804 if (MadeChange)
1805 CurDAG->RemoveDeadNodes();
1806}
1807
1808
1809/// Emit any code that needs to be executed only in the main function.
1810void X86DAGToDAGISel::emitSpecialCodeForMain() {
1811 if (Subtarget->isTargetCygMing()) {
1812 TargetLowering::ArgListTy Args;
1813 auto &DL = CurDAG->getDataLayout();
1814
1815 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1816 CLI.setChain(CurDAG->getRoot())
1817 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1818 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1819 ArgsList: std::move(Args));
1820 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1821 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1822 CurDAG->setRoot(Result.second);
1823 }
1824}
1825
1826void X86DAGToDAGISel::emitFunctionEntryCode() {
1827 // If this is main, emit special code for main.
1828 const Function &F = MF->getFunction();
1829 if (F.hasExternalLinkage() && F.getName() == "main")
1830 emitSpecialCodeForMain();
1831}
1832
/// Return true if \p Val is safe to use as an explicit displacement when the
/// address also has a frame index or register base.
///
/// A frame index or register base can itself contribute a displacement that
/// is added to the explicit one, and the sum must still fit the 32-bit
/// displacement field. Restricting the explicit displacement to a signed
/// 31-bit value (i.e. [-2^30, 2^30 - 1]) — only slightly stronger than the
/// fundamental 32-bit assumption — keeps the combined value safe.
static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
  constexpr int64_t DispLo = -(int64_t(1) << 30);
  constexpr int64_t DispHi = (int64_t(1) << 30) - 1;
  return DispLo <= Val && Val <= DispHi;
}
1842
/// Try to add the integer displacement \p Offset to the addressing mode
/// \p AM. Returns true if the fold is not possible; in that case AM.Disp is
/// left untouched (it is only written on the success path). Returns false
/// and commits the combined displacement on success.
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  // Tentative combined displacement; validated below before being committed.
  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
                                           hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndexOrRegBase(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only the low 2GB of the address space
    // is directly addressable; we need indirect addressing for the high 2GB
    // of address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
    if (Subtarget->isTarget64BitILP32() &&
        !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  } else if (Subtarget->is16Bit()) {
    // In 16-bit mode, displacements are limited to [-65535,65535] for FK_Data_2
    // fixups of unknown signedness. See X86AsmBackend::applyFixup.
    if (Val < -(int64_t)UINT16_MAX || Val > (int64_t)UINT16_MAX)
      return true;
  } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
    // For 32-bit X86, make sure the displacement still isn't close to the
    // expressible limit.
    return true;
  // All checks passed: commit the folded displacement.
  AM.Disp = Val;
  return false;
}
1898
/// Try to fold a load from a constant-zero address in the FS/GS address
/// spaces into \p AM's segment register (the gs:0 / fs:0 TLS base-pointer
/// pattern). Returns false on success, true if no fold was made. When
/// \p AllowSegmentRegForX32 is false the fold is rejected in 64-bit ILP32
/// (x32) mode; see the comment below for why.
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(Num: 1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added to the base address, which gives
  // unwanted results when the register holds a negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
       Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
      return false;
    // Address space X86AS::SS is not handled here, because it is not used to
    // address TLS areas.
    }
  }

  return true;
}
1932
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(i: 0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use such a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  // Extract the symbol and (where the node kind carries one) its offset.
  // External symbols and MCSymbols never contribute an extra offset here.
  int64_t Offset = 0;
  SDValue N0 = N.getOperand(i: 0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(GV: AM.GV)) {
    AM = Backup;
    return true;
  }

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
2010
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
/// Returns false on success; \p AM then describes the matched address.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, Depth: 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
      AM.Base_Reg = SDValue();
      // If the segment-register fold failed, restore the saved base register.
      if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    // However, when GV is a local function symbol and in the same section as
    // the current instruction, and AM.Disp is negative and near INT32_MIN,
    // referencing GV+Disp generates a relocation referencing the section symbol
    // with an even smaller offset, which might underflow. We should bail out if
    // the negative offset is too close to INT32_MIN. Actually, we are more
    // conservative here, using a smaller magic number also used by
    // isOffsetSuitableForCodeModel.
    if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024)
      return true;

    AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64);
  }

  return false;
}
2063
/// Try to fold an ISD::ADD node \p N into the addressing mode \p AM.
/// First attempts to match both operands (in both orders), then falls back
/// to using the two operands directly as base and index registers. Returns
/// false on success. \p N is refreshed from the handle before returning in
/// case it was CSE'd to a different node during matching.
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
    return false;
  // Partial match failed: roll back to the saved state before retrying.
  AM = Backup;

  // Try again after commutating the operands.
  if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                               Depth: Depth + 1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(i: 0);
    AM.IndexReg = N.getOperand(i: 1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}
2098
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  // Only reposition when N has no ID yet (-1) or currently sits after Pos in
  // the topological order; otherwise it is already in an acceptable position.
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
    DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N: N.getNode());
  }
}
2117
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse())
    return true;

  // The shift must be srl by (8 - C1) with the mask exactly 0xff << C1 and
  // C1 in [1,3], so the remaining shl by C1 can be absorbed into the scale.
  int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8);
  SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
  SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
  SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
  SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
  SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8);
  SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: Eight);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: Srl);
  insertDAGNode(DAG, Pos: N, N: And);
  insertDAGNode(DAG, Pos: N, N: Ext);
  insertDAGNode(DAG, Pos: N, N: ShlCount);
  insertDAGNode(DAG, Pos: N, N: Shl);
  DAG.ReplaceAllUsesWith(From: N, To: Shl);
  DAG.RemoveDeadNode(N: N.getNode());
  // The trailing shl is represented by the addressing-mode scale.
  AM.IndexReg = Ext;
  AM.Scale = (1 << ScaleLog);
  return false;
}
2165
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(i: 0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(x: Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  SDValue X = Shift.getOperand(i: 0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    // Re-introduce the any_extend we looked through, now applied to X.
    SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
  SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewShift);
  DAG.ReplaceAllUsesWith(From: N, To: NewShift);
  DAG.RemoveDeadNode(N: N.getNode());

  // The shl is absorbed into the addressing-mode scale.
  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
2233
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $11, %ecx
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $9, %ecx
//   andl $124, %rcx
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial. Returns false if the simplification is performed.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;
  // Leading zero count of the 64-bit mask.
  unsigned MaskLZ = 64 - (MaskIdx + MaskLen);

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(i: 0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
      APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
  if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  MVT XVT = X.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  // The final shl is represented by the addressing-mode scale.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2349
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
// \p N is the AND node being matched; its value type is used for the rebuilt
// expression.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  // The trailing shl is represented by the addressing-mode scale.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2413
// Attempt to peek further into a scaled index register, collecting additional
// extensions / offsets / etc. Returns \p N if we can't peek any further.
2416SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2417 X86ISelAddressMode &AM,
2418 unsigned Depth) {
2419 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2420 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2421 "Illegal index scale");
2422
2423 // Limit recursion.
2424 if (Depth >= SelectionDAG::MaxRecursionDepth)
2425 return N;
2426
2427 EVT VT = N.getValueType();
2428 unsigned Opc = N.getOpcode();
2429
2430 // index: add(x,c) -> index: x, disp + c
2431 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2432 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2433 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2434 if (!foldOffsetIntoAddress(Offset, AM))
2435 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2436 }
2437
2438 // index: add(x,x) -> index: x, scale * 2
2439 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2440 if (AM.Scale <= 4) {
2441 AM.Scale *= 2;
2442 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2443 }
2444 }
2445
2446 // index: shl(x,i) -> index: x, scale * (1 << i)
2447 if (Opc == X86ISD::VSHLI) {
2448 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2449 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2450 if ((AM.Scale * ScaleAmt) <= 8) {
2451 AM.Scale *= ScaleAmt;
2452 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2453 }
2454 }
2455
2456 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2457 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2458 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2459 SDValue Src = N.getOperand(i: 0);
2460 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2461 Src.hasOneUse()) {
2462 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2463 SDValue AddSrc = Src.getOperand(i: 0);
2464 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2465 int64_t Offset = AddVal->getSExtValue();
2466 if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) {
2467 SDLoc DL(N);
2468 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2469 SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT);
2470 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2471 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2472 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2473 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2474 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2475 CurDAG->RemoveDeadNode(N: N.getNode());
2476 return ExtSrc;
2477 }
2478 }
2479 }
2480 }
2481
2482 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2483 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2484 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2485 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2486 SDValue Src = N.getOperand(i: 0);
2487 unsigned SrcOpc = Src.getOpcode();
2488 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2489 CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) &&
2490 Src.hasOneUse()) {
2491 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2492 SDValue AddSrc = Src.getOperand(i: 0);
2493 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2494 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2495 SDLoc DL(N);
2496 SDValue Res;
2497 // If we're also scaling, see if we can use that as well.
2498 if (AddSrc.getOpcode() == ISD::SHL &&
2499 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2500 SDValue ShVal = AddSrc.getOperand(i: 0);
2501 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2502 APInt HiBits =
2503 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2504 uint64_t ScaleAmt = 1ULL << ShAmt;
2505 if ((AM.Scale * ScaleAmt) <= 8 &&
2506 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2507 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2508 AM.Scale *= ScaleAmt;
2509 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2510 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2511 N2: AddSrc.getOperand(i: 1));
2512 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2513 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2514 AddSrc = ExtShift;
2515 Res = ExtShVal;
2516 }
2517 }
2518 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2519 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2520 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2521 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2522 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2523 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2524 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2525 CurDAG->RemoveDeadNode(N: N.getNode());
2526 return Res ? Res : ExtSrc;
2527 }
2528 }
2529 }
2530 }
2531
2532 // TODO: Handle extensions, shifted masks etc.
2533 return N;
2534}
2535
/// Recursively decompose the expression rooted at N and fold as much of it as
/// possible into the addressing mode AM (base + scale*index + disp + segment).
/// Returns false if N was absorbed into AM, true if matching failed; on
/// failure callers are responsible for restoring AM from a backup.
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it.  Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements.  It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    // Only a constant displacement can still be merged into a RIP-relative AM.
    if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
      if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    // A frame-recovered label: usable only if no symbol/disp is present yet.
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    // A frame index can serve as the base if the base slot is still free and
    // the displacement stays encodable (64-bit only restriction).
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // Shift amounts 1/2/3 map onto SIB scale factors 2/4/8.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        SDValue ShVal = N.getOperand(i: 0);
        AM.Scale = 1 << Val;
        AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(i: 0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(i: 0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
        !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    [[fallthrough]];
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8], i.e. base = X and index = X with scale 2/4/8.
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(i: 0);
          SDValue Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
            Reg = MulVal.getOperand(i: 0);
            auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Offset: Disp, AM))
              Reg = N.getOperand(i: 0);
          } else {
            Reg = N.getOperand(i: 0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    int Cost = 0;
    SDValue RHS = N.getOperand(i: 1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(i: 0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::OR:
  case ISD::XOR:
    // See if we can treat the OR/XOR node as an ADD node.
    if (!CurDAG->isADDLike(Op: N))
      break;
    [[fallthrough]];
  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
      break;

    if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(i: 0);
      SDValue X = Shift.getOperand(i: 0);

      uint64_t Mask = N.getConstantOperandVal(i: 1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
      return false;

    break;
  }
  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    SDValue Src = N.getOperand(i: 0);

    // See if we can match a zext(addlike(x,c)).
    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
      if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
        if (Index != N) {
          AM.IndexReg = Index;
          return false;
        }

    // Peek through mask: zext(and(shl(x,c1),c2))
    APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
    if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
      if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
        Mask = MaskC->getAPIntValue();
        Src = Src.getOperand(i: 0);
      }

    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
      // Give up if the shift is not a valid scale factor [1,2,3].
      SDValue ShlSrc = Src.getOperand(i: 0);
      SDValue ShlAmt = Src.getOperand(i: 1);
      auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
      if (!ShAmtC)
        break;
      unsigned ShAmtV = ShAmtC->getZExtValue();
      if (ShAmtV > 3)
        break;

      // The narrow shift must only shift out zero bits (it must be 'nuw').
      // That makes it safe to widen to the destination type.
      APInt HighZeros =
          APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
      if (!Src->getFlags().hasNoUnsignedWrap() &&
          !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
        break;

      // zext (shl nuw i8 %x, C1) to i32
      // --> shl (zext i8 %x to i32), (zext C1)
      // zext (and (shl nuw i8 %x, C1), C2) to i32
      // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
      MVT SrcVT = ShlSrc.getSimpleValueType();
      MVT VT = N.getSimpleValueType();
      SDLoc DL(N);

      SDValue Res = ShlSrc;
      if (!Mask.isAllOnes()) {
        Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
        Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
      }
      SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
      SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
      CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
      CurDAG->RemoveDeadNode(N: N.getNode());

      // Convert the shift to scale factor.
      AM.Scale = 1 << ShAmtV;
      // If matchIndexRecursively is not called here,
      // Zext may be replaced by other nodes but later used to call a builder
      // method
      AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
      return false;
    }

    if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                     X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                   X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                  X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
        return false;
    }

    break;
  }
  }

  return matchAddressBase(N, AM);
}
2902
2903/// Helper for MatchAddress. Add the specified node to the
2904/// specified addressing mode without any further recursion.
2905bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2906 // Is the base register already occupied?
2907 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2908 // If so, check to see if the scale index register is set.
2909 if (!AM.IndexReg.getNode()) {
2910 AM.IndexReg = N;
2911 AM.Scale = 1;
2912 return false;
2913 }
2914
2915 // Otherwise, we cannot select it.
2916 return true;
2917 }
2918
2919 // Default, generate it as a register.
2920 AM.BaseType = X86ISelAddressMode::RegBase;
2921 AM.Base_Reg = N;
2922 return false;
2923}
2924
/// Recursive worker for matchVectorAddress: fold the scalar base-pointer
/// expression N of a gather/scatter into AM. Only constants, wrapped symbols
/// and ADDs are handled. Returns false on success, true on failure.
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
                                                    X86ISelAddressMode &AM,
                                                    unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchVectorAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  case ISD::ADD: {
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Try folding both operands; AM is restored from Backup whenever a
    // combination fails so each attempt starts from a clean slate.
    X86ISelAddressMode Backup = AM;
    if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Try again after commuting the operands.
    if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Reload N through the handle in case it was CSE'd during matching.
    N = Handle.getValue();
    break;
  }
  }

  return matchAddressBase(N, AM);
}
2975
/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled. Returns false on success, true on failure.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  return matchVectorAddressRecursively(N, AM, Depth: 0);
}
2982
/// Build the five address operands for a gather/scatter node. IndexOp and
/// ScaleOp come from the intrinsic; BasePtr is matched into base + disp.
/// Returns true on success, false if the base could not be matched.
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
                                       SDValue IndexOp, SDValue ScaleOp,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  X86ISelAddressMode AM;
  AM.Scale = ScaleOp->getAsZExtVal();

  // Attempt to match index patterns, as long as we're not relying on implicit
  // sign-extension, which is performed BEFORE scale.
  if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
    AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
  else
    AM.IndexReg = IndexOp;

  // Map the x86 segment address spaces onto explicit segment registers.
  unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
  if (AddrSpace == X86AS::GS)
    AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
  if (AddrSpace == X86AS::FS)
    AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
  if (AddrSpace == X86AS::SS)
    AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);

  SDLoc DL(BasePtr);
  MVT VT = BasePtr.getSimpleValueType();

  // Try to match into the base and displacement fields.
  // (matchVectorAddress returns true on FAILURE, hence the inverted check.)
  if (matchVectorAddress(N: BasePtr, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3016
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
///
/// HasNDDM gates the extra address restriction applied for NDD memory
/// operands; see selectNDDAddr.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index, SDValue &Disp,
                                 SDValue &Segment, bool HasNDDM) {
  X86ISelAddressMode AM;

  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    unsigned AddrSpace =
        cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
    // Map the x86 segment address spaces onto explicit segment registers.
    if (AddrSpace == X86AS::GS)
      AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
    if (AddrSpace == X86AS::FS)
      AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
    if (AddrSpace == X86AS::SS)
      AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  if (matchAddress(N, AM))
    return false;

  // NOTE(review): when HasNDDM is false this rejects every address that is
  // NOT RIP-relative, i.e. only RIP-relative addresses survive. If the intent
  // is instead to forbid RIP-relative folding without the NDDM feature, the
  // polarity of this test is inverted -- confirm against the NDD/APX spec.
  if (!HasNDDM && !AM.isRIPRelative())
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3062
/// Address selection for NDD (new data destination) memory operands: same as
/// selectAddr, but HasNDDM reflects whether the subtarget supports NDDM.
bool X86DAGToDAGISel::selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                    SDValue &Scale, SDValue &Index,
                                    SDValue &Disp, SDValue &Segment) {
  return selectAddr(Parent, N, Base, Scale, Index, Disp, Segment,
                    HasNDDM: Subtarget->hasNDDM());
}
3069
3070bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3071 // Cannot use 32 bit constants to reference objects in kernel/large code
3072 // model.
3073 if (TM.getCodeModel() == CodeModel::Kernel ||
3074 TM.getCodeModel() == CodeModel::Large)
3075 return false;
3076
3077 // In static codegen with small code model, we can get the address of a label
3078 // into a register with 'movl'
3079 if (N->getOpcode() != X86ISD::Wrapper)
3080 return false;
3081
3082 N = N.getOperand(i: 0);
3083
3084 // At least GNU as does not accept 'movl' for TPOFF relocations.
3085 // FIXME: We could use 'movl' when we know we are targeting MC.
3086 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3087 return false;
3088
3089 Imm = N;
3090 // Small/medium code model can reference non-TargetGlobalAddress objects with
3091 // 32 bit constants.
3092 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3093 return TM.getCodeModel() == CodeModel::Small ||
3094 TM.getCodeModel() == CodeModel::Medium;
3095 }
3096
3097 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
3098 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3099 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
3100
3101 return !TM.isLargeGlobalValue(GV);
3102}
3103
/// Variant of selectLEAAddr for 64-bit LEA: any narrower (8/16/32-bit) base
/// or index operand is widened to i64 by inserting it into an IMPLICIT_DEF
/// through the matching subregister index.
bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  // Pick the subregister index matching the matched operand width.
  EVT BaseType = Base.getValueType();
  unsigned SubReg;
  if (BaseType == MVT::i8)
    SubReg = X86::sub_8bit;
  else if (BaseType == MVT::i16)
    SubReg = X86::sub_16bit;
  else
    SubReg = X86::sub_32bit;

  auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
  if (RN && RN->getReg() == 0)
    // No base register: rewrite as the i64 "no register" operand.
    Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
  else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
            BaseType == MVT::i32) &&
           !isa<FrameIndexSDNode>(Val: Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
                                                     VT: MVT::i64), 0);
    Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base);
  }

  [[maybe_unused]] EVT IndexType = Index.getValueType();
  RN = dyn_cast<RegisterSDNode>(Val&: Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
  else {
    // A real index must have the same narrow width as the base so that the
    // SubReg index chosen above also applies here.
    assert((IndexType == BaseType) &&
           "Expect to be extending 8/16/32-bit registers for use in LEA");
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
                                                     VT: MVT::i64), 0);
    Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index);
  }

  return true;
}
3148
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
/// Uses a small "Complexity" score; only scores > 2 are worth an LEA.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert (T == AM.Segment);
  AM.Segment = Copy;

  // Score each populated address component; frame-index bases are always
  // worth an LEA.
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
      Complexity++;
  }

  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3238
3239/// This is only run on TargetGlobalTLSAddress nodes.
3240bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3241 SDValue &Scale, SDValue &Index,
3242 SDValue &Disp, SDValue &Segment) {
3243 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3244 N.getOpcode() == ISD::TargetExternalSymbol);
3245
3246 X86ISelAddressMode AM;
3247 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3248 AM.GV = GA->getGlobal();
3249 AM.Disp += GA->getOffset();
3250 AM.SymbolFlags = GA->getTargetFlags();
3251 } else {
3252 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3253 AM.ES = SA->getSymbol();
3254 AM.SymbolFlags = SA->getTargetFlags();
3255 }
3256
3257 if (Subtarget->is32Bit()) {
3258 AM.Scale = 1;
3259 AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32);
3260 }
3261
3262 MVT VT = N.getSimpleValueType();
3263 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3264 return true;
3265}
3266
3267bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3268 // Keep track of the original value type and whether this value was
3269 // truncated. If we see a truncation from pointer type to VT that truncates
3270 // bits that are known to be zero, we can use a narrow reference.
3271 EVT VT = N.getValueType();
3272 bool WasTruncated = false;
3273 if (N.getOpcode() == ISD::TRUNCATE) {
3274 WasTruncated = true;
3275 N = N.getOperand(i: 0);
3276 }
3277
3278 if (N.getOpcode() != X86ISD::Wrapper)
3279 return false;
3280
3281 // We can only use non-GlobalValues as immediates if they were not truncated,
3282 // as we do not have any range information. If we have a GlobalValue and the
3283 // address was not truncated, we can select it as an operand directly.
3284 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3285 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3286 Op = N.getOperand(i: 0);
3287 // We can only select the operand directly if we didn't have to look past a
3288 // truncate.
3289 return !WasTruncated;
3290 }
3291
3292 // Check that the global's range fits into VT.
3293 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3294 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3295 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3296 return false;
3297
3298 // Okay, we can use a narrow reference.
3299 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3300 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3301 return true;
3302}
3303
3304bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3305 SDValue &Base, SDValue &Scale,
3306 SDValue &Index, SDValue &Disp,
3307 SDValue &Segment) {
3308 assert(Root && P && "Unknown root/parent nodes");
3309 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3310 !IsProfitableToFold(N, U: P, Root) ||
3311 !IsLegalToFold(N, U: P, Root, OptLevel))
3312 return false;
3313
3314 return selectAddr(Parent: N.getNode(),
3315 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3316}
3317
3318bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3319 SDValue &Base, SDValue &Scale,
3320 SDValue &Index, SDValue &Disp,
3321 SDValue &Segment) {
3322 assert(Root && P && "Unknown root/parent nodes");
3323 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3324 !IsProfitableToFold(N, U: P, Root) ||
3325 !IsLegalToFold(N, U: P, Root, OptLevel))
3326 return false;
3327
3328 return selectAddr(Parent: N.getNode(),
3329 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3330}
3331
3332/// Return an SDNode that returns the value of the global base register.
3333/// Output instructions required to initialize the global base register,
3334/// if necessary.
3335SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3336 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3337 auto &DL = MF->getDataLayout();
3338 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3339}
3340
// Return true if the (possibly truncated) wrapped global address is known to
// fit in a sign-extended `Width`-bit immediate.
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  // Look through a truncate of the wrapped address, if present.
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(Num: 0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    // Absolute symbol: check its declared range against the signed
    // Width-bit interval. Note (-1ull << Width) is the interval's lower
    // bound when reinterpreted as a signed value.
    return CR->getSignedMin().sge(RHS: -1ull << Width) &&
           CR->getSignedMax().slt(RHS: 1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
         !TM.isLargeGlobalValue(GV);
}
3363
3364X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3365 assert(N->isMachineOpcode() && "Unexpected node");
3366 unsigned Opc = N->getMachineOpcode();
3367 const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc);
3368 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3369 if (CondNo < 0)
3370 return X86::COND_INVALID;
3371
3372 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3373}
3374
3375/// Test whether the given X86ISD::CMP node has any users that use a flag
3376/// other than ZF.
3377bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3378 // Examine each user of the node.
3379 for (SDUse &Use : Flags->uses()) {
3380 // Only check things that use the flags.
3381 if (Use.getResNo() != Flags.getResNo())
3382 continue;
3383 SDNode *User = Use.getUser();
3384 // Only examine CopyToReg uses that copy to EFLAGS.
3385 if (User->getOpcode() != ISD::CopyToReg ||
3386 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3387 return false;
3388 // Examine each user of the CopyToReg use.
3389 for (SDUse &FlagUse : User->uses()) {
3390 // Only examine the Flag result.
3391 if (FlagUse.getResNo() != 1)
3392 continue;
3393 // Anything unusual: assume conservatively.
3394 if (!FlagUse.getUser()->isMachineOpcode())
3395 return false;
3396 // Examine the condition code of the user.
3397 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3398
3399 switch (CC) {
3400 // Comparisons which only use the zero flag.
3401 case X86::COND_E: case X86::COND_NE:
3402 continue;
3403 // Anything else: assume conservatively.
3404 default:
3405 return false;
3406 }
3407 }
3408 }
3409 return true;
3410}
3411
3412/// Test whether the given X86ISD::CMP node has any uses which require the SF
3413/// flag to be accurate.
3414bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3415 // Examine each user of the node.
3416 for (SDUse &Use : Flags->uses()) {
3417 // Only check things that use the flags.
3418 if (Use.getResNo() != Flags.getResNo())
3419 continue;
3420 SDNode *User = Use.getUser();
3421 // Only examine CopyToReg uses that copy to EFLAGS.
3422 if (User->getOpcode() != ISD::CopyToReg ||
3423 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3424 return false;
3425 // Examine each user of the CopyToReg use.
3426 for (SDUse &FlagUse : User->uses()) {
3427 // Only examine the Flag result.
3428 if (FlagUse.getResNo() != 1)
3429 continue;
3430 // Anything unusual: assume conservatively.
3431 if (!FlagUse.getUser()->isMachineOpcode())
3432 return false;
3433 // Examine the condition code of the user.
3434 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3435
3436 switch (CC) {
3437 // Comparisons which don't examine the SF flag.
3438 case X86::COND_A: case X86::COND_AE:
3439 case X86::COND_B: case X86::COND_BE:
3440 case X86::COND_E: case X86::COND_NE:
3441 case X86::COND_O: case X86::COND_NO:
3442 case X86::COND_P: case X86::COND_NP:
3443 continue;
3444 // Anything else: assume conservatively.
3445 default:
3446 return false;
3447 }
3448 }
3449 }
3450 return true;
3451}
3452
3453static bool mayUseCarryFlag(X86::CondCode CC) {
3454 switch (CC) {
3455 // Comparisons which don't examine the CF flag.
3456 case X86::COND_O: case X86::COND_NO:
3457 case X86::COND_E: case X86::COND_NE:
3458 case X86::COND_S: case X86::COND_NS:
3459 case X86::COND_P: case X86::COND_NP:
3460 case X86::COND_L: case X86::COND_GE:
3461 case X86::COND_G: case X86::COND_LE:
3462 return false;
3463 // Anything else: assume conservatively.
3464 default:
3465 return true;
3466 }
3467}
3468
3469/// Test whether the given node which sets flags has any uses which require the
3470/// CF flag to be accurate.
3471 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3472 // Examine each user of the node.
3473 for (SDUse &Use : Flags->uses()) {
3474 // Only check things that use the flags.
3475 if (Use.getResNo() != Flags.getResNo())
3476 continue;
3477
3478 SDNode *User = Use.getUser();
3479 unsigned UserOpc = User->getOpcode();
3480
3481 if (UserOpc == ISD::CopyToReg) {
3482 // Only examine CopyToReg uses that copy to EFLAGS.
3483 if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3484 return false;
3485 // Examine each user of the CopyToReg use.
3486 for (SDUse &FlagUse : User->uses()) {
3487 // Only examine the Flag result.
3488 if (FlagUse.getResNo() != 1)
3489 continue;
3490 // Anything unusual: assume conservatively.
3491 if (!FlagUse.getUser()->isMachineOpcode())
3492 return false;
3493 // Examine the condition code of the user.
3494 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3495
3496 if (mayUseCarryFlag(CC))
3497 return false;
3498 }
3499
3500 // This CopyToReg is ok. Move on to the next user.
3501 continue;
3502 }
3503
3504 // This might be an unselected node. So look for the pre-isel opcodes that
3505 // use flags.
3506 unsigned CCOpNo;
3507 switch (UserOpc) {
3508 default:
3509 // Something unusual. Be conservative.
3510 return false;
3511 case X86ISD::SETCC: CCOpNo = 0; break;
3512 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3513 case X86ISD::CMOV: CCOpNo = 2; break;
3514 case X86ISD::BRCOND: CCOpNo = 2; break;
3515 }
3516
3517 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo);
3518 if (mayUseCarryFlag(CC))
3519 return false;
3520 }
3521 return true;
3522}
3523
// Return true when the tail-call return node N leaves enough volatile GPRs
// free to materialize the callee address (load base + index) alongside the
// argument registers it already carries.
bool X86DAGToDAGISel::checkTCRetEnoughRegs(SDNode *N) const {
  // Check that there is enough volatile registers to load the callee address.

  const X86RegisterInfo *RI = Subtarget->getRegisterInfo();
  unsigned AvailGPRs;
  // The register classes below must stay in sync with what's used for
  // TCRETURNri, TCRETURN_HIPE32ri, TCRETURN_WIN64ri, etc).
  if (Subtarget->is64Bit()) {
    const TargetRegisterClass *TCGPRs =
        Subtarget->isCallingConvWin64(CC: MF->getFunction().getCallingConv())
            ? &X86::GR64_TCW64RegClass
            : &X86::GR64_TCRegClass;
    // Can't use RSP or RIP for the load in general.
    assert(TCGPRs->contains(X86::RSP));
    assert(TCGPRs->contains(X86::RIP));
    AvailGPRs = TCGPRs->getNumRegs() - 2;
  } else {
    const TargetRegisterClass *TCGPRs =
        MF->getFunction().getCallingConv() == CallingConv::HiPE
            ? &X86::GR32RegClass
            : &X86::GR32_TCRegClass;
    // Can't use ESP for the address in general.
    assert(TCGPRs->contains(X86::ESP));
    AvailGPRs = TCGPRs->getNumRegs() - 1;
  }

  // The load's base and index need up to two registers.
  unsigned LoadGPRs = 2;

  assert(N->getOpcode() == X86ISD::TC_RETURN);
  // X86tcret args: (*chain, ptr, imm, regs..., glue)

  if (Subtarget->is32Bit()) {
    // FIXME: This was carried from X86tcret_1reg which was used for 32-bit,
    // but it could apply to 64-bit too.
    const SDValue &BasePtr = cast<LoadSDNode>(Val: N->getOperand(Num: 1))->getBasePtr();
    if (isa<FrameIndexSDNode>(Val: BasePtr)) {
      LoadGPRs -= 2; // Base is fixed index off ESP; no regs needed.
    } else if (BasePtr.getOpcode() == X86ISD::Wrapper &&
               isa<GlobalAddressSDNode>(Val: BasePtr->getOperand(Num: 0))) {
      assert(!getTargetMachine().isPositionIndependent());
      LoadGPRs -= 1; // Base is a global (immediate since this is non-PIC), no
                     // reg needed.
    }
  }

  // Count the GPR argument operands; bail out as soon as the total demand
  // (argument regs plus load regs) exceeds what is available.
  unsigned ArgGPRs = 0;
  for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
    if (const auto *RN = dyn_cast<RegisterSDNode>(Val: N->getOperand(Num: I))) {
      if (!RI->isGeneralPurposeRegister(*MF, RN->getReg()))
        continue;
      if (++ArgGPRs + LoadGPRs > AvailGPRs)
        return false;
    }
  }

  return true;
}
3582
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
///
/// On success, \p LoadNode receives the matched load and \p InputChain the
/// merged TokenFactor chain that the fused node should use. \p LoadOpNo says
/// which operand of \p StoredVal is expected to be the load.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(N: Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Val&: Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  // Cap on predecessor-search steps to bound compile time.
  const unsigned int Max = 1024;

  // Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //    *-lines = Chain operand dependencies.
  //    |-lines = Normal operand dependencies.
  //    Dependencies flow down and right. n-suffix references multiple nodes.
  //
  //        C                        Xn  C
  //        *                         *  *
  //        *                          * *
  //  Xn  A-LD    Yn                    TF         Yn
  //   *    * \      |                   *         |
  //    *   *  \     |                   *         |
  //     *  *   \    |     =>       A--LD_OP_ST
  //      * *    \   |                      \
  //       TF    OP                          \
  //        *   | \                           Zn
  //         *  |  \
  //        A-ST    Zn
  //

  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //                                      #2: Yn -> LD
  //                                      #3: ST -> Zn

  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.

  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.

  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).

  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(R: 1)) {
    FoundLoad = true;
    ChainOps.push_back(Elt: Load.getOperand(i: 0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(R: 1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Elt: Load.getOperand(i: 0));
        continue;
      }
      LoopWorklist.push_back(Elt: Op.getNode());
      ChainOps.push_back(Elt: Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Elt: Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
                                   TopologicalPrune: true))
    return false;

  // Safe: merge the gathered chains into the single input chain for the
  // fused node.
  InputChain =
      CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps);
  return true;
}
3696
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen pattern memory operand pattern is currently not able to match
// the case where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//  [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
// but maybe need something like this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//  [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
//   (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// here.
// Returns true if Node (a store) was replaced by a fused memory operation.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  auto *StoreNode = cast<StoreSDNode>(Val: Node);
  SDValue StoredVal = StoreNode->getOperand(Num: 1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    // (sub 0, x) is a negate; the load is then operand 1, not 0.
    IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  // Match the common load/store address as the five x86 addressing
  // components used by the fused instruction.
  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  // Pick the opcode variant matching the memory operand width.
  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!");
    }
  };

  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
                                      VT2: MVT::Other, Ops);
      break;
    }
   [[fallthrough]];
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
      bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
      bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        unsigned NewOpc =
          ((Opc == X86ISD::ADD) == IsOne)
              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
                                        VT2: MVT::Other, Ops);
        break;
      }
    }
    [[fallthrough]];
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    // Map the DAG opcode to the reg-to-mem machine opcode family.
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    // Map the DAG opcode to the imm-to-mem machine opcode family.
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };

    unsigned NewOpc = SelectRegOpcode(Opc);
    // The non-load operand of the binary op.
    SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(x: OperandV) &&
            isInt<32>(x: -OperandV))) &&
          hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) {
        Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      // ADC/SBB also consume EFLAGS: thread the incoming carry through a
      // CopyToReg glued to the fused node.
      SDValue CopyTo =
          CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS,
                               N: StoredVal.getOperand(i: 2), Glue: SDValue());

      const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base,    Scale,   Index,      Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }

  // Transfer memory operands so alias analysis still sees both accesses.
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
  CurDAG->RemoveDeadNode(N: Node);
  return true;
}
3932
3933// See if this is an X & Mask that we can match to BEXTR/BZHI.
3934// Where Mask is one of the following patterns:
3935// a) x & (1 << nbits) - 1
3936// b) x & ~(-1 << nbits)
3937// c) x & (-1 >> (32 - y))
3938// d) x << (32 - y) >> (32 - y)
3939// e) (1 << nbits) - 1
3940bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3941 assert(
3942 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3943 Node->getOpcode() == ISD::SRL) &&
3944 "Should be either an and-mask, or right-shift after clearing high bits.");
3945
3946 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3947 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3948 return false;
3949
3950 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3951
3952 // Only supported for 32 and 64 bits.
3953 if (NVT != MVT::i32 && NVT != MVT::i64)
3954 return false;
3955
3956 SDValue NBits;
3957 bool NegateNBits;
3958
3959 // If we have BMI2's BZHI, we are ok with muti-use patterns.
3960 // Else, if we only have BMI1's BEXTR, we require one-use.
3961 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3962 auto checkUses = [AllowExtraUsesByDefault](
3963 SDValue Op, unsigned NUses,
3964 std::optional<bool> AllowExtraUses) {
3965 return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) ||
3966 Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo());
3967 };
3968 auto checkOneUse = [checkUses](SDValue Op,
3969 std::optional<bool> AllowExtraUses =
3970 std::nullopt) {
3971 return checkUses(Op, 1, AllowExtraUses);
3972 };
3973 auto checkTwoUse = [checkUses](SDValue Op,
3974 std::optional<bool> AllowExtraUses =
3975 std::nullopt) {
3976 return checkUses(Op, 2, AllowExtraUses);
3977 };
3978
3979 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3980 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3981 assert(V.getSimpleValueType() == MVT::i32 &&
3982 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3983 "Expected i64 -> i32 truncation");
3984 V = V.getOperand(i: 0);
3985 }
3986 return V;
3987 };
3988
3989 // a) x & ((1 << nbits) + (-1))
3990 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3991 &NegateNBits](SDValue Mask) -> bool {
3992 // Match `add`. Must only have one use!
3993 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3994 return false;
3995 // We should be adding all-ones constant (i.e. subtracting one.)
3996 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3997 return false;
3998 // Match `1 << nbits`. Might be truncated. Must only have one use!
3999 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
4000 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4001 return false;
4002 if (!isOneConstant(V: M0->getOperand(Num: 0)))
4003 return false;
4004 NBits = M0->getOperand(Num: 1);
4005 NegateNBits = false;
4006 return true;
4007 };
4008
4009 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
4010 V = peekThroughOneUseTruncation(V);
4011 return CurDAG->MaskedValueIsAllOnes(
4012 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
4013 loBitsSet: NVT.getSizeInBits()));
4014 };
4015
4016 // b) x & ~(-1 << nbits)
4017 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
4018 &NBits, &NegateNBits](SDValue Mask) -> bool {
4019 // Match `~()`. Must only have one use!
4020 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
4021 return false;
4022 // The -1 only has to be all-ones for the final Node's NVT.
4023 if (!isAllOnes(Mask->getOperand(Num: 1)))
4024 return false;
4025 // Match `-1 << nbits`. Might be truncated. Must only have one use!
4026 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
4027 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4028 return false;
4029 // The -1 only has to be all-ones for the final Node's NVT.
4030 if (!isAllOnes(M0->getOperand(Num: 0)))
4031 return false;
4032 NBits = M0->getOperand(Num: 1);
4033 NegateNBits = false;
4034 return true;
4035 };
4036
4037 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
4038 // or leave the shift amount as-is, but then we'll have to negate it.
4039 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
4040 unsigned Bitwidth) {
4041 NBits = ShiftAmt;
4042 NegateNBits = true;
4043 // Skip over a truncate of the shift amount, if any.
4044 if (NBits.getOpcode() == ISD::TRUNCATE)
4045 NBits = NBits.getOperand(i: 0);
4046 // Try to match the shift amount as (bitwidth - y). It should go away, too.
4047 // If it doesn't match, that's fine, we'll just negate it ourselves.
4048 if (NBits.getOpcode() != ISD::SUB)
4049 return;
4050 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
4051 if (!V0 || V0->getZExtValue() != Bitwidth)
4052 return;
4053 NBits = NBits.getOperand(i: 1);
4054 NegateNBits = false;
4055 };
4056
4057 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
4058 // or
4059 // c) x & (-1 >> (32 - y))
4060 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
4061 canonicalizeShiftAmt](SDValue Mask) -> bool {
4062 // The mask itself may be truncated.
4063 Mask = peekThroughOneUseTruncation(Mask);
4064 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
4065 // Match `l>>`. Must only have one use!
4066 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
4067 return false;
4068 // We should be shifting truly all-ones constant.
4069 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
4070 return false;
4071 SDValue M1 = Mask.getOperand(i: 1);
4072 // The shift amount should not be used externally.
4073 if (!checkOneUse(M1))
4074 return false;
4075 canonicalizeShiftAmt(M1, Bitwidth);
4076 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
4077 // is no extra use of the mask. Clearly, there was one since we are here.
4078 // But at the same time, if we need to negate the shift amount,
4079 // then we don't want the mask to stick around, else it's unprofitable.
4080 return !NegateNBits;
4081 };
4082
4083 SDValue X;
4084
4085 // d) x << z >> z but then we'll have to subtract z from bitwidth
4086 // or
4087 // d) x << (32 - y) >> (32 - y)
4088 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
4089 AllowExtraUsesByDefault, &NegateNBits,
4090 &X](SDNode *Node) -> bool {
4091 if (Node->getOpcode() != ISD::SRL)
4092 return false;
4093 SDValue N0 = Node->getOperand(Num: 0);
4094 if (N0->getOpcode() != ISD::SHL)
4095 return false;
4096 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
4097 SDValue N1 = Node->getOperand(Num: 1);
4098 SDValue N01 = N0->getOperand(Num: 1);
4099 // Both of the shifts must be by the exact same value.
4100 if (N1 != N01)
4101 return false;
4102 canonicalizeShiftAmt(N1, Bitwidth);
4103 // There should not be any external uses of the inner shift / shift amount.
4104 // Note that while we are generally okay with external uses given BMI2,
4105 // iff we need to negate the shift amount, we are not okay with extra uses.
4106 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4107 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4108 return false;
4109 X = N0->getOperand(Num: 0);
4110 return true;
4111 };
4112
4113 auto matchLowBitMask = [matchPatternA, matchPatternB,
4114 matchPatternC](SDValue Mask) -> bool {
4115 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4116 };
4117
4118 if (Node->getOpcode() == ISD::AND) {
4119 X = Node->getOperand(Num: 0);
4120 SDValue Mask = Node->getOperand(Num: 1);
4121
4122 if (matchLowBitMask(Mask)) {
4123 // Great.
4124 } else {
4125 std::swap(a&: X, b&: Mask);
4126 if (!matchLowBitMask(Mask))
4127 return false;
4128 }
4129 } else if (matchLowBitMask(SDValue(Node, 0))) {
4130 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
4131 } else if (!matchPatternD(Node))
4132 return false;
4133
4134 // If we need to negate the shift amount, require BMI2 BZHI support.
4135 // It's just too unprofitable for BMI1 BEXTR.
4136 if (NegateNBits && !Subtarget->hasBMI2())
4137 return false;
4138
4139 SDLoc DL(Node);
4140
4141 if (NBits.getSimpleValueType() != MVT::i8) {
4142 // Truncate the shift amount.
4143 NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits);
4144 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4145 }
4146
4147 // Turn (i32)(x & imm8) into (i32)x & imm32.
4148 ConstantSDNode *Imm = nullptr;
4149 if (NBits->getOpcode() == ISD::AND)
4150 if ((Imm = dyn_cast<ConstantSDNode>(Val: NBits->getOperand(Num: 1))))
4151 NBits = NBits->getOperand(Num: 0);
4152
4153 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4154 // All the other bits are undefined, we do not care about them.
4155 SDValue ImplDef = SDValue(
4156 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0);
4157 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
4158
4159 SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32);
4160 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
4161 NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL,
4162 VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal),
4163 0);
4164 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4165
4166 if (Imm) {
4167 NBits =
4168 CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: NBits,
4169 N2: CurDAG->getConstant(Val: Imm->getZExtValue(), DL, VT: MVT::i32));
4170 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4171 }
4172
4173 // We might have matched the amount of high bits to be cleared,
4174 // but we want the amount of low bits to be kept, so negate it then.
4175 if (NegateNBits) {
4176 SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32);
4177 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
4178
4179 NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits);
4180 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4181 }
4182
4183 if (Subtarget->hasBMI2()) {
4184 // Great, just emit the BZHI..
4185 if (NVT != MVT::i32) {
4186 // But have to place the bit count into the wide-enough register first.
4187 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
4188 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4189 }
4190
4191 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4192 ReplaceNode(F: Node, T: Extract.getNode());
4193 SelectCode(N: Extract.getNode());
4194 return true;
4195 }
4196
  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with one-use trunc in between),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
4201 {
4202 SDValue RealX = peekThroughOneUseTruncation(X);
4203 // FIXME: only if the shift is one-use?
4204 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4205 X = RealX;
4206 }
4207
4208 MVT XVT = X.getSimpleValueType();
4209
4210 // Else, emitting BEXTR requires one more step.
4211 // The 'control' of BEXTR has the pattern of:
4212 // [15...8 bit][ 7...0 bit] location
4213 // [ bit count][ shift] name
4214 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4215
4216 // Shift NBits left by 8 bits, thus producing 'control'.
4217 // This makes the low 8 bits to be zero.
4218 SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8);
4219 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
4220 SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8);
4221 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4222
4223 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4224 // FIXME: only if the shift is one-use?
4225 if (X.getOpcode() == ISD::SRL) {
4226 SDValue ShiftAmt = X.getOperand(i: 1);
4227 X = X.getOperand(i: 0);
4228
4229 assert(ShiftAmt.getValueType() == MVT::i8 &&
4230 "Expected shift amount to be i8");
4231
4232 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4233 // We could zext to i16 in some form, but we intentionally don't do that.
4234 SDValue OrigShiftAmt = ShiftAmt;
4235 ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt);
4236 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4237
4238 // And now 'or' these low 8 bits of shift amount into the 'control'.
4239 Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt);
4240 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4241 }
4242
4243 // But have to place the 'control' into the wide-enough register first.
4244 if (XVT != MVT::i32) {
4245 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4246 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4247 }
4248
4249 // And finally, form the BEXTR itself.
4250 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4251
4252 // The 'X' was originally truncated. Do that now.
4253 if (XVT != NVT) {
4254 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4255 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4256 }
4257
4258 ReplaceNode(F: Node, T: Extract.getNode());
4259 SelectCode(N: Extract.getNode());
4260
4261 return true;
4262}
4263
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
//
// On success, returns the new machine node implementing the bit extract (the
// caller is responsible for wiring up its results); returns nullptr if the
// pattern does not match or is judged unprofitable for this subtarget.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(ResNo: 0);
  SDLoc dl(Node);

  // Node is expected to be: and (srl/sra X, ShiftCst), MaskCst.
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask (a contiguous run of low set bits).
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Value: Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = llvm::popcount(Value: Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

// With APX extended GPRs available, use the EVEX-encoded variant of the opcode.
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    // BZHI takes its control in a register; materialize it with a MOV.
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
    if (Subtarget->hasTBM()) {
      // TBM's BEXTRI takes the control as an immediate directly.
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(Num: 0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  // If the shifted value is itself a foldable load, use the memory form.
  if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
    SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
    NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
  }

  return NewNode;
}
4389
// Emit a PCMPISTR(I/M) instruction.
//
// \p ROpc / \p MOpc are the register and memory forms of the opcode. If
// \p MayFoldLoad is set and operand 1 is a foldable load, the memory form is
// emitted and the load's chain and mem-refs are transferred to the new node.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);
  SDValue Imm = Node->getOperand(Num: 2);
  // Rematerialize the immediate as a target constant so it is encoded into the
  // instruction rather than selected separately.
  auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(i: 0) };
    // Result list: (VT, i32, chain).
    SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    return CNode;
  }

  // Register form: (VT, i32) results, no chain.
  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
  return CNode;
}
4419
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
//
// \p InGlue is consumed as the last operand and updated to this node's glue
// result so the caller can emit a follow-up glued instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InGlue) {
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N2 = Node->getOperand(Num: 2);
  SDValue Imm = Node->getOperand(Num: 4);
  // Rematerialize the immediate as a target constant so it is encoded into the
  // instruction rather than selected separately.
  auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(i: 0), InGlue };
    // Result list: (VT, i32, chain, glue).
    SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    InGlue = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
    return CNode;
  }

  // Register form result list: (VT, i32, glue).
  SDValue Ops[] = { N0, N2, Imm, InGlue };
  SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
  InGlue = SDValue(CNode, 2);
  return CNode;
}
4454
// Try to simplify the amount operand of a scalar shift. x86 shifts only use
// the low bits of the amount (an explicit AND with Size-1 is inserted below
// and later removed by isel patterns), so arithmetic on the amount that is a
// no-op modulo the shift size — adding/subtracting/xoring a multiple of Size,
// or (Size*N-1) patterns that reduce to a NOT/NEG — can be simplified before
// selection. Returns true if the node was replaced/re-selected.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(Num: 1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(Num: 0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
      ShiftAmt->getOpcode() == ISD::XOR) {
    SDValue Add0 = ShiftAmt->getOperand(Num: 0);
    SDValue Add1 = ShiftAmt->getOperand(Num: 1);
    auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
    auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
    // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB/XOR.
    if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
      NewShiftAmt = Add0;

    } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
               ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
                (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
      // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
      // we can replace it with a NOT. In the XOR case it may save some code
      // size, in the SUB case it also may save a move.
      assert(Add0C == nullptr || Add1C == nullptr);

      // We can only do N-X, not X-N
      if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
        return false;

      EVT OpVT = ShiftAmt.getValueType();

      // NOT is expressed as XOR with all-ones.
      SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
      NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
                                    N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
      // If we are shifting by N-X where N == 0 mod Size, then just shift by
      // -X to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
               Add0C->getZExtValue() != 0) {
      EVT SubVT = ShiftAmt.getValueType();
      SDValue X;
      if (Add0C->getZExtValue() % Size == 0)
        X = Add1;
      else if (ShiftAmt.hasOneUse() && Size == 64 &&
               Add0C->getZExtValue() % 32 == 0) {
        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
        // This is mainly beneficial if we already compute (x+n*32).
        if (Add1.getOpcode() == ISD::TRUNCATE) {
          Add1 = Add1.getOperand(i: 0);
          SubVT = Add1.getValueType();
        }
        if (Add0.getValueType() != SubVT) {
          Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
          insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
        }

        X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
        insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
      } else
        return false;
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
      SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt,
                                N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
                                                   Op2: NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(F: N, T: UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
4581
// For operations of the form (x << C1) op C2 (op = and/or/xor), check if we
// can use a smaller encoding for C2 by transforming the expression into
// (x op (C2 >> C1)) << C1. Returns true if the node was rewritten and
// re-selected.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(x: Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
        (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Doing this check late in order to delay the (potentially expensive)
  // MaskedValueIsZero call as long as possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
                                            loBitsSet: ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    // If the upper bits are already known zero, the AND is a MOVZX; keep it.
    if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(i: 0);
  if (FoundAnyExtend) {
    // Re-apply the any_extend we looked through above.
    SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
    insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
    X = NewX;
  }

  // Build (x op ShiftedVal) << ShAmt and re-select.
  SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
  SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
                                   N2: Shift.getOperand(i: 1));
  ReplaceNode(F: N, T: NewSHL.getNode());
  SelectCode(N: NewSHL.getNode());
  return true;
}
4694
// Emit a single VPTERNLOG for the logic network rooted at Root, computing the
// truth table Imm over operands A, B and C. ParentA/B/C are the nodes through
// which each operand is reached (needed as fold roots for load folding). One
// operand may be folded as a regular load or a 32/64-bit broadcast load; since
// only the last (C) operand position can take memory, if A or B is folded it
// is swapped into the C slot and the immediate's truth-table bits are permuted
// to match. Always returns true; Root is replaced by the new machine node.
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm) {
  assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
         C.isOperandOf(ParentC) && "Incorrect parent node");

  // Match either a plain foldable load or a VBROADCAST_LOAD (possibly behind
  // a one-use bitcast) of a 32/64-bit element.
  auto tryFoldLoadOrBCast =
      [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
             SDValue &Index, SDValue &Disp, SDValue &Segment) {
        if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
          return true;

        // Not a load, check for broadcast which may be behind a bitcast.
        if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
          P = L.getNode();
          L = L.getOperand(i: 0);
        }

        if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
          return false;

        // Only 32 and 64 bit broadcasts are supported.
        auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
        unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
        if (Size != 32 && Size != 64)
          return false;

        return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
      };

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    FoldedLoad = true;
  } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: A, b&: C);
    // The imm8 is indexed by (A<<2 | B<<1 | C); exchanging A and C swaps the
    // table entries where A and C differ.
    // Swap bits 1/4 and 3/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0xa5;
    if (OldImm & 0x02) Imm |= 0x10;
    if (OldImm & 0x10) Imm |= 0x02;
    if (OldImm & 0x08) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x08;
  } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: B, b&: C);
    // Likewise, exchanging B and C swaps the table entries where B and C
    // differ.
    // Swap bits 1/2 and 5/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0x99;
    if (OldImm & 0x02) Imm |= 0x04;
    if (OldImm & 0x04) Imm |= 0x02;
    if (OldImm & 0x20) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x20;
  }

  SDLoc DL(Root);

  SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);

  MVT NVT = Root->getSimpleValueType(ResNo: 0);

  MachineSDNode *MNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);

    // Pick the opcode by memory-operand kind (broadcast vs. full load),
    // element size and vector width.
    unsigned Opc;
    if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
      unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
      assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");

      bool UseD = EltSize == 32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
      else
        llvm_unreachable("Unexpected vector size!");
    } else {
      bool UseD = NVT.getVectorElementType() == MVT::i32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
      else
        llvm_unreachable("Unexpected vector size!");
    }

    SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);

    // Update the chain.
    ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
  } else {
    // Register-only form.
    bool UseD = NVT.getVectorElementType() == MVT::i32;
    unsigned Opc;
    if (NVT.is128BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
    else if (NVT.is256BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
    else if (NVT.is512BitVector())
      Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
    else
      llvm_unreachable("Unexpected vector size!");

    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
4817
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle more complex patterns that use an operand more than once?
//
// Computes the ternlog truth-table immediate by "evaluating" the matched
// logic expression on magic constants whose bit patterns are the truth
// tables of the three inputs, then hands off to matchVPTERNLOG. Returns
// true if a VPTERNLOG was emitted.
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  // Returns Op if it is a one-use AND/OR/XOR/ANDNP (looking through a
  // one-use bitcast), otherwise a null SDValue.
  auto getFoldableLogicOp = [](SDValue Op) {
    // Peek through single use bitcast.
    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
      Op = Op.getOperand(i: 0);

    if (!Op.hasOneUse())
      return SDValue();

    unsigned Opc = Op.getOpcode();
    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
        Opc == X86ISD::ANDNP)
      return Op;

    return SDValue();
  };

  SDValue N0, N1, A, FoldableOp;

  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
        ISD::isBuildVectorAllOnes(N: Op->getOperand(Num: 1).getNode())) {
      SDValue InnerOp = getFoldableLogicOp(Op->getOperand(Num: 0));

      if (!InnerOp)
        return SDValue();

      // One operand of the inner op must itself be a foldable logic op so we
      // have two fused logic stages.
      N0 = InnerOp.getOperand(i: 0);
      N1 = InnerOp.getOperand(i: 1);
      if ((FoldableOp = getFoldableLogicOp(N1))) {
        A = N0;
        return InnerOp;
      }
      if ((FoldableOp = getFoldableLogicOp(N0))) {
        A = N1;
        return InnerOp;
      }
    }
    return SDValue();
  };

  bool PeeledOuterNot = false;
  SDNode *OriN = N;
  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
    PeeledOuterNot = true;
    N = InnerOp.getNode();
  } else {
    N0 = N->getOperand(Num: 0);
    N1 = N->getOperand(Num: 1);

    if ((FoldableOp = getFoldableLogicOp(N1)))
      A = N0;
    else if ((FoldableOp = getFoldableLogicOp(N0)))
      A = N1;
    else
      return false;
  }

  SDValue B = FoldableOp.getOperand(i: 0);
  SDValue C = FoldableOp.getOperand(i: 1);
  SDNode *ParentA = N;
  SDNode *ParentB = FoldableOp.getNode();
  SDNode *ParentC = FoldableOp.getNode();

  // We can build the appropriate control immediate by performing the logic
  // operation we're matching using these constants for A, B, and C.
  // 0xf0/0xcc/0xaa are the 8-entry truth tables of the inputs A, B and C
  // themselves, indexed by (A<<2 | B<<1 | C).
  uint8_t TernlogMagicA = 0xf0;
  uint8_t TernlogMagicB = 0xcc;
  uint8_t TernlogMagicC = 0xaa;

  // Some of the inputs may be inverted, peek through them and invert the
  // magic values accordingly.
  // TODO: There may be a bitcast before the xor that we should peek through.
  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
        ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
      Magic = ~Magic;
      Parent = Op.getNode();
      Op = Op.getOperand(i: 0);
    }
  };

  PeekThroughNot(A, ParentA, TernlogMagicA);
  PeekThroughNot(B, ParentB, TernlogMagicB);
  PeekThroughNot(C, ParentC, TernlogMagicC);

  // Apply the inner op to B and C's truth tables...
  uint8_t Imm;
  switch (FoldableOp.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
  case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
  case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  }

  // ...then combine with A's truth table via the outer op.
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::ANDNP:
    // ANDNP inverts its first operand; which side A is on matters.
    if (A == N0)
      Imm &= ~TernlogMagicA;
    else
      Imm = ~(Imm) & TernlogMagicA;
    break;
  case ISD::AND: Imm &= TernlogMagicA; break;
  case ISD::OR: Imm |= TernlogMagicA; break;
  case ISD::XOR: Imm ^= TernlogMagicA; break;
  }

  // Account for the outer NOT we peeled off, if any.
  if (PeeledOuterNot)
    Imm = ~Imm;

  return matchVPTERNLOG(Root: OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
}
4945
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(ResNo: 0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink any
  // further.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(width: 32);
  }

  SDValue And0 = And->getOperand(Num: 0);
  APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
  // Candidate mask: original mask with all its leading zeros set to one.
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(width: 64);
    HighZeros = HighZeros.zext(width: 64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  // TODO: Handle constant folding?
  KnownBits Known0 = CurDAG->computeKnownBits(Op: And0);
  if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(F: And, T: And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
  SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
  ReplaceNode(F: And, T: NewAnd.getNode());
  SelectCode(N: NewAnd.getNode());
  return true;
}
5018
// Return the VPTESTM (IsTestN == false) or VPTESTNM (IsTestN == true)
// machine opcode for the given vector type and operand form: plain register,
// folded full-width load ("rm"), or folded broadcast load ("rmb"), each with
// an optional mask-register variant ("k" suffix) selected by Masked.
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;


#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  // Broadcast forms only exist for 32/64-bit elements, hence the smaller
  // case table.
  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  // Full-width memory operand.
  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  // Register-register form.
  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}
5066
5067static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg,
5068 const MachineRegisterInfo &MRI) {
5069 auto GetPhysReg = [&](SDValue V) -> Register {
5070 if (V.getOpcode() != ISD::CopyFromReg)
5071 return Register();
5072 Register Reg = cast<RegisterSDNode>(Val: V.getOperand(i: 1))->getReg();
5073 if (Reg.isVirtual())
5074 return MRI.getLiveInPhysReg(VReg: Reg);
5075 return Reg;
5076 };
5077
5078 if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
5079 std::swap(a&: N0, b&: N1);
5080}
5081
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation. Matches a setcc of (optionally an AND of) a
// vector against all-zeros, folding a load/broadcast operand when possible
// and widening to 512 bits when VLX is unavailable.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!");

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(i: 0);
  SDValue SetccOp1 = Setcc.getOperand(i: 1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
    std::swap(a&: SetccOp0, b&: SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(i: 0);

    // Look for single use AND: (and x, y) == 0 maps directly onto the
    // two-source VPTESTM test.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(i: 0);
      Src1 = N0Temp.getOperand(i: 1);
    }
  }

  // Without VLX we need to widen the operation.
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
                                SDValue &Base, SDValue &Scale, SDValue &Index,
                                SDValue &Disp, SDValue &Segment) {
    // If we need to widen, we can't fold the load.
    if (!Widen)
      if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
        return true;

    // If we didn't fold a load, try to match broadcast. No widening limitation
    // for this. But only 32 and 64 bit types are supported.
    if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
      return false;

    // Look through single use bitcasts.
    if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
      P = L.getNode();
      L = L.getOperand(i: 0);
    }

    if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
      return false;

    // The broadcast memory width must match the compare element width.
    auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
    if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
      return false;

    return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
  };

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (CanFoldLoads) {
    FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
                                    Tmp3, Tmp4);
    if (!FoldedLoad) {
      // And is commutative.
      FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
                                      Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(a&: Src0, b&: Src1);
    }
  }

  bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
    MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts);
    // The extra elements are undef; only the low subreg holds real data.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl,
                                                     VT: CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);

    // A folded broadcast already supplies a full-width value, so only the
    // register operand needs widening.
    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
      SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
                                              dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
    }
  }

  // (x == 0) means "no bits set", which is the VPTESTNM polarity.
  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               Masked: IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(i: 0) };
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(i: 0) };
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
    else
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
    SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
    CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
                                   dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
5249
5250// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5251// into vpternlog.
5252bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5253 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5254
5255 MVT NVT = N->getSimpleValueType(ResNo: 0);
5256
5257 // Make sure we support VPTERNLOG.
5258 if (!NVT.isVector() || !Subtarget->hasAVX512())
5259 return false;
5260
5261 // We need VLX for 128/256-bit.
5262 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5263 return false;
5264
5265 SDValue N0 = N->getOperand(Num: 0);
5266 SDValue N1 = N->getOperand(Num: 1);
5267
5268 // Canonicalize AND to LHS.
5269 if (N1.getOpcode() == ISD::AND)
5270 std::swap(a&: N0, b&: N1);
5271
5272 if (N0.getOpcode() != ISD::AND ||
5273 N1.getOpcode() != X86ISD::ANDNP ||
5274 !N0.hasOneUse() || !N1.hasOneUse())
5275 return false;
5276
5277 // ANDN is not commutable, use it to pick down A and C.
5278 SDValue A = N1.getOperand(i: 0);
5279 SDValue C = N1.getOperand(i: 1);
5280
5281 // AND is commutable, if one operand matches A, the other operand is B.
5282 // Otherwise this isn't a match.
5283 SDValue B;
5284 if (N0.getOperand(i: 0) == A)
5285 B = N0.getOperand(i: 1);
5286 else if (N0.getOperand(i: 1) == A)
5287 B = N0.getOperand(i: 0);
5288 else
5289 return false;
5290
5291 SDLoc dl(N);
5292 SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8);
5293 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5294 ReplaceNode(F: N, T: Ternlog.getNode());
5295
5296 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5297 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5298}
5299
5300void X86DAGToDAGISel::Select(SDNode *Node) {
5301 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5302 unsigned Opcode = Node->getOpcode();
5303 SDLoc dl(Node);
5304
5305 if (Node->isMachineOpcode()) {
5306 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5307 Node->setNodeId(-1);
5308 return; // Already selected.
5309 }
5310
5311 switch (Opcode) {
5312 default: break;
5313 case ISD::INTRINSIC_W_CHAIN: {
5314 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5315 switch (IntNo) {
5316 default: break;
5317 case Intrinsic::x86_encodekey128:
5318 case Intrinsic::x86_encodekey256: {
5319 if (!Subtarget->hasKL())
5320 break;
5321
5322 unsigned Opcode;
5323 switch (IntNo) {
5324 default: llvm_unreachable("Impossible intrinsic");
5325 case Intrinsic::x86_encodekey128:
5326 Opcode = X86::ENCODEKEY128;
5327 break;
5328 case Intrinsic::x86_encodekey256:
5329 Opcode = X86::ENCODEKEY256;
5330 break;
5331 }
5332
5333 SDValue Chain = Node->getOperand(Num: 0);
5334 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3),
5335 Glue: SDValue());
5336 if (Opcode == X86::ENCODEKEY256)
5337 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4),
5338 Glue: Chain.getValue(R: 1));
5339
5340 MachineSDNode *Res = CurDAG->getMachineNode(
5341 Opcode, dl, VTs: Node->getVTList(),
5342 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5343 ReplaceNode(F: Node, T: Res);
5344 return;
5345 }
5346 case Intrinsic::x86_tileloaddrs64_internal:
5347 case Intrinsic::x86_tileloaddrst164_internal:
5348 if (!Subtarget->hasAMXMOVRS())
5349 break;
5350 [[fallthrough]];
5351 case Intrinsic::x86_tileloadd64_internal:
5352 case Intrinsic::x86_tileloaddt164_internal: {
5353 if (!Subtarget->hasAMXTILE())
5354 break;
5355 auto *MFI =
5356 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5357 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5358 unsigned Opc;
5359 switch (IntNo) {
5360 default:
5361 llvm_unreachable("Unexpected intrinsic!");
5362 case Intrinsic::x86_tileloaddrs64_internal:
5363 Opc = X86::PTILELOADDRSV;
5364 break;
5365 case Intrinsic::x86_tileloaddrst164_internal:
5366 Opc = X86::PTILELOADDRST1V;
5367 break;
5368 case Intrinsic::x86_tileloadd64_internal:
5369 Opc = X86::PTILELOADDV;
5370 break;
5371 case Intrinsic::x86_tileloaddt164_internal:
5372 Opc = X86::PTILELOADDT1V;
5373 break;
5374 }
5375 // _tile_loadd_internal(row, col, buf, STRIDE)
5376 SDValue Base = Node->getOperand(Num: 4);
5377 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5378 SDValue Index = Node->getOperand(Num: 5);
5379 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5380 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5381 SDValue Chain = Node->getOperand(Num: 0);
5382 MachineSDNode *CNode;
5383 SDValue Ops[] = {Node->getOperand(Num: 2),
5384 Node->getOperand(Num: 3),
5385 Base,
5386 Scale,
5387 Index,
5388 Disp,
5389 Segment,
5390 Chain};
5391 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops);
5392 ReplaceNode(F: Node, T: CNode);
5393 return;
5394 }
5395 }
5396 break;
5397 }
5398 case ISD::INTRINSIC_VOID: {
5399 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5400 switch (IntNo) {
5401 default: break;
5402 case Intrinsic::x86_sse3_monitor:
5403 case Intrinsic::x86_monitorx:
5404 case Intrinsic::x86_clzero: {
5405 bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64;
5406
5407 unsigned Opc = 0;
5408 switch (IntNo) {
5409 default: llvm_unreachable("Unexpected intrinsic!");
5410 case Intrinsic::x86_sse3_monitor:
5411 if (!Subtarget->hasSSE3())
5412 break;
5413 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5414 break;
5415 case Intrinsic::x86_monitorx:
5416 if (!Subtarget->hasMWAITX())
5417 break;
5418 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5419 break;
5420 case Intrinsic::x86_clzero:
5421 if (!Subtarget->hasCLZERO())
5422 break;
5423 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5424 break;
5425 }
5426
5427 if (Opc) {
5428 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5429 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5430 N: Node->getOperand(Num: 2), Glue: SDValue());
5431 SDValue InGlue = Chain.getValue(R: 1);
5432
5433 if (IntNo == Intrinsic::x86_sse3_monitor ||
5434 IntNo == Intrinsic::x86_monitorx) {
5435 // Copy the other two operands to ECX and EDX.
5436 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3),
5437 Glue: InGlue);
5438 InGlue = Chain.getValue(R: 1);
5439 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4),
5440 Glue: InGlue);
5441 InGlue = Chain.getValue(R: 1);
5442 }
5443
5444 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other,
5445 Ops: { Chain, InGlue});
5446 ReplaceNode(F: Node, T: CNode);
5447 return;
5448 }
5449
5450 break;
5451 }
5452 case Intrinsic::x86_tilestored64_internal: {
5453 auto *MFI =
5454 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5455 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5456 unsigned Opc = X86::PTILESTOREDV;
5457 // _tile_stored_internal(row, col, buf, STRIDE, c)
5458 SDValue Base = Node->getOperand(Num: 4);
5459 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5460 SDValue Index = Node->getOperand(Num: 5);
5461 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5462 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5463 SDValue Chain = Node->getOperand(Num: 0);
5464 MachineSDNode *CNode;
5465 SDValue Ops[] = {Node->getOperand(Num: 2),
5466 Node->getOperand(Num: 3),
5467 Base,
5468 Scale,
5469 Index,
5470 Disp,
5471 Segment,
5472 Node->getOperand(Num: 6),
5473 Chain};
5474 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5475 ReplaceNode(F: Node, T: CNode);
5476 return;
5477 }
5478 case Intrinsic::x86_tileloaddrs64:
5479 case Intrinsic::x86_tileloaddrst164:
5480 if (!Subtarget->hasAMXMOVRS())
5481 break;
5482 [[fallthrough]];
5483 case Intrinsic::x86_tileloadd64:
5484 case Intrinsic::x86_tileloaddt164:
5485 case Intrinsic::x86_tilestored64: {
5486 if (!Subtarget->hasAMXTILE())
5487 break;
5488 auto *MFI =
5489 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5490 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5491 unsigned Opc;
5492 switch (IntNo) {
5493 default: llvm_unreachable("Unexpected intrinsic!");
5494 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5495 case Intrinsic::x86_tileloaddrs64:
5496 Opc = X86::PTILELOADDRS;
5497 break;
5498 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5499 case Intrinsic::x86_tileloaddrst164:
5500 Opc = X86::PTILELOADDRST1;
5501 break;
5502 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5503 }
5504 // FIXME: Match displacement and scale.
5505 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5506 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5507 SDValue Base = Node->getOperand(Num: 3);
5508 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5509 SDValue Index = Node->getOperand(Num: 4);
5510 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5511 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5512 SDValue Chain = Node->getOperand(Num: 0);
5513 MachineSDNode *CNode;
5514 if (Opc == X86::PTILESTORED) {
5515 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5516 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5517 } else {
5518 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5519 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5520 }
5521 ReplaceNode(F: Node, T: CNode);
5522 return;
5523 }
5524 }
5525 break;
5526 }
5527 case ISD::BRIND:
5528 case X86ISD::NT_BRIND: {
5529 if (Subtarget->isTarget64BitILP32()) {
5530 // Converts a 32-bit register to a 64-bit, zero-extended version of
5531 // it. This is needed because x86-64 can do many things, but jmp %r32
5532 // ain't one of them.
5533 SDValue Target = Node->getOperand(Num: 1);
5534 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5535 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64);
5536 SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other,
5537 N1: Node->getOperand(Num: 0), N2: ZextTarget);
5538 ReplaceNode(F: Node, T: Brind.getNode());
5539 SelectCode(N: ZextTarget.getNode());
5540 SelectCode(N: Brind.getNode());
5541 return;
5542 }
5543 break;
5544 }
5545 case X86ISD::GlobalBaseReg:
5546 ReplaceNode(F: Node, T: getGlobalBaseReg());
5547 return;
5548
5549 case ISD::BITCAST:
5550 // Just drop all 128/256/512-bit bitcasts.
5551 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5552 NVT == MVT::f128) {
5553 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5554 CurDAG->RemoveDeadNode(N: Node);
5555 return;
5556 }
5557 break;
5558
5559 case ISD::SRL:
5560 if (matchBitExtract(Node))
5561 return;
5562 [[fallthrough]];
5563 case ISD::SRA:
5564 case ISD::SHL:
5565 if (tryShiftAmountMod(N: Node))
5566 return;
5567 break;
5568
5569 case X86ISD::VPTERNLOG: {
5570 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5571 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5572 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5573 return;
5574 break;
5575 }
5576
5577 case X86ISD::ANDNP:
5578 if (tryVPTERNLOG(N: Node))
5579 return;
5580 break;
5581
5582 case ISD::AND:
5583 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5584 // Try to form a masked VPTESTM. Operands can be in either order.
5585 SDValue N0 = Node->getOperand(Num: 0);
5586 SDValue N1 = Node->getOperand(Num: 1);
5587 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5588 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5589 return;
5590 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5591 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5592 return;
5593 }
5594
5595 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5596 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5597 CurDAG->RemoveDeadNode(N: Node);
5598 return;
5599 }
5600 if (matchBitExtract(Node))
5601 return;
5602 if (AndImmShrink && shrinkAndImmediate(And: Node))
5603 return;
5604
5605 [[fallthrough]];
5606 case ISD::OR:
5607 case ISD::XOR:
5608 if (tryShrinkShlLogicImm(N: Node))
5609 return;
5610 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5611 return;
5612 if (tryVPTERNLOG(N: Node))
5613 return;
5614
5615 [[fallthrough]];
5616 case ISD::ADD:
5617 if (Opcode == ISD::ADD && matchBitExtract(Node))
5618 return;
5619 [[fallthrough]];
5620 case ISD::SUB: {
5621 // Try to avoid folding immediates with multiple uses for optsize.
5622 // This code tries to select to register form directly to avoid going
5623 // through the isel table which might fold the immediate. We can't change
5624 // the patterns on the add/sub/and/or/xor with immediate paterns in the
5625 // tablegen files to check immediate use count without making the patterns
5626 // unavailable to the fast-isel table.
5627 if (!CurDAG->shouldOptForSize())
5628 break;
5629
5630 // Only handle i8/i16/i32/i64.
5631 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5632 break;
5633
5634 SDValue N0 = Node->getOperand(Num: 0);
5635 SDValue N1 = Node->getOperand(Num: 1);
5636
5637 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5638 if (!Cst)
5639 break;
5640
5641 int64_t Val = Cst->getSExtValue();
5642
5643 // Make sure its an immediate that is considered foldable.
5644 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5645 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5646 break;
5647
5648 // If this can match to INC/DEC, let it go.
5649 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5650 break;
5651
5652 // Check if we should avoid folding this immediate.
5653 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5654 break;
5655
5656 // We should not fold the immediate. So we need a register form instead.
5657 unsigned ROpc, MOpc;
5658 switch (NVT.SimpleTy) {
5659 default: llvm_unreachable("Unexpected VT!");
5660 case MVT::i8:
5661 switch (Opcode) {
5662 default: llvm_unreachable("Unexpected opcode!");
5663 case ISD::ADD:
5664 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5665 MOpc = GET_NDM_IF_ENABLED(X86::ADD8rm);
5666 break;
5667 case ISD::SUB:
5668 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5669 MOpc = GET_NDM_IF_ENABLED(X86::SUB8rm);
5670 break;
5671 case ISD::AND:
5672 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5673 MOpc = GET_NDM_IF_ENABLED(X86::AND8rm);
5674 break;
5675 case ISD::OR:
5676 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5677 MOpc = GET_NDM_IF_ENABLED(X86::OR8rm);
5678 break;
5679 case ISD::XOR:
5680 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5681 MOpc = GET_NDM_IF_ENABLED(X86::XOR8rm);
5682 break;
5683 }
5684 break;
5685 case MVT::i16:
5686 switch (Opcode) {
5687 default: llvm_unreachable("Unexpected opcode!");
5688 case ISD::ADD:
5689 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5690 MOpc = GET_NDM_IF_ENABLED(X86::ADD16rm);
5691 break;
5692 case ISD::SUB:
5693 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5694 MOpc = GET_NDM_IF_ENABLED(X86::SUB16rm);
5695 break;
5696 case ISD::AND:
5697 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5698 MOpc = GET_NDM_IF_ENABLED(X86::AND16rm);
5699 break;
5700 case ISD::OR:
5701 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5702 MOpc = GET_NDM_IF_ENABLED(X86::OR16rm);
5703 break;
5704 case ISD::XOR:
5705 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5706 MOpc = GET_NDM_IF_ENABLED(X86::XOR16rm);
5707 break;
5708 }
5709 break;
5710 case MVT::i32:
5711 switch (Opcode) {
5712 default: llvm_unreachable("Unexpected opcode!");
5713 case ISD::ADD:
5714 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5715 MOpc = GET_NDM_IF_ENABLED(X86::ADD32rm);
5716 break;
5717 case ISD::SUB:
5718 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5719 MOpc = GET_NDM_IF_ENABLED(X86::SUB32rm);
5720 break;
5721 case ISD::AND:
5722 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5723 MOpc = GET_NDM_IF_ENABLED(X86::AND32rm);
5724 break;
5725 case ISD::OR:
5726 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5727 MOpc = GET_NDM_IF_ENABLED(X86::OR32rm);
5728 break;
5729 case ISD::XOR:
5730 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5731 MOpc = GET_NDM_IF_ENABLED(X86::XOR32rm);
5732 break;
5733 }
5734 break;
5735 case MVT::i64:
5736 switch (Opcode) {
5737 default: llvm_unreachable("Unexpected opcode!");
5738 case ISD::ADD:
5739 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5740 MOpc = GET_NDM_IF_ENABLED(X86::ADD64rm);
5741 break;
5742 case ISD::SUB:
5743 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5744 MOpc = GET_NDM_IF_ENABLED(X86::SUB64rm);
5745 break;
5746 case ISD::AND:
5747 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5748 MOpc = GET_NDM_IF_ENABLED(X86::AND64rm);
5749 break;
5750 case ISD::OR:
5751 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5752 MOpc = GET_NDM_IF_ENABLED(X86::OR64rm);
5753 break;
5754 case ISD::XOR:
5755 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5756 MOpc = GET_NDM_IF_ENABLED(X86::XOR64rm);
5757 break;
5758 }
5759 break;
5760 }
5761
5762 // Ok this is a AND/OR/XOR/ADD/SUB with constant.
5763
5764 // If this is a not a subtract, we can still try to fold a load.
5765 if (Opcode != ISD::SUB) {
5766 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5767 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5768 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5769 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5770 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5771 // Update the chain.
5772 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5773 // Record the mem-refs
5774 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5775 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5776 CurDAG->RemoveDeadNode(N: Node);
5777 return;
5778 }
5779 }
5780
5781 CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1);
5782 return;
5783 }
5784
5785 case X86ISD::SMUL:
5786 // i16/i32/i64 are handled with isel patterns.
5787 if (NVT != MVT::i8)
5788 break;
5789 [[fallthrough]];
5790 case X86ISD::UMUL: {
5791 SDValue N0 = Node->getOperand(Num: 0);
5792 SDValue N1 = Node->getOperand(Num: 1);
5793
5794 unsigned LoReg, ROpc, MOpc;
5795 switch (NVT.SimpleTy) {
5796 default: llvm_unreachable("Unsupported VT!");
5797 case MVT::i8:
5798 LoReg = X86::AL;
5799 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5800 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5801 break;
5802 case MVT::i16:
5803 LoReg = X86::AX;
5804 ROpc = X86::MUL16r;
5805 MOpc = X86::MUL16m;
5806 break;
5807 case MVT::i32:
5808 LoReg = X86::EAX;
5809 ROpc = X86::MUL32r;
5810 MOpc = X86::MUL32m;
5811 break;
5812 case MVT::i64:
5813 LoReg = X86::RAX;
5814 ROpc = X86::MUL64r;
5815 MOpc = X86::MUL64m;
5816 break;
5817 }
5818
5819 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5820 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5821 // Multiply is commutative.
5822 if (!FoldedLoad) {
5823 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5824 if (FoldedLoad)
5825 std::swap(a&: N0, b&: N1);
5826 }
5827
5828 // UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
5829 // operand that's already there to avoid an extra register-to-register move.
5830 if (!FoldedLoad)
5831 orderRegForMul(N0, N1, LoReg, MRI: CurDAG->getMachineFunction().getRegInfo());
5832
5833 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5834 N: N0, Glue: SDValue()).getValue(R: 1);
5835
5836 MachineSDNode *CNode;
5837 if (FoldedLoad) {
5838 // i16/i32/i64 use an instruction that produces a low and high result even
5839 // though only the low result is used.
5840 SDVTList VTs;
5841 if (NVT == MVT::i8)
5842 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5843 else
5844 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other);
5845
5846 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5847 InGlue };
5848 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5849
5850 // Update the chain.
5851 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5852 // Record the mem-refs
5853 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5854 } else {
5855 // i16/i32/i64 use an instruction that produces a low and high result even
5856 // though only the low result is used.
5857 SDVTList VTs;
5858 if (NVT == MVT::i8)
5859 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32);
5860 else
5861 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32);
5862
5863 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5864 }
5865
5866 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5867 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5868 CurDAG->RemoveDeadNode(N: Node);
5869 return;
5870 }
5871
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    // Widening multiply producing both halves of the double-width product.
    // Only i32/i64 reach here (see the NVT switch below). Unsigned multiplies
    // on BMI2 targets can use MULX, which takes its implicit source in
    // EDX/RDX, does not clobber EFLAGS, and has a "high-only" register form
    // usable when the low half is dead.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned Opc, MOpc;       // Register and memory-operand instruction forms.
    unsigned LoReg, HiReg;    // Implicit result registers for MUL/IMUL.
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    // If nothing uses the low half, select the high-half-only MULX variant.
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc = UseMULXHi ? X86::MULX32Hrr
            : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
            : IsSigned ? X86::IMUL32r
            : X86::MUL32r;
      MOpc = UseMULXHi ? X86::MULX32Hrm
             : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
             : IsSigned ? X86::IMUL32m
             : X86::MUL32m;
      // MULX reads its implicit source from EDX; MUL/IMUL read EAX.
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi ? X86::MULX64Hrr
            : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
            : IsSigned ? X86::IMUL64r
            : X86::MUL64r;
      MOpc = UseMULXHi ? X86::MULX64Hrm
             : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
             : IsSigned ? X86::IMUL64m
             : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    // Try to fold a load of either operand into the multiply's memory form.
    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
      if (foldedLoad)
        std::swap(a&: N0, b&: N1);
    }

    // UMUL/SMUL_LOHI has an implicit source in LoReg (RDX for MULX, RAX for
    // MUL/IMUL). Prefer the operand that's already there.
    if (!foldedLoad)
      orderRegForMul(N0, N1, LoReg, MRI: CurDAG->getMachineFunction().getRegInfo());

    // Move N0 into the implicit source register and glue it to the multiply.
    SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                          N: N0, Glue: SDValue()).getValue(R: 1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      if (UseMULXHi) {
        // High half + chain; low half is dead.
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        // MULX produces both halves as explicit results (hi first).
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        // MUL/IMUL results live in implicit registers; only chain + glue here.
        // The halves are read back below via CopyFromReg.
        SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InGlue = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(F: N1.getValue(R: 1), T: Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(VT: NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        // Glue-only node; results are read from LoReg/HiReg below.
        SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        InGlue = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                       VT: NVT, Glue: InGlue);
        InGlue = ResLo.getValue(R: 2);
      }
      ReplaceUses(F: SDValue(Node, 0), T: ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
                                       VT: NVT, Glue: InGlue);
        InGlue = ResHi.getValue(R: 2);
      }
      ReplaceUses(F: SDValue(Node, 1), T: ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6000
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    // Combined divide+remainder. x86 DIV/IDIV take the dividend implicitly in
    // AX / DX:AX / EDX:EAX / RDX:RAX and produce quotient in the low register
    // and remainder in the high register (AH for i8).
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned ROpc, MOpc;    // Register and memory-operand divide opcodes.
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    // LoReg: quotient result / low dividend input. HiReg: remainder result.
    // ClrReg: register to zero when zero-extending the dividend.
    // SExtOpcode: instruction that sign-extends Lo into Hi (CWD/CDQ/CQO).
    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // If the dividend's sign bit is known zero, a zero extension suffices
    // even for signed division.
    bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);

    SDValue InGlue;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        // Fold the dividend load directly into the extending move.
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(F: N0.getValue(R: 1), T: Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain  = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0),
                                    Glue: SDValue());
      InGlue = Chain.getValue(R: 1);
    } else {
      // Put the dividend's low half into LoReg, then either sign-extend it
      // into the high register (CWD/CDQ/CQO) or explicitly zero the high
      // register for the unsigned / known-nonnegative case.
      InGlue =
        CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
                             Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InGlue =
          SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
        SDValue ClrNode =
            SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
        // MOV32r0 always produces i32; narrow/widen the zero to match NVT.
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode,
                          Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl,
                                                    VT: MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode = SDValue(
              CurDAG->getMachineNode(
                  Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, Op1: ClrNode,
                  Op2: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
              0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
                                      N: ClrNode, Glue: InGlue).getValue(R: 1);
      }
    }

    if (foldedLoad) {
      // Divisor comes from memory; the divide carries the load's chain.
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      MachineSDNode *CNode =
        CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops);
      InGlue = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      InGlue =
          SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32,
                                             VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue);
      SDValue Result(RNode, 0);
      InGlue = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result);

      ReplaceUses(F: SDValue(Node, 1), T: Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
                                              Reg: LoReg, VT: NVT, Glue: InGlue);
      InGlue = Result.getValue(R: 2);
      ReplaceUses(F: SDValue(Node, 0), T: Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
                                              Reg: HiReg, VT: NVT, Glue: InGlue);
      InGlue = Result.getValue(R: 2);
      ReplaceUses(F: SDValue(Node, 1), T: Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6177
  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    // x87-only floating-point compare: emit (U)COM + FNSTSW + SAHF to get the
    // result into EFLAGS. Strict variants carry a chain as operand 0.
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;

    // STRICT_FCMPS is a signaling compare (raises on quiet NaNs): use COM,
    // otherwise the non-signaling UCOM.
    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    // Strict compares thread the incoming chain through the compare node.
    SDValue Chain =
        IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
      Glue = Chain.getValue(R: 1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
      SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
      CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0);

    // For strict compares, the chain result must be rewired as well.
    if (IsStrictCmp)
      ReplaceUses(F: SDValue(Node, 1), T: Chain);

    ReplaceUses(F: SDValue(Node, 0), T: SAHF);
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6244
  case X86ISD::CMP: {
    // Compare against zero: try to turn (cmp X, 0) into a cheaper TEST, a
    // BEXTR+TEST, or a shift that sets the needed flags directly.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    // Optimizations for TEST compares.
    if (!isNullConstant(V: N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR);
        ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(N: Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(i: 0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) &&
          onlyUsesZeroFlag(Flags: SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE; // Sentinel: no match yet.
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
        unsigned TrailingZeros = llvm::countr_zero(Val: Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          // Emit shift + (optionally subreg-narrowed) self-TEST.
          SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32,
                                     Op1: N0.getOperand(i: 0), Op2: ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift);
          ReplaceNode(F: Node, T: Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(x: Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(x: Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwize, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
      SDValue Reg = N0.getOperand(i: 0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
          // Don't narrow a volatile/atomic load's access width: only fold if
          // the memory TEST reads exactly as many bits as the original load.
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(i: 0) };
        NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: NewNode,
                               NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);

        NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(F: Node, T: NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    // SSE4.2 implicit-length string compare. The node has three results:
    // index (0), mask (1), and flags (2). PCMPISTRI produces the index and
    // PCMPISTRM the mask, so we may need one or two instructions.
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node);
      ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
    }
    // Emit the index form when the index is used, or as the sole instruction
    // when neither index nor mask is needed (flags still are).
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node);
      ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    // SSE4.2 explicit-length string compare. Like PCMPISTR above, but the
    // string lengths are passed implicitly in EAX and EDX, so both must be
    // copied in and glued to the compare instruction(s).
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX,
                                          N: Node->getOperand(Num: 1),
                                          Glue: SDValue()).getValue(R: 1);
    InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX,
                                  N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue);
      ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
    }
    // Emit the index form when the index is used, or as the sole instruction
    // when neither index nor mask is needed (flags still are).
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue);
      ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6521
  case ISD::SETCC: {
    // Vector setcc may be selectable as a VPTESTM-style mask test (see
    // tryVPTESTM); otherwise fall through to table-generated selection.
    if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue()))
      return;

    break;
  }
6528
  case ISD::STORE:
    // Try to fold a matching load-op-store sequence into a single read-
    // modify-write memory instruction (see foldLoadStoreIntoMemOperand);
    // otherwise fall through to the default store selection.
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
6533
  case X86ISD::SETCC_CARRY: {
    // Materialize 0/-1 from the carry flag (classically "sbb reg, reg").
    MVT VT = Node->getSimpleValueType(ResNo: 0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
                               N: Node->getOperand(Num: 1), Glue: SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(N: Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
    }

    ReplaceUses(F: SDValue(Node, 0), T: Result);
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
  case X86ISD::SBB: {
    // Special-case (sbb 0, 0): both value operands are zero, so the result
    // depends only on the incoming carry; lower via getSBBZero.
    if (isNullConstant(V: Node->getOperand(Num: 0)) &&
        isNullConstant(V: Node->getOperand(Num: 1))) {
      SDValue Result = getSBBZero(N: Node);

      // Replace the flag use.
      ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(ResNo: 0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
        }
        ReplaceUses(F: SDValue(Node, 0), T: Result);
      }

      CurDAG->RemoveDeadNode(N: Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    // Masked gather: pick the concrete (V)GATHER opcode from the index VT,
    // element count, element size, and whether the mask is an AVX-512 k-mask
    // (vXi1) or an AVX2-style vector mask.
    auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(ResNo: 0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on type what
    // a type constraint would say just like table based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0; // 0 means no opcode matched; bail out below.
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    // Decompose the vector address into the standard 5-operand memory form.
    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other);

    // AVX-512 and AVX2 forms place the mask operand differently.
    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()});
    // Result 1 of the new node is the mask output, which the ISD node lacks;
    // the ISD chain (result 1) maps to the machine node's chain (result 2).
    ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
    ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6689 case X86ISD::MSCATTER: {
6690 auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node);
6691 SDValue Value = Sc->getValue();
6692 SDValue IndexOp = Sc->getIndex();
6693 MVT IndexVT = IndexOp.getSimpleValueType();
6694 MVT ValueVT = Value.getSimpleValueType();
6695
    // This is just to prevent crashes if the nodes are malformed somehow.
    // Otherwise we only do loose type checking here, based on what a type
    // constraint would allow, just like table-based isel.
6699 if (!ValueVT.isVector())
6700 break;
6701
6702 unsigned NumElts = ValueVT.getVectorNumElements();
6703 MVT ValueSVT = ValueVT.getVectorElementType();
6704
6705 bool IsFP = ValueSVT.isFloatingPoint();
6706 unsigned EltSize = ValueSVT.getSizeInBits();
6707
6708 unsigned Opc;
6709 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6710 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6711 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6712 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6713 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6714 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6715 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6716 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6717 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6718 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6719 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6720 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6721 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6722 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6723 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6724 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6725 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6726 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6727 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6728 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6729 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6730 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6731 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6732 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6733 else
6734 break;
6735
6736 SDValue Base, Scale, Index, Disp, Segment;
6737 if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(),
6738 Base, Scale, Index, Disp, Segment))
6739 break;
6740
6741 SDValue Mask = Sc->getMask();
6742 SDValue Chain = Sc->getChain();
6743 // Scatter instructions have a mask output not in the ISD node.
6744 SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other);
6745 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6746
6747 MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6748 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()});
6749 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1));
6750 CurDAG->RemoveDeadNode(N: Node);
6751 return;
6752 }
6753 case ISD::PREALLOCATED_SETUP: {
6754 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6755 auto CallId = MFI->getPreallocatedIdForCallSite(
6756 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6757 SDValue Chain = Node->getOperand(Num: 0);
6758 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6759 MachineSDNode *New = CurDAG->getMachineNode(
6760 Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain);
6761 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain
6762 CurDAG->RemoveDeadNode(N: Node);
6763 return;
6764 }
6765 case ISD::PREALLOCATED_ARG: {
6766 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6767 auto CallId = MFI->getPreallocatedIdForCallSite(
6768 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6769 SDValue Chain = Node->getOperand(Num: 0);
6770 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6771 SDValue ArgIndex = Node->getOperand(Num: 2);
6772 SDValue Ops[3];
6773 Ops[0] = CallIdValue;
6774 Ops[1] = ArgIndex;
6775 Ops[2] = Chain;
6776 MachineSDNode *New = CurDAG->getMachineNode(
6777 Opcode: TargetOpcode::PREALLOCATED_ARG, dl,
6778 VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()),
6779 VT2: MVT::Other),
6780 Ops);
6781 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer
6782 ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain
6783 CurDAG->RemoveDeadNode(N: Node);
6784 return;
6785 }
6786 case X86ISD::AESENCWIDE128KL:
6787 case X86ISD::AESDECWIDE128KL:
6788 case X86ISD::AESENCWIDE256KL:
6789 case X86ISD::AESDECWIDE256KL: {
6790 if (!Subtarget->hasWIDEKL())
6791 break;
6792
6793 unsigned Opcode;
6794 switch (Node->getOpcode()) {
6795 default:
6796 llvm_unreachable("Unexpected opcode!");
6797 case X86ISD::AESENCWIDE128KL:
6798 Opcode = X86::AESENCWIDE128KL;
6799 break;
6800 case X86ISD::AESDECWIDE128KL:
6801 Opcode = X86::AESDECWIDE128KL;
6802 break;
6803 case X86ISD::AESENCWIDE256KL:
6804 Opcode = X86::AESENCWIDE256KL;
6805 break;
6806 case X86ISD::AESDECWIDE256KL:
6807 Opcode = X86::AESDECWIDE256KL;
6808 break;
6809 }
6810
6811 SDValue Chain = Node->getOperand(Num: 0);
6812 SDValue Addr = Node->getOperand(Num: 1);
6813
6814 SDValue Base, Scale, Index, Disp, Segment;
6815 if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment))
6816 break;
6817
6818 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2),
6819 Glue: SDValue());
6820 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3),
6821 Glue: Chain.getValue(R: 1));
6822 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4),
6823 Glue: Chain.getValue(R: 1));
6824 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5),
6825 Glue: Chain.getValue(R: 1));
6826 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6),
6827 Glue: Chain.getValue(R: 1));
6828 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7),
6829 Glue: Chain.getValue(R: 1));
6830 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8),
6831 Glue: Chain.getValue(R: 1));
6832 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9),
6833 Glue: Chain.getValue(R: 1));
6834
6835 MachineSDNode *Res = CurDAG->getMachineNode(
6836 Opcode, dl, VTs: Node->getVTList(),
6837 Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)});
6838 CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand());
6839 ReplaceNode(F: Node, T: Res);
6840 return;
6841 }
6842 case X86ISD::POP_FROM_X87_REG: {
6843 SDValue Chain = Node->getOperand(Num: 0);
6844 Register Reg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1))->getReg();
6845 SDValue Glue;
6846 if (Node->getNumValues() == 3)
6847 Glue = Node->getOperand(Num: 2);
6848 SDValue Copy =
6849 CurDAG->getCopyFromReg(Chain, dl, Reg, VT: Node->getValueType(ResNo: 0), Glue);
6850 ReplaceNode(F: Node, T: Copy.getNode());
6851 return;
6852 }
6853 }
6854
6855 SelectCode(N: Node);
6856}
6857
6858bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6859 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6860 std::vector<SDValue> &OutOps) {
6861 SDValue Op0, Op1, Op2, Op3, Op4;
6862 switch (ConstraintID) {
6863 default:
6864 llvm_unreachable("Unexpected asm memory constraint");
6865 case InlineAsm::ConstraintCode::o: // offsetable ??
6866 case InlineAsm::ConstraintCode::v: // not offsetable ??
6867 case InlineAsm::ConstraintCode::m: // memory
6868 case InlineAsm::ConstraintCode::X:
6869 case InlineAsm::ConstraintCode::p: // address
6870 if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4))
6871 return true;
6872 break;
6873 }
6874
6875 OutOps.push_back(x: Op0);
6876 OutOps.push_back(x: Op1);
6877 OutOps.push_back(x: Op2);
6878 OutOps.push_back(x: Op3);
6879 OutOps.push_back(x: Op4);
6880 return false;
6881}
6882
// New-pass-manager wrapper: forwards to the shared X86DAGToDAGISel
// implementation, selecting at the target machine's configured opt level.
X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
6886
6887/// This pass converts a legalized DAG into a X86-specific DAG,
6888/// ready for instruction scheduling.
6889FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6890 CodeGenOptLevel OptLevel) {
6891 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6892}
6893