1 | //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines a DAG pattern matching instruction selector for X86, |
10 | // converting from a legalized dag to a X86 dag. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "X86ISelDAGToDAG.h" |
15 | #include "X86.h" |
16 | #include "X86MachineFunctionInfo.h" |
17 | #include "X86Subtarget.h" |
18 | #include "X86TargetMachine.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/CodeGen/MachineModuleInfo.h" |
21 | #include "llvm/CodeGen/SelectionDAGISel.h" |
22 | #include "llvm/Config/llvm-config.h" |
23 | #include "llvm/IR/ConstantRange.h" |
24 | #include "llvm/IR/Function.h" |
25 | #include "llvm/IR/Instructions.h" |
26 | #include "llvm/IR/Intrinsics.h" |
27 | #include "llvm/IR/IntrinsicsX86.h" |
28 | #include "llvm/IR/Module.h" |
29 | #include "llvm/IR/Type.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include "llvm/Support/ErrorHandling.h" |
32 | #include "llvm/Support/KnownBits.h" |
33 | #include "llvm/Support/MathExtras.h" |
34 | #include <cstdint> |
35 | |
36 | using namespace llvm; |
37 | |
38 | #define DEBUG_TYPE "x86-isel" |
39 | #define PASS_NAME "X86 DAG->DAG Instruction Selection" |
40 | |
41 | STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor" ); |
42 | |
43 | static cl::opt<bool> AndImmShrink("x86-and-imm-shrink" , cl::init(Val: true), |
44 | cl::desc("Enable setting constant bits to reduce size of mask immediates" ), |
45 | cl::Hidden); |
46 | |
47 | static cl::opt<bool> EnablePromoteAnyextLoad( |
48 | "x86-promote-anyext-load" , cl::init(Val: true), |
49 | cl::desc("Enable promoting aligned anyext load to wider load" ), cl::Hidden); |
50 | |
51 | extern cl::opt<bool> IndirectBranchTracking; |
52 | |
53 | //===----------------------------------------------------------------------===// |
54 | // Pattern Matcher Implementation |
55 | //===----------------------------------------------------------------------===// |
56 | |
57 | namespace { |
/// This corresponds to X86AddressMode, but uses SDValues instead of register
59 | /// numbers for the leaves of the matched tree. |
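/// The matched form is Base_Reg/Base_FrameIndex + Scale*IndexReg + Disp,
/// optionally with a symbolic displacement (GV/CP/ES/JT/BlockAddr) and a
/// segment. For example, the memory operand of "movl 4(%rdi,%rsi,8), %eax"
/// corresponds to Base_Reg=%rdi, Scale=8, IndexReg=%rsi, Disp=4.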
60 | struct X86ISelAddressMode { |
61 | enum { |
62 | RegBase, |
63 | FrameIndexBase |
64 | } BaseType = RegBase; |
65 | |
66 | // This is really a union, discriminated by BaseType! |
67 | SDValue Base_Reg; |
68 | int Base_FrameIndex = 0; |
69 | |
70 | unsigned Scale = 1; |
71 | SDValue IndexReg; |
72 | int32_t Disp = 0; |
73 | SDValue Segment; |
74 | const GlobalValue *GV = nullptr; |
75 | const Constant *CP = nullptr; |
76 | const BlockAddress *BlockAddr = nullptr; |
77 | const char *ES = nullptr; |
78 | MCSymbol *MCSym = nullptr; |
79 | int JT = -1; |
80 | Align Alignment; // CP alignment. |
81 | unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* |
82 | bool NegateIndex = false; |
83 | |
84 | X86ISelAddressMode() = default; |
85 | |
86 | bool hasSymbolicDisplacement() const { |
87 | return GV != nullptr || CP != nullptr || ES != nullptr || |
88 | MCSym != nullptr || JT != -1 || BlockAddr != nullptr; |
89 | } |
90 | |
91 | bool hasBaseOrIndexReg() const { |
92 | return BaseType == FrameIndexBase || |
93 | IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; |
94 | } |
95 | |
96 | /// Return true if this addressing mode is already RIP-relative. |
97 | bool isRIPRelative() const { |
98 | if (BaseType != RegBase) return false; |
99 | if (RegisterSDNode *RegNode = |
100 | dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode())) |
101 | return RegNode->getReg() == X86::RIP; |
102 | return false; |
103 | } |
104 | |
105 | void setBaseReg(SDValue Reg) { |
106 | BaseType = RegBase; |
107 | Base_Reg = Reg; |
108 | } |
109 | |
110 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
111 | void dump(SelectionDAG *DAG = nullptr) { |
112 | dbgs() << "X86ISelAddressMode " << this << '\n'; |
113 | dbgs() << "Base_Reg " ; |
114 | if (Base_Reg.getNode()) |
115 | Base_Reg.getNode()->dump(DAG); |
116 | else |
117 | dbgs() << "nul\n" ; |
118 | if (BaseType == FrameIndexBase) |
119 | dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; |
120 | dbgs() << " Scale " << Scale << '\n' |
121 | << "IndexReg " ; |
122 | if (NegateIndex) |
123 | dbgs() << "negate " ; |
124 | if (IndexReg.getNode()) |
125 | IndexReg.getNode()->dump(DAG); |
126 | else |
127 | dbgs() << "nul\n" ; |
128 | dbgs() << " Disp " << Disp << '\n' |
129 | << "GV " ; |
130 | if (GV) |
131 | GV->dump(); |
132 | else |
133 | dbgs() << "nul" ; |
134 | dbgs() << " CP " ; |
135 | if (CP) |
136 | CP->dump(); |
137 | else |
138 | dbgs() << "nul" ; |
139 | dbgs() << '\n' |
140 | << "ES " ; |
141 | if (ES) |
142 | dbgs() << ES; |
143 | else |
144 | dbgs() << "nul" ; |
145 | dbgs() << " MCSym " ; |
146 | if (MCSym) |
147 | dbgs() << MCSym; |
148 | else |
149 | dbgs() << "nul" ; |
150 | dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; |
151 | } |
152 | #endif |
153 | }; |
154 | } |
155 | |
156 | namespace { |
157 | //===--------------------------------------------------------------------===// |
158 | /// ISel - X86-specific code to select X86 machine instructions for |
159 | /// SelectionDAG operations. |
160 | /// |
161 | class X86DAGToDAGISel final : public SelectionDAGISel { |
162 | /// Keep a pointer to the X86Subtarget around so that we can |
163 | /// make the right decision when generating code for different targets. |
164 | const X86Subtarget *Subtarget; |
165 | |
166 | /// If true, selector should try to optimize for minimum code size. |
167 | bool OptForMinSize; |
168 | |
169 | /// Disable direct TLS access through segment registers. |
170 | bool IndirectTlsSegRefs; |
171 | |
172 | public: |
173 | X86DAGToDAGISel() = delete; |
174 | |
175 | explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel) |
176 | : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), |
177 | OptForMinSize(false), IndirectTlsSegRefs(false) {} |
178 | |
179 | bool runOnMachineFunction(MachineFunction &MF) override { |
180 | // Reset the subtarget each time through. |
181 | Subtarget = &MF.getSubtarget<X86Subtarget>(); |
182 | IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( |
183 | Kind: "indirect-tls-seg-refs" ); |
184 | |
185 | // OptFor[Min]Size are used in pattern predicates that isel is matching. |
186 | OptForMinSize = MF.getFunction().hasMinSize(); |
187 | return SelectionDAGISel::runOnMachineFunction(mf&: MF); |
188 | } |
189 | |
190 | void emitFunctionEntryCode() override; |
191 | |
192 | bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; |
193 | |
194 | void PreprocessISelDAG() override; |
195 | void PostprocessISelDAG() override; |
196 | |
197 | // Include the pieces autogenerated from the target description. |
198 | #include "X86GenDAGISel.inc" |
199 | |
200 | private: |
201 | void Select(SDNode *N) override; |
202 | |
203 | bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); |
204 | bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
205 | bool AllowSegmentRegForX32 = false); |
206 | bool matchWrapper(SDValue N, X86ISelAddressMode &AM); |
207 | bool matchAddress(SDValue N, X86ISelAddressMode &AM); |
208 | bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); |
209 | bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); |
210 | SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM, |
211 | unsigned Depth); |
212 | bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
213 | unsigned Depth); |
214 | bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
215 | unsigned Depth); |
216 | bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); |
217 | bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
218 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
219 | SDValue &Segment); |
220 | bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, |
221 | SDValue ScaleOp, SDValue &Base, SDValue &Scale, |
222 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
223 | bool selectMOV64Imm32(SDValue N, SDValue &Imm); |
224 | bool selectLEAAddr(SDValue N, SDValue &Base, |
225 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
226 | SDValue &Segment); |
227 | bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale, |
228 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
229 | bool selectTLSADDRAddr(SDValue N, SDValue &Base, |
230 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
231 | SDValue &Segment); |
232 | bool selectRelocImm(SDValue N, SDValue &Op); |
233 | |
234 | bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
235 | SDValue &Base, SDValue &Scale, |
236 | SDValue &Index, SDValue &Disp, |
237 | SDValue &Segment); |
238 | |
239 | // Convenience method where P is also root. |
240 | bool tryFoldLoad(SDNode *P, SDValue N, |
241 | SDValue &Base, SDValue &Scale, |
242 | SDValue &Index, SDValue &Disp, |
243 | SDValue &Segment) { |
244 | return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment); |
245 | } |
246 | |
247 | bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
248 | SDValue &Base, SDValue &Scale, |
249 | SDValue &Index, SDValue &Disp, |
250 | SDValue &Segment); |
251 | |
252 | bool isProfitableToFormMaskedOp(SDNode *N) const; |
253 | |
254 | /// Implement addressing mode selection for inline asm expressions. |
255 | bool SelectInlineAsmMemoryOperand(const SDValue &Op, |
256 | InlineAsm::ConstraintCode ConstraintID, |
257 | std::vector<SDValue> &OutOps) override; |
258 | |
259 | void emitSpecialCodeForMain(); |
260 | |
261 | inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, |
262 | MVT VT, SDValue &Base, SDValue &Scale, |
263 | SDValue &Index, SDValue &Disp, |
264 | SDValue &Segment) { |
265 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
266 | Base = CurDAG->getTargetFrameIndex( |
267 | FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout())); |
268 | else if (AM.Base_Reg.getNode()) |
269 | Base = AM.Base_Reg; |
270 | else |
271 | Base = CurDAG->getRegister(Reg: 0, VT); |
272 | |
273 | Scale = getI8Imm(Imm: AM.Scale, DL); |
274 | |
275 | #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC) |
276 | // Negate the index if needed. |
277 | if (AM.NegateIndex) { |
278 | unsigned NegOpc; |
279 | switch (VT.SimpleTy) { |
280 | default: |
281 | llvm_unreachable("Unsupported VT!" ); |
282 | case MVT::i64: |
283 | NegOpc = GET_ND_IF_ENABLED(X86::NEG64r); |
284 | break; |
285 | case MVT::i32: |
286 | NegOpc = GET_ND_IF_ENABLED(X86::NEG32r); |
287 | break; |
288 | case MVT::i16: |
289 | NegOpc = GET_ND_IF_ENABLED(X86::NEG16r); |
290 | break; |
291 | case MVT::i8: |
292 | NegOpc = GET_ND_IF_ENABLED(X86::NEG8r); |
293 | break; |
294 | } |
295 | SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32, |
296 | Ops: AM.IndexReg), 0); |
297 | AM.IndexReg = Neg; |
298 | } |
299 | |
300 | if (AM.IndexReg.getNode()) |
301 | Index = AM.IndexReg; |
302 | else |
303 | Index = CurDAG->getRegister(Reg: 0, VT); |
304 | |
305 | // These are 32-bit even in 64-bit mode since RIP-relative offset |
306 | // is 32-bit. |
307 | if (AM.GV) |
308 | Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(), |
309 | VT: MVT::i32, offset: AM.Disp, |
310 | TargetFlags: AM.SymbolFlags); |
311 | else if (AM.CP) |
312 | Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment, |
313 | Offset: AM.Disp, TargetFlags: AM.SymbolFlags); |
314 | else if (AM.ES) { |
315 | assert(!AM.Disp && "Non-zero displacement is ignored with ES." ); |
316 | Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
317 | } else if (AM.MCSym) { |
318 | assert(!AM.Disp && "Non-zero displacement is ignored with MCSym." ); |
319 | assert(AM.SymbolFlags == 0 && "oo" ); |
320 | Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32); |
321 | } else if (AM.JT != -1) { |
322 | assert(!AM.Disp && "Non-zero displacement is ignored with JT." ); |
323 | Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
324 | } else if (AM.BlockAddr) |
325 | Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp, |
326 | TargetFlags: AM.SymbolFlags); |
327 | else |
328 | Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32); |
329 | |
330 | if (AM.Segment.getNode()) |
331 | Segment = AM.Segment; |
332 | else |
333 | Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
334 | } |
335 | |
  // Utility function to determine whether this is an AMX SDNode right after
  // lowering but before instruction selection.
338 | bool isAMXSDNode(SDNode *N) const { |
339 | // Check if N is AMX SDNode: |
340 | // 1. check specific opcode since these carry MVT::Untyped instead of |
341 | // x86amx_type; |
342 | // 2. check result type; |
343 | // 3. check operand type; |
344 | switch (N->getOpcode()) { |
345 | default: |
346 | break; |
347 | case X86::PT2RPNTLVWZ0V: |
348 | case X86::PT2RPNTLVWZ0T1V: |
349 | case X86::PT2RPNTLVWZ1V: |
350 | case X86::PT2RPNTLVWZ1T1V: |
351 | case X86::PT2RPNTLVWZ0RSV: |
352 | case X86::PT2RPNTLVWZ0RST1V: |
353 | case X86::PT2RPNTLVWZ1RSV: |
354 | case X86::PT2RPNTLVWZ1RST1V: |
355 | return true; |
356 | } |
357 | for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { |
358 | if (N->getValueType(ResNo: Idx) == MVT::x86amx) |
359 | return true; |
360 | } |
361 | for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) { |
362 | SDValue Op = N->getOperand(Num: Idx); |
363 | if (Op.getValueType() == MVT::x86amx) |
364 | return true; |
365 | } |
366 | return false; |
367 | } |
368 | |
  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size.
371 | // At a high level, we'd like to avoid such instructions when |
372 | // we have similar constants used within the same basic block |
373 | // that can be kept in a register. |
374 | // |
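  // For example, if several instructions in a block use the same 32-bit
  // constant, materializing it once with "movl $imm, %reg" and reusing the
  // register is smaller than repeating the 4-byte immediate in each encoding.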
375 | bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { |
376 | uint32_t UseCount = 0; |
377 | |
378 | // Do not want to hoist if we're not optimizing for size. |
379 | // TODO: We'd like to remove this restriction. |
380 | // See the comment in X86InstrInfo.td for more info. |
381 | if (!CurDAG->shouldOptForSize()) |
382 | return false; |
383 | |
384 | // Walk all the users of the immediate. |
385 | for (const SDNode *User : N->users()) { |
386 | if (UseCount >= 2) |
387 | break; |
388 | |
389 | // This user is already selected. Count it as a legitimate use and |
390 | // move on. |
391 | if (User->isMachineOpcode()) { |
392 | UseCount++; |
393 | continue; |
394 | } |
395 | |
396 | // We want to count stores of immediates as real uses. |
397 | if (User->getOpcode() == ISD::STORE && |
398 | User->getOperand(Num: 1).getNode() == N) { |
399 | UseCount++; |
400 | continue; |
401 | } |
402 | |
403 | // We don't currently match users that have > 2 operands (except |
      // for stores, which are handled above).
      // Those instructions won't match in ISel, for now, and would
406 | // be counted incorrectly. |
407 | // This may change in the future as we add additional instruction |
408 | // types. |
409 | if (User->getNumOperands() != 2) |
410 | continue; |
411 | |
412 | // If this is a sign-extended 8-bit integer immediate used in an ALU |
413 | // instruction, there is probably an opcode encoding to save space. |
414 | auto *C = dyn_cast<ConstantSDNode>(Val: N); |
415 | if (C && isInt<8>(x: C->getSExtValue())) |
416 | continue; |
417 | |
418 | // Immediates that are used for offsets as part of stack |
419 | // manipulation should be left alone. These are typically |
420 | // used to indicate SP offsets for argument passing and |
421 | // will get pulled into stores/pushes (implicitly). |
422 | if (User->getOpcode() == X86ISD::ADD || |
423 | User->getOpcode() == ISD::ADD || |
424 | User->getOpcode() == X86ISD::SUB || |
425 | User->getOpcode() == ISD::SUB) { |
426 | |
427 | // Find the other operand of the add/sub. |
428 | SDValue OtherOp = User->getOperand(Num: 0); |
429 | if (OtherOp.getNode() == N) |
430 | OtherOp = User->getOperand(Num: 1); |
431 | |
432 | // Don't count if the other operand is SP. |
433 | RegisterSDNode *RegNode; |
434 | if (OtherOp->getOpcode() == ISD::CopyFromReg && |
435 | (RegNode = dyn_cast_or_null<RegisterSDNode>( |
436 | Val: OtherOp->getOperand(Num: 1).getNode()))) |
437 | if ((RegNode->getReg() == X86::ESP) || |
438 | (RegNode->getReg() == X86::RSP)) |
439 | continue; |
440 | } |
441 | |
442 | // ... otherwise, count this and move on. |
443 | UseCount++; |
444 | } |
445 | |
446 | // If we have more than 1 use, then recommend for hoisting. |
447 | return (UseCount > 1); |
448 | } |
449 | |
450 | /// Return a target constant with the specified value of type i8. |
451 | inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { |
452 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
453 | } |
454 | |
455 | /// Return a target constant with the specified value, of type i32. |
456 | inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { |
457 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32); |
458 | } |
459 | |
460 | /// Return a target constant with the specified value, of type i64. |
461 | inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { |
462 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64); |
463 | } |
464 | |
  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
467 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
468 | uint64_t Index = N->getConstantOperandVal(Num: 1); |
469 | MVT VecVT = N->getOperand(Num: 0).getSimpleValueType(); |
470 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
471 | } |
472 | |
473 | SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, |
474 | const SDLoc &DL) { |
475 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
476 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
477 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
478 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
479 | } |
480 | |
481 | SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth, |
482 | const SDLoc &DL) { |
483 | assert(VecWidth == 128 && "Unexpected vector width" ); |
484 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
485 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
486 | uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth; |
487 | assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index" ); |
488 | // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub) |
489 | // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub) |
490 | return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL); |
491 | } |
492 | |
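  // Materialize the carry flag as 0/-1 in a GPR: zero a register, copy the
  // incoming flags into EFLAGS, then emit "sbb reg, reg" (0 - 0 - CF).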
493 | SDValue getSBBZero(SDNode *N) { |
494 | SDLoc dl(N); |
495 | MVT VT = N->getSimpleValueType(ResNo: 0); |
496 | |
497 | // Create zero. |
498 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
499 | SDValue Zero = |
500 | SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0); |
501 | if (VT == MVT::i64) { |
502 | Zero = SDValue( |
503 | CurDAG->getMachineNode( |
504 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
505 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero, |
506 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)), |
507 | 0); |
508 | } |
509 | |
510 | // Copy flags to the EFLAGS register and glue it to next node. |
511 | unsigned Opcode = N->getOpcode(); |
512 | assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && |
513 | "Unexpected opcode for SBB materialization" ); |
514 | unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; |
515 | SDValue EFLAGS = |
516 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
517 | N: N->getOperand(Num: FlagOpIndex), Glue: SDValue()); |
518 | |
519 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
520 | // 32-bit version. |
521 | unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; |
522 | MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
523 | VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32); |
524 | return SDValue( |
525 | CurDAG->getMachineNode(Opcode: Opc, dl, VTs, |
526 | Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}), |
527 | 0); |
528 | } |
529 | |
  // Helper to detect unneeded AND instructions on shift amounts. Called
531 | // from PatFrags in tablegen. |
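  // A shift only consumes the low Width bits of its amount (e.g. Width == 5
  // for 32-bit shifts), so an AND whose mask has at least Width trailing
  // ones (possibly after accounting for known-zero bits) is a no-op here.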
532 | bool isUnneededShiftMask(SDNode *N, unsigned Width) const { |
533 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode" ); |
534 | const APInt &Val = N->getConstantOperandAPInt(Num: 1); |
535 | |
536 | if (Val.countr_one() >= Width) |
537 | return true; |
538 | |
539 | APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero; |
540 | return Mask.countr_one() >= Width; |
541 | } |
542 | |
543 | /// Return an SDNode that returns the value of the global base register. |
544 | /// Output instructions required to initialize the global base register, |
545 | /// if necessary. |
546 | SDNode *getGlobalBaseReg(); |
547 | |
548 | /// Return a reference to the TargetMachine, casted to the target-specific |
549 | /// type. |
550 | const X86TargetMachine &getTargetMachine() const { |
551 | return static_cast<const X86TargetMachine &>(TM); |
552 | } |
553 | |
554 | /// Return a reference to the TargetInstrInfo, casted to the target-specific |
555 | /// type. |
556 | const X86InstrInfo *getInstrInfo() const { |
557 | return Subtarget->getInstrInfo(); |
558 | } |
559 | |
560 | /// Return a condition code of the given SDNode |
561 | X86::CondCode getCondFromNode(SDNode *N) const; |
562 | |
563 | /// Address-mode matching performs shift-of-and to and-of-shift |
564 | /// reassociation in order to expose more scaled addressing |
565 | /// opportunities. |
566 | bool ComplexPatternFuncMutatesDAG() const override { |
567 | return true; |
568 | } |
569 | |
570 | bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; |
571 | |
572 | // Indicates we should prefer to use a non-temporal load for this load. |
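  // Non-temporal vector loads (MOVNTDQA and friends) require naturally
  // aligned 16/32/64-byte accesses and SSE4.1/AVX2/AVX512 respectively,
  // hence the alignment and store-size checks below.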
573 | bool useNonTemporalLoad(LoadSDNode *N) const { |
574 | if (!N->isNonTemporal()) |
575 | return false; |
576 | |
577 | unsigned StoreSize = N->getMemoryVT().getStoreSize(); |
578 | |
579 | if (N->getAlign().value() < StoreSize) |
580 | return false; |
581 | |
582 | switch (StoreSize) { |
583 | default: llvm_unreachable("Unsupported store size" ); |
584 | case 4: |
585 | case 8: |
586 | return false; |
587 | case 16: |
588 | return Subtarget->hasSSE41(); |
589 | case 32: |
590 | return Subtarget->hasAVX2(); |
591 | case 64: |
592 | return Subtarget->hasAVX512(); |
593 | } |
594 | } |
595 | |
596 | bool foldLoadStoreIntoMemOperand(SDNode *Node); |
597 | MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); |
598 | bool matchBitExtract(SDNode *Node); |
599 | bool shrinkAndImmediate(SDNode *N); |
600 | bool isMaskZeroExtended(SDNode *N) const; |
601 | bool tryShiftAmountMod(SDNode *N); |
602 | bool tryShrinkShlLogicImm(SDNode *N); |
603 | bool tryVPTERNLOG(SDNode *N); |
604 | bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB, |
605 | SDNode *ParentC, SDValue A, SDValue B, SDValue C, |
606 | uint8_t Imm); |
607 | bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); |
608 | bool tryMatchBitSelect(SDNode *N); |
609 | |
610 | MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
611 | const SDLoc &dl, MVT VT, SDNode *Node); |
612 | MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
613 | const SDLoc &dl, MVT VT, SDNode *Node, |
614 | SDValue &InGlue); |
615 | |
616 | bool tryOptimizeRem8Extend(SDNode *N); |
617 | |
618 | bool onlyUsesZeroFlag(SDValue Flags) const; |
619 | bool hasNoSignFlagUses(SDValue Flags) const; |
620 | bool hasNoCarryFlagUses(SDValue Flags) const; |
621 | }; |
622 | |
623 | class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy { |
624 | public: |
625 | static char ID; |
626 | explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm, |
627 | CodeGenOptLevel OptLevel) |
628 | : SelectionDAGISelLegacy( |
629 | ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {} |
630 | }; |
631 | } |
632 | |
633 | char X86DAGToDAGISelLegacy::ID = 0; |
634 | |
635 | INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) |
636 | |
637 | // Returns true if this masked compare can be implemented legally with this |
638 | // type. |
639 | static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { |
640 | unsigned Opcode = N->getOpcode(); |
641 | if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || |
642 | Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || |
643 | Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { |
644 | // We can get 256-bit 8 element types here without VLX being enabled. When |
645 | // this happens we will use 512-bit operations and the mask will not be |
646 | // zero extended. |
647 | EVT OpVT = N->getOperand(Num: 0).getValueType(); |
648 | // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the |
649 | // second operand. |
650 | if (Opcode == X86ISD::STRICT_CMPM) |
651 | OpVT = N->getOperand(Num: 1).getValueType(); |
652 | if (OpVT.is256BitVector() || OpVT.is128BitVector()) |
653 | return Subtarget->hasVLX(); |
654 | |
655 | return true; |
656 | } |
657 | // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. |
658 | if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || |
659 | Opcode == X86ISD::FSETCCM_SAE) |
660 | return true; |
661 | |
662 | return false; |
663 | } |
664 | |
665 | // Returns true if we can assume the writer of the mask has zero extended it |
666 | // for us. |
667 | bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { |
668 | // If this is an AND, check if we have a compare on either side. As long as |
669 | // one side guarantees the mask is zero extended, the AND will preserve those |
670 | // zeros. |
671 | if (N->getOpcode() == ISD::AND) |
672 | return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) || |
673 | isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget); |
674 | |
675 | return isLegalMaskCompare(N, Subtarget); |
676 | } |
677 | |
678 | bool |
679 | X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { |
680 | if (OptLevel == CodeGenOptLevel::None) |
681 | return false; |
682 | |
683 | if (!N.hasOneUse()) |
684 | return false; |
685 | |
686 | if (N.getOpcode() != ISD::LOAD) |
687 | return true; |
688 | |
689 | // Don't fold non-temporal loads if we have an instruction for them. |
690 | if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N))) |
691 | return false; |
692 | |
693 | // If N is a load, do additional profitability checks. |
694 | if (U == Root) { |
695 | switch (U->getOpcode()) { |
696 | default: break; |
697 | case X86ISD::ADD: |
698 | case X86ISD::ADC: |
699 | case X86ISD::SUB: |
700 | case X86ISD::SBB: |
701 | case X86ISD::AND: |
702 | case X86ISD::XOR: |
703 | case X86ISD::OR: |
704 | case ISD::ADD: |
705 | case ISD::UADDO_CARRY: |
706 | case ISD::AND: |
707 | case ISD::OR: |
708 | case ISD::XOR: { |
709 | SDValue Op1 = U->getOperand(Num: 1); |
710 | |
      // If the other operand is an 8-bit immediate we should fold the immediate
712 | // instead. This reduces code size. |
713 | // e.g. |
714 | // movl 4(%esp), %eax |
715 | // addl $4, %eax |
716 | // vs. |
717 | // movl $4, %eax |
718 | // addl 4(%esp), %eax |
      // The former is 2 bytes shorter. In the case where the increment is 1,
720 | // the saving can be 4 bytes (by using incl %eax). |
721 | if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) { |
722 | if (Imm->getAPIntValue().isSignedIntN(N: 8)) |
723 | return false; |
724 | |
725 | // If this is a 64-bit AND with an immediate that fits in 32-bits, |
726 | // prefer using the smaller and over folding the load. This is needed to |
727 | // make sure immediates created by shrinkAndImmediate are always folded. |
728 | // Ideally we would narrow the load during DAG combine and get the |
729 | // best of both worlds. |
730 | if (U->getOpcode() == ISD::AND && |
731 | Imm->getAPIntValue().getBitWidth() == 64 && |
732 | Imm->getAPIntValue().isIntN(N: 32)) |
733 | return false; |
734 | |
        // If this is really a zext_inreg that can be represented with a movzx
736 | // instruction, prefer that. |
737 | // TODO: We could shrink the load and fold if it is non-volatile. |
738 | if (U->getOpcode() == ISD::AND && |
739 | (Imm->getAPIntValue() == UINT8_MAX || |
740 | Imm->getAPIntValue() == UINT16_MAX || |
741 | Imm->getAPIntValue() == UINT32_MAX)) |
742 | return false; |
743 | |
        // For ADD/SUB we can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
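        // For example, "addl $128, %eax" (imm32) can instead be encoded as
        // "subl $-128, %eax" (imm8).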
746 | if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && |
747 | (-Imm->getAPIntValue()).isSignedIntN(N: 8)) |
748 | return false; |
749 | |
750 | if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && |
751 | (-Imm->getAPIntValue()).isSignedIntN(N: 8) && |
752 | hasNoCarryFlagUses(Flags: SDValue(U, 1))) |
753 | return false; |
754 | } |
755 | |
756 | // If the other operand is a TLS address, we should fold it instead. |
757 | // This produces |
758 | // movl %gs:0, %eax |
759 | // leal i@NTPOFF(%eax), %eax |
760 | // instead of |
761 | // movl $i@NTPOFF, %eax |
762 | // addl %gs:0, %eax |
763 | // if the block also has an access to a second TLS address this will save |
764 | // a load. |
765 | // FIXME: This is probably also true for non-TLS addresses. |
766 | if (Op1.getOpcode() == X86ISD::Wrapper) { |
767 | SDValue Val = Op1.getOperand(i: 0); |
768 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
769 | return false; |
770 | } |
771 | |
772 | // Don't fold load if this matches the BTS/BTR/BTC patterns. |
773 | // BTS: (or X, (shl 1, n)) |
774 | // BTR: (and X, (rotl -2, n)) |
775 | // BTC: (xor X, (shl 1, n)) |
776 | if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { |
777 | if (U->getOperand(Num: 0).getOpcode() == ISD::SHL && |
778 | isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0))) |
779 | return false; |
780 | |
781 | if (U->getOperand(Num: 1).getOpcode() == ISD::SHL && |
782 | isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0))) |
783 | return false; |
784 | } |
785 | if (U->getOpcode() == ISD::AND) { |
786 | SDValue U0 = U->getOperand(Num: 0); |
787 | SDValue U1 = U->getOperand(Num: 1); |
788 | if (U0.getOpcode() == ISD::ROTL) { |
789 | auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0)); |
790 | if (C && C->getSExtValue() == -2) |
791 | return false; |
792 | } |
793 | |
794 | if (U1.getOpcode() == ISD::ROTL) { |
795 | auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0)); |
796 | if (C && C->getSExtValue() == -2) |
797 | return false; |
798 | } |
799 | } |
800 | |
801 | break; |
802 | } |
803 | case ISD::SHL: |
804 | case ISD::SRA: |
805 | case ISD::SRL: |
806 | // Don't fold a load into a shift by immediate. The BMI2 instructions |
807 | // support folding a load, but not an immediate. The legacy instructions |
808 | // support folding an immediate, but can't fold a load. Folding an |
809 | // immediate is preferable to folding a load. |
810 | if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1))) |
811 | return false; |
812 | |
813 | break; |
814 | } |
815 | } |
816 | |
  // Prevent folding a load if this can be implemented with an insert_subreg or
818 | // a move that implicitly zeroes. |
819 | if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && |
820 | isNullConstant(V: Root->getOperand(Num: 2)) && |
821 | (Root->getOperand(Num: 0).isUndef() || |
822 | ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode()))) |
823 | return false; |
824 | |
825 | return true; |
826 | } |
827 | |
828 | // Indicates it is profitable to form an AVX512 masked operation. Returning |
// false will favor a masked register-register move or vblendm and the
830 | // operation will be selected separately. |
831 | bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { |
832 | assert( |
833 | (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && |
834 | "Unexpected opcode!" ); |
835 | |
836 | // If the operation has additional users, the operation will be duplicated. |
837 | // Check the use count to prevent that. |
838 | // FIXME: Are there cheap opcodes we might want to duplicate? |
839 | return N->getOperand(Num: 1).hasOneUse(); |
840 | } |
841 | |
842 | /// Replace the original chain operand of the call with |
843 | /// load's chain operand and move load below the call's chain operand. |
844 | static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, |
845 | SDValue Call, SDValue OrigChain) { |
846 | SmallVector<SDValue, 8> Ops; |
847 | SDValue Chain = OrigChain.getOperand(i: 0); |
848 | if (Chain.getNode() == Load.getNode()) |
849 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
850 | else { |
851 | assert(Chain.getOpcode() == ISD::TokenFactor && |
852 | "Unexpected chain operand" ); |
853 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) |
854 | if (Chain.getOperand(i).getNode() == Load.getNode()) |
855 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
856 | else |
857 | Ops.push_back(Elt: Chain.getOperand(i)); |
858 | SDValue NewChain = |
859 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops); |
860 | Ops.clear(); |
861 | Ops.push_back(Elt: NewChain); |
862 | } |
863 | Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end()); |
864 | CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops); |
865 | CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0), |
866 | Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2)); |
867 | |
868 | Ops.clear(); |
869 | Ops.push_back(Elt: SDValue(Load.getNode(), 1)); |
870 | Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end()); |
871 | CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops); |
872 | } |
873 | |
874 | /// Return true if call address is a load and it can be |
875 | /// moved below CALLSEQ_START and the chains leading up to the call. |
876 | /// Return the CALLSEQ_START by reference as a second output. |
877 | /// In the case of a tail call, there isn't a callseq node between the call |
878 | /// chain and the load. |
879 | static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { |
880 | // The transformation is somewhat dangerous if the call's chain was glued to |
881 | // the call. After MoveBelowOrigChain the load is moved between the call and |
882 | // the chain, this can create a cycle if the load is not folded. So it is |
883 | // *really* important that we are sure the load will be folded. |
884 | if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) |
885 | return false; |
886 | auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode()); |
887 | if (!LD || |
888 | !LD->isSimple() || |
889 | LD->getAddressingMode() != ISD::UNINDEXED || |
890 | LD->getExtensionType() != ISD::NON_EXTLOAD) |
891 | return false; |
892 | |
893 | // Now let's find the callseq_start. |
894 | while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { |
895 | if (!Chain.hasOneUse()) |
896 | return false; |
897 | Chain = Chain.getOperand(i: 0); |
898 | } |
899 | |
900 | if (!Chain.getNumOperands()) |
901 | return false; |
902 | // Since we are not checking for AA here, conservatively abort if the chain |
903 | // writes to memory. It's not safe to move the callee (a load) across a store. |
904 | if (isa<MemSDNode>(Val: Chain.getNode()) && |
905 | cast<MemSDNode>(Val: Chain.getNode())->writeMem()) |
906 | return false; |
907 | if (Chain.getOperand(i: 0).getNode() == Callee.getNode()) |
908 | return true; |
909 | if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor && |
910 | Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) && |
911 | Callee.getValue(R: 1).hasOneUse()) |
912 | return true; |
913 | return false; |
914 | } |
915 | |
916 | static bool isEndbrImm64(uint64_t Imm) { |
917 | // There may be some other prefix bytes between 0xF3 and 0x0F1EFA. |
  // e.g.: 0xF3660F1EFA, 0xF3670F1EFA
919 | if ((Imm & 0x00FFFFFF) != 0x0F1EFA) |
920 | return false; |
921 | |
922 | uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, |
923 | 0x65, 0x66, 0x67, 0xf0, 0xf2}; |
  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
925 | while (i < 64) { |
926 | uint8_t Byte = (Imm >> i) & 0xFF; |
927 | if (Byte == 0xF3) |
928 | return true; |
929 | if (!llvm::is_contained(Range&: OptionalPrefixBytes, Element: Byte)) |
930 | return false; |
931 | i += 8; |
932 | } |
933 | |
934 | return false; |
935 | } |
936 | |
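// Returns true for 512-bit vector types with byte/word elements, whose
// broadcasts and other element-wise operations require AVX512BW.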
937 | static bool needBWI(MVT VT) { |
938 | return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8); |
939 | } |
940 | |
941 | void X86DAGToDAGISel::PreprocessISelDAG() { |
942 | bool MadeChange = false; |
943 | for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), |
944 | E = CurDAG->allnodes_end(); I != E; ) { |
945 | SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. |
946 | |
947 | // This is for CET enhancement. |
948 | // |
949 | // ENDBR32 and ENDBR64 have specific opcodes: |
950 | // ENDBR32: F3 0F 1E FB |
951 | // ENDBR64: F3 0F 1E FA |
    // We want to ensure that attackers cannot find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
955 | // If the compiler had to generate asm for the following code: |
956 | // a = 0xF30F1EFA |
957 | // it could, for example, generate: |
958 | // mov 0xF30F1EFA, dword ptr[a] |
959 | // In such a case, the binary would include a gadget that starts |
    // with a fake ENDBR64 opcode. Therefore, we split such a constant
    // into multiple operations so that it does not show up in the binary.
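    // Concretely: materialize ~Imm as an opaque constant and re-invert it
    // with a NOT, so the ENDBR byte pattern never appears as an immediate.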
962 | if (N->getOpcode() == ISD::Constant) { |
963 | MVT VT = N->getSimpleValueType(ResNo: 0); |
964 | int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue(); |
965 | int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; |
966 | if (Imm == EndbrImm || isEndbrImm64(Imm)) { |
967 | // Check that the cf-protection-branch is enabled. |
968 | Metadata *CFProtectionBranch = |
969 | MF->getFunction().getParent()->getModuleFlag( |
970 | Key: "cf-protection-branch" ); |
971 | if (CFProtectionBranch || IndirectBranchTracking) { |
972 | SDLoc dl(N); |
973 | SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true); |
974 | Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT); |
975 | --I; |
976 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement); |
977 | ++I; |
978 | MadeChange = true; |
979 | continue; |
980 | } |
981 | } |
982 | } |
983 | |
984 | // If this is a target specific AND node with no flag usages, turn it back |
985 | // into ISD::AND to enable test instruction matching. |
986 | if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) { |
987 | SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
988 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
989 | --I; |
990 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
991 | ++I; |
992 | MadeChange = true; |
993 | continue; |
994 | } |
995 | |
996 | // Convert vector increment or decrement to sub/add with an all-ones |
997 | // constant: |
998 | // add X, <1, 1...> --> sub X, <-1, -1...> |
999 | // sub X, <1, 1...> --> add X, <-1, -1...> |
1000 | // The all-ones vector constant can be materialized using a pcmpeq |
1001 | // instruction that is commonly recognized as an idiom (has no register |
1002 | // dependency), so that's better/smaller than loading a splat 1 constant. |
1003 | // |
1004 | // But don't do this if it would inhibit a potentially profitable load |
1005 | // folding opportunity for the other operand. That only occurs with the |
1006 | // intersection of: |
1007 | // (1) The other operand (op0) is load foldable. |
1008 | // (2) The op is an add (otherwise, we are *creating* an add and can still |
1009 | // load fold the other op). |
1010 | // (3) The target has AVX (otherwise, we have a destructive add and can't |
1011 | // load fold the other op without killing the constant op). |
1012 | // (4) The constant 1 vector has multiple uses (so it is profitable to load |
1013 | // into a register anyway). |
1014 | auto mayPreventLoadFold = [&]() { |
1015 | return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) && |
1016 | N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && |
1017 | !N->getOperand(Num: 1).hasOneUse(); |
1018 | }; |
1019 | if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && |
1020 | N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) { |
1021 | APInt SplatVal; |
1022 | if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) && |
1023 | SplatVal.isOne()) { |
1024 | SDLoc DL(N); |
1025 | |
1026 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1027 | unsigned NumElts = VT.getSizeInBits() / 32; |
1028 | SDValue AllOnes = |
1029 | CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts)); |
1030 | AllOnes = CurDAG->getBitcast(VT, V: AllOnes); |
1031 | |
1032 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
1033 | SDValue Res = |
1034 | CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes); |
1035 | --I; |
1036 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1037 | ++I; |
1038 | MadeChange = true; |
1039 | continue; |
1040 | } |
1041 | } |
1042 | |
1043 | switch (N->getOpcode()) { |
1044 | case X86ISD::VBROADCAST: { |
1045 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1046 | // Emulate v32i16/v64i8 broadcast without BWI. |
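      // Broadcast at half width, then insert the narrow result into both
      // the low and high halves of the wide vector.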
1047 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1048 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1049 | SDLoc dl(N); |
1050 | SDValue NarrowBCast = |
1051 | CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0)); |
1052 | SDValue Res = |
1053 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1054 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1055 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1056 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1057 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1058 | |
1059 | --I; |
1060 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1061 | ++I; |
1062 | MadeChange = true; |
1063 | continue; |
1064 | } |
1065 | |
1066 | break; |
1067 | } |
1068 | case X86ISD::VBROADCAST_LOAD: { |
1069 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1070 | // Emulate v32i16/v64i8 broadcast without BWI. |
1071 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1072 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1073 | auto *MemNode = cast<MemSDNode>(Val: N); |
1074 | SDLoc dl(N); |
1075 | SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other); |
1076 | SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; |
1077 | SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( |
1078 | Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(), |
1079 | MMO: MemNode->getMemOperand()); |
1080 | SDValue Res = |
1081 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1082 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1083 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1084 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1085 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1086 | |
1087 | --I; |
1088 | SDValue To[] = {Res, NarrowBCast.getValue(R: 1)}; |
1089 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1090 | ++I; |
1091 | MadeChange = true; |
1092 | continue; |
1093 | } |
1094 | |
1095 | break; |
1096 | } |
1097 | case ISD::LOAD: { |
      // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1099 | // load, then just extract the lower subvector and avoid the second load. |
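      // For example, a 128-bit load from a pointer can reuse the low lanes of
      // an existing 512-bit load of the same pointer via EXTRACT_SUBVECTOR.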
1100 | auto *Ld = cast<LoadSDNode>(Val: N); |
1101 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1102 | if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() || |
1103 | !(VT.is128BitVector() || VT.is256BitVector())) |
1104 | break; |
1105 | |
1106 | MVT MaxVT = VT; |
1107 | SDNode *MaxLd = nullptr; |
1108 | SDValue Ptr = Ld->getBasePtr(); |
1109 | SDValue Chain = Ld->getChain(); |
1110 | for (SDNode *User : Ptr->users()) { |
1111 | auto *UserLd = dyn_cast<LoadSDNode>(Val: User); |
1112 | MVT UserVT = User->getSimpleValueType(ResNo: 0); |
1113 | if (User != N && UserLd && ISD::isNormalLoad(N: User) && |
1114 | UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain && |
1115 | !User->hasAnyUseOfValue(Value: 1) && |
1116 | (UserVT.is256BitVector() || UserVT.is512BitVector()) && |
1117 | UserVT.getSizeInBits() > VT.getSizeInBits() && |
1118 | (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) { |
1119 | MaxLd = User; |
1120 | MaxVT = UserVT; |
1121 | } |
1122 | } |
1123 | if (MaxLd) { |
1124 | SDLoc dl(N); |
1125 | unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits(); |
1126 | MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts); |
        SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
1128 | N1: SDValue(MaxLd, 0), |
1129 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1130 | SDValue Res = CurDAG->getBitcast(VT, V: Extract); |
1131 | |
1132 | --I; |
1133 | SDValue To[] = {Res, SDValue(MaxLd, 1)}; |
1134 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1135 | ++I; |
1136 | MadeChange = true; |
1137 | continue; |
1138 | } |
1139 | break; |
1140 | } |
1141 | case ISD::VSELECT: { |
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1143 | EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType(); |
1144 | if (EleVT == MVT::i1) |
1145 | break; |
1146 | |
1147 | assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!" ); |
1148 | assert(N->getValueType(0).getVectorElementType() != MVT::i16 && |
1149 | "We can't replace VSELECT with BLENDV in vXi16!" ); |
1150 | SDValue R; |
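      // If every condition element is known to be all-ones or all-zeros, a
      // single VPTERNLOG with immediate 0xCA (truth table "A ? B : C")
      // implements the select directly.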
1151 | if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) == |
1152 | EleVT.getSizeInBits()) { |
1153 | R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1154 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2), |
1155 | N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8)); |
1156 | } else { |
1157 | R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1158 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), |
1159 | N3: N->getOperand(Num: 2)); |
1160 | } |
1161 | --I; |
1162 | CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode()); |
1163 | ++I; |
1164 | MadeChange = true; |
1165 | continue; |
1166 | } |
1167 | case ISD::FP_ROUND: |
1168 | case ISD::STRICT_FP_ROUND: |
1169 | case ISD::FP_TO_SINT: |
1170 | case ISD::FP_TO_UINT: |
1171 | case ISD::STRICT_FP_TO_SINT: |
1172 | case ISD::STRICT_FP_TO_UINT: { |
1173 | // Replace vector fp_to_s/uint with their X86 specific equivalent so we |
1174 | // don't need 2 sets of patterns. |
1175 | if (!N->getSimpleValueType(ResNo: 0).isVector()) |
1176 | break; |
1177 | |
1178 | unsigned NewOpc; |
1179 | switch (N->getOpcode()) { |
1180 | default: llvm_unreachable("Unexpected opcode!" ); |
1181 | case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; |
1182 | case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; |
1183 | case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; |
1184 | case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; |
1185 | case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; |
1186 | case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; |
1187 | } |
1188 | SDValue Res; |
1189 | if (N->isStrictFPOpcode()) |
1190 | Res = |
1191 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1192 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)}); |
1193 | else |
1194 | Res = |
1195 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1196 | Operand: N->getOperand(Num: 0)); |
1197 | --I; |
1198 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1199 | ++I; |
1200 | MadeChange = true; |
1201 | continue; |
1202 | } |
1203 | case ISD::SHL: |
1204 | case ISD::SRA: |
1205 | case ISD::SRL: { |
1206 | // Replace vector shifts with their X86 specific equivalent so we don't |
1207 | // need 2 sets of patterns. |
1208 | if (!N->getValueType(ResNo: 0).isVector()) |
1209 | break; |
1210 | |
1211 | unsigned NewOpc; |
1212 | switch (N->getOpcode()) { |
1213 | default: llvm_unreachable("Unexpected opcode!" ); |
1214 | case ISD::SHL: NewOpc = X86ISD::VSHLV; break; |
1215 | case ISD::SRA: NewOpc = X86ISD::VSRAV; break; |
1216 | case ISD::SRL: NewOpc = X86ISD::VSRLV; break; |
1217 | } |
1218 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1219 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
1220 | --I; |
1221 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1222 | ++I; |
1223 | MadeChange = true; |
1224 | continue; |
1225 | } |
1226 | case ISD::ANY_EXTEND: |
1227 | case ISD::ANY_EXTEND_VECTOR_INREG: { |
1228 | // Replace vector any extend with the zero extend equivalents so we don't |
1229 | // need 2 sets of patterns. Ignore vXi1 extensions. |
1230 | if (!N->getValueType(ResNo: 0).isVector()) |
1231 | break; |
1232 | |
1233 | unsigned NewOpc; |
1234 | if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) { |
1235 | assert(N->getOpcode() == ISD::ANY_EXTEND && |
1236 | "Unexpected opcode for mask vector!" ); |
1237 | NewOpc = ISD::SIGN_EXTEND; |
1238 | } else { |
1239 | NewOpc = N->getOpcode() == ISD::ANY_EXTEND |
1240 | ? ISD::ZERO_EXTEND |
1241 | : ISD::ZERO_EXTEND_VECTOR_INREG; |
1242 | } |
1243 | |
1244 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1245 | Operand: N->getOperand(Num: 0)); |
1246 | --I; |
1247 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1248 | ++I; |
1249 | MadeChange = true; |
1250 | continue; |
1251 | } |
1252 | case ISD::FCEIL: |
1253 | case ISD::STRICT_FCEIL: |
1254 | case ISD::FFLOOR: |
1255 | case ISD::STRICT_FFLOOR: |
1256 | case ISD::FTRUNC: |
1257 | case ISD::STRICT_FTRUNC: |
1258 | case ISD::FROUNDEVEN: |
1259 | case ISD::STRICT_FROUNDEVEN: |
1260 | case ISD::FNEARBYINT: |
1261 | case ISD::STRICT_FNEARBYINT: |
1262 | case ISD::FRINT: |
1263 | case ISD::STRICT_FRINT: { |
1264 | // Replace fp rounding with their X86 specific equivalent so we don't |
1265 | // need 2 sets of patterns. |
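      // The immediate is the ROUNDPS/VRNDSCALE control byte: bits[1:0] pick
      // the rounding mode (0 = nearest-even, 1 = down, 2 = up, 3 = truncate),
      // bit 2 selects the MXCSR rounding mode instead, and bit 3 suppresses
      // precision (inexact) exceptions.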
1266 | unsigned Imm; |
1267 | switch (N->getOpcode()) { |
1268 | default: llvm_unreachable("Unexpected opcode!" ); |
1269 | case ISD::STRICT_FCEIL: |
1270 | case ISD::FCEIL: Imm = 0xA; break; |
1271 | case ISD::STRICT_FFLOOR: |
1272 | case ISD::FFLOOR: Imm = 0x9; break; |
1273 | case ISD::STRICT_FTRUNC: |
1274 | case ISD::FTRUNC: Imm = 0xB; break; |
1275 | case ISD::STRICT_FROUNDEVEN: |
1276 | case ISD::FROUNDEVEN: Imm = 0x8; break; |
1277 | case ISD::STRICT_FNEARBYINT: |
1278 | case ISD::FNEARBYINT: Imm = 0xC; break; |
1279 | case ISD::STRICT_FRINT: |
1280 | case ISD::FRINT: Imm = 0x4; break; |
1281 | } |
1282 | SDLoc dl(N); |
1283 | bool IsStrict = N->isStrictFPOpcode(); |
1284 | SDValue Res; |
1285 | if (IsStrict) |
1286 | Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl, |
1287 | ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1288 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), |
1289 | CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)}); |
1290 | else |
1291 | Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0), |
1292 | N1: N->getOperand(Num: 0), |
1293 | N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
1294 | --I; |
1295 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1296 | ++I; |
1297 | MadeChange = true; |
1298 | continue; |
1299 | } |
1300 | case X86ISD::FANDN: |
1301 | case X86ISD::FAND: |
1302 | case X86ISD::FOR: |
1303 | case X86ISD::FXOR: { |
1304 | // Widen scalar fp logic ops to vector to reduce isel patterns. |
1305 | // FIXME: Can we do this during lowering/combine. |
1306 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1307 | if (VT.isVector() || VT == MVT::f128) |
1308 | break; |
1309 | |
1310 | MVT VecVT = VT == MVT::f64 ? MVT::v2f64 |
1311 | : VT == MVT::f32 ? MVT::v4f32 |
1312 | : MVT::v8f16; |
1313 | |
1314 | SDLoc dl(N); |
1315 | SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1316 | Operand: N->getOperand(Num: 0)); |
1317 | SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1318 | Operand: N->getOperand(Num: 1)); |
1319 | |
1320 | SDValue Res; |
1321 | if (Subtarget->hasSSE2()) { |
1322 | EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); |
1323 | Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0); |
1324 | Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1); |
1325 | unsigned Opc; |
1326 | switch (N->getOpcode()) { |
1327 | default: llvm_unreachable("Unexpected opcode!" ); |
1328 | case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; |
1329 | case X86ISD::FAND: Opc = ISD::AND; break; |
1330 | case X86ISD::FOR: Opc = ISD::OR; break; |
1331 | case X86ISD::FXOR: Opc = ISD::XOR; break; |
1332 | } |
1333 | Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1); |
1334 | Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res); |
1335 | } else { |
1336 | Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1); |
1337 | } |
1338 | Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res, |
1339 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1340 | --I; |
1341 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1342 | ++I; |
1343 | MadeChange = true; |
1344 | continue; |
1345 | } |
1346 | } |
1347 | |
1348 | if (OptLevel != CodeGenOptLevel::None && |
1349 | // Only do this when the target can fold the load into the call or |
1350 | // jmp. |
1351 | !Subtarget->useIndirectThunkCalls() && |
1352 | ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || |
1353 | (N->getOpcode() == X86ISD::TC_RETURN && |
1354 | (Subtarget->is64Bit() || |
1355 | !getTargetMachine().isPositionIndependent())))) { |
1356 | /// Also try moving call address load from outside callseq_start to just |
1357 | /// before the call to allow it to be folded. |
1358 | /// |
1359 | /// [Load chain] |
1360 | /// ^ |
1361 | /// | |
1362 | /// [Load] |
1363 | /// ^ ^ |
1364 | /// | | |
1365 | /// / \-- |
1366 | /// / | |
1367 | ///[CALLSEQ_START] | |
1368 | /// ^ | |
1369 | /// | | |
1370 | /// [LOAD/C2Reg] | |
1371 | /// | | |
1372 | /// \ / |
1373 | /// \ / |
1374 | /// [CALL] |
1375 | bool HasCallSeq = N->getOpcode() == X86ISD::CALL; |
1376 | SDValue Chain = N->getOperand(Num: 0); |
1377 | SDValue Load = N->getOperand(Num: 1); |
1378 | if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq)) |
1379 | continue; |
1380 | moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain); |
1381 | ++NumLoadMoved; |
1382 | MadeChange = true; |
1383 | continue; |
1384 | } |
1385 | |
    // Lower fpround and fpextend nodes that target the FP stack to a store and
    // load through the stack. This is a gross hack. We would like to simply mark
1388 | // these as being illegal, but when we do that, legalize produces these when |
1389 | // it expands calls, then expands these in the same legalize pass. We would |
1390 | // like dag combine to be able to hack on these between the call expansion |
1391 | // and the node legalization. As such this pass basically does "really |
1392 | // late" legalization of these inline with the X86 isel pass. |
1393 | // FIXME: This should only happen when not compiled with -O0. |
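// Illustrative example (sketch, not from the original source): on a target
// where f32 lives in SSE registers but f64 is still kept on the x87 stack, an
// f32 -> f64 FP_EXTEND is rewritten below as an f32 store to a stack
// temporary followed by an f32 -> f64 extending load from that slot.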
1394 | switch (N->getOpcode()) { |
1395 | default: continue; |
1396 | case ISD::FP_ROUND: |
1397 | case ISD::FP_EXTEND: |
1398 | { |
1399 | MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType(); |
1400 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1401 | |
1402 | // If any of the sources are vectors, no fp stack involved. |
1403 | if (SrcVT.isVector() || DstVT.isVector()) |
1404 | continue; |
1405 | |
1406 | // If the source and destination are SSE registers, then this is a legal |
1407 | // conversion that should not be lowered. |
1408 | const X86TargetLowering *X86Lowering = |
1409 | static_cast<const X86TargetLowering *>(TLI); |
1410 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1411 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1412 | if (SrcIsSSE && DstIsSSE) |
1413 | continue; |
1414 | |
1415 | if (!SrcIsSSE && !DstIsSSE) { |
1416 | // If this is an FPStack extension, it is a noop. |
1417 | if (N->getOpcode() == ISD::FP_EXTEND) |
1418 | continue; |
1419 | // If this is a value-preserving FPStack truncation, it is a noop. |
1420 | if (N->getConstantOperandVal(Num: 1)) |
1421 | continue; |
1422 | } |
1423 | |
1424 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1425 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1426 | // operations. Based on this, decide what we want to do. |
1427 | MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; |
1428 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1429 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1430 | MachinePointerInfo MPI = |
1431 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1432 | SDLoc dl(N); |
1433 | |
1434 | // FIXME: optimize the case where the src/dest is a load or store? |
1435 | |
1436 | SDValue Store = CurDAG->getTruncStore( |
1437 | Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT); |
1438 | SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store, |
1439 | Ptr: MemTmp, PtrInfo: MPI, MemVT); |
1440 | |
1441 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
// extload we created. This will cause general havoc on the dag because
1443 | // anything below the conversion could be folded into other existing nodes. |
1444 | // To avoid invalidating 'I', back it up to the convert node. |
1445 | --I; |
1446 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result); |
1447 | break; |
1448 | } |
1449 | |
// The sequence of events for lowering STRICT_FP versions of these nodes
// requires dealing with the chain differently, as there is already a
// preexisting chain.
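// Illustrative flow (sketch, not from the original source): for a
// STRICT_FP_ROUND from an x87 value to an SSE f32, the incoming chain feeds
// an X86ISD::FST of the source into a stack slot, and the result is a plain
// f32 load of that slot chained on the store, preserving the strict-FP
// ordering.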
1452 | case ISD::STRICT_FP_ROUND: |
1453 | case ISD::STRICT_FP_EXTEND: |
1454 | { |
1455 | MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType(); |
1456 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1457 | |
1458 | // If any of the sources are vectors, no fp stack involved. |
1459 | if (SrcVT.isVector() || DstVT.isVector()) |
1460 | continue; |
1461 | |
1462 | // If the source and destination are SSE registers, then this is a legal |
1463 | // conversion that should not be lowered. |
1464 | const X86TargetLowering *X86Lowering = |
1465 | static_cast<const X86TargetLowering *>(TLI); |
1466 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1467 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1468 | if (SrcIsSSE && DstIsSSE) |
1469 | continue; |
1470 | |
1471 | if (!SrcIsSSE && !DstIsSSE) { |
1472 | // If this is an FPStack extension, it is a noop. |
1473 | if (N->getOpcode() == ISD::STRICT_FP_EXTEND) |
1474 | continue; |
1475 | // If this is a value-preserving FPStack truncation, it is a noop. |
1476 | if (N->getConstantOperandVal(Num: 2)) |
1477 | continue; |
1478 | } |
1479 | |
1480 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1481 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1482 | // operations. Based on this, decide what we want to do. |
1483 | MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; |
1484 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1485 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1486 | MachinePointerInfo MPI = |
1487 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1488 | SDLoc dl(N); |
1489 | |
1490 | // FIXME: optimize the case where the src/dest is a load or store? |
1491 | |
// Since the operation is StrictFP, use the preexisting chain.
1493 | SDValue Store, Result; |
1494 | if (!SrcIsSSE) { |
1495 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Other); |
1496 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp}; |
1497 | Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT, |
1498 | PtrInfo: MPI, /*Align*/ Alignment: std::nullopt, |
1499 | Flags: MachineMemOperand::MOStore); |
1500 | if (N->getFlags().hasNoFPExcept()) { |
1501 | SDNodeFlags Flags = Store->getFlags(); |
1502 | Flags.setNoFPExcept(true); |
1503 | Store->setFlags(Flags); |
1504 | } |
1505 | } else { |
1506 | assert(SrcVT == MemVT && "Unexpected VT!" ); |
1507 | Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp, |
1508 | PtrInfo: MPI); |
1509 | } |
1510 | |
1511 | if (!DstIsSSE) { |
1512 | SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other); |
1513 | SDValue Ops[] = {Store, MemTmp}; |
1514 | Result = CurDAG->getMemIntrinsicNode( |
1515 | Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI, |
1516 | /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad); |
1517 | if (N->getFlags().hasNoFPExcept()) { |
1518 | SDNodeFlags Flags = Result->getFlags(); |
1519 | Flags.setNoFPExcept(true); |
1520 | Result->setFlags(Flags); |
1521 | } |
1522 | } else { |
1523 | assert(DstVT == MemVT && "Unexpected VT!" ); |
1524 | Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI); |
1525 | } |
1526 | |
// We're about to replace all uses of the STRICT_FP_ROUND/STRICT_FP_EXTEND
// with the load we created. This will cause general havoc on the dag because
1529 | // anything below the conversion could be folded into other existing nodes. |
1530 | // To avoid invalidating 'I', back it up to the convert node. |
1531 | --I; |
1532 | CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode()); |
1533 | break; |
1534 | } |
1535 | } |
1536 | |
1537 | |
1538 | // Now that we did that, the node is dead. Increment the iterator to the |
1539 | // next node to process, then delete N. |
1540 | ++I; |
1541 | MadeChange = true; |
1542 | } |
1543 | |
1544 | // Remove any dead nodes that may have been left behind. |
1545 | if (MadeChange) |
1546 | CurDAG->RemoveDeadNodes(); |
1547 | } |
1548 | |
1549 | // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. |
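// Illustrative pattern (sketch, not from the original source):
//   t1 = MOVZX32rr8_NOREX ...        ; extend the 8-bit remainder/quotient
//   t2 = EXTRACT_SUBREG t1, sub_8bit
//   t3 = MOVZX32rr8 t2               ; redundant second extension
// t3 can reuse t1 directly (or a MOVSX64rr32 of t1 for the 64-bit case).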
1550 | bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { |
1551 | unsigned Opc = N->getMachineOpcode(); |
1552 | if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && |
1553 | Opc != X86::MOVSX64rr8) |
1554 | return false; |
1555 | |
1556 | SDValue N0 = N->getOperand(Num: 0); |
1557 | |
// We need to be extracting the low byte of an extend.
1559 | if (!N0.isMachineOpcode() || |
1560 | N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || |
1561 | N0.getConstantOperandVal(i: 1) != X86::sub_8bit) |
1562 | return false; |
1563 | |
1564 | // We're looking for either a movsx or movzx to match the original opcode. |
1565 | unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX |
1566 | : X86::MOVSX32rr8_NOREX; |
1567 | SDValue N00 = N0.getOperand(i: 0); |
1568 | if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) |
1569 | return false; |
1570 | |
1571 | if (Opc == X86::MOVSX64rr8) { |
// If we had a sign extend from 8 to 64 bits, we still need to go from 32
// to 64.
1574 | MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N), |
1575 | VT: MVT::i64, Op1: N00); |
1576 | ReplaceUses(F: N, T: Extend); |
1577 | } else { |
1578 | // Ok we can drop this extend and just use the original extend. |
1579 | ReplaceUses(F: N, T: N00.getNode()); |
1580 | } |
1581 | |
1582 | return true; |
1583 | } |
1584 | |
1585 | void X86DAGToDAGISel::PostprocessISelDAG() { |
1586 | // Skip peepholes at -O0. |
1587 | if (TM.getOptLevel() == CodeGenOptLevel::None) |
1588 | return; |
1589 | |
1590 | SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); |
1591 | |
1592 | bool MadeChange = false; |
1593 | while (Position != CurDAG->allnodes_begin()) { |
1594 | SDNode *N = &*--Position; |
1595 | // Skip dead nodes and any non-machine opcodes. |
1596 | if (N->use_empty() || !N->isMachineOpcode()) |
1597 | continue; |
1598 | |
1599 | if (tryOptimizeRem8Extend(N)) { |
1600 | MadeChange = true; |
1601 | continue; |
1602 | } |
1603 | |
1604 | unsigned Opc = N->getMachineOpcode(); |
1605 | switch (Opc) { |
1606 | default: |
1607 | continue; |
// ANDrr/rm + TESTrr -> TESTrr/TESTmr
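// Illustrative example (sketch): when (AND8rr a, b) is used only as both
// operands of (TEST8rr x, x) and its flag result is otherwise unused, the
// pair becomes TEST8rr a, b; the rm forms fold the load and become TEST8mr.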
1609 | case X86::TEST8rr: |
1610 | case X86::TEST16rr: |
1611 | case X86::TEST32rr: |
1612 | case X86::TEST64rr: |
1613 | // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr |
1614 | case X86::CTEST8rr: |
1615 | case X86::CTEST16rr: |
1616 | case X86::CTEST32rr: |
1617 | case X86::CTEST64rr: { |
1618 | auto &Op0 = N->getOperand(Num: 0); |
1619 | if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) || |
1620 | !Op0.isMachineOpcode()) |
1621 | continue; |
1622 | SDValue And = N->getOperand(Num: 0); |
1623 | #define CASE_ND(OP) \ |
1624 | case X86::OP: \ |
1625 | case X86::OP##_ND: |
1626 | switch (And.getMachineOpcode()) { |
1627 | default: |
1628 | continue; |
1629 | CASE_ND(AND8rr) |
1630 | CASE_ND(AND16rr) |
1631 | CASE_ND(AND32rr) |
1632 | CASE_ND(AND64rr) { |
1633 | if (And->hasAnyUseOfValue(Value: 1)) |
1634 | continue; |
1635 | SmallVector<SDValue> Ops(N->op_values()); |
1636 | Ops[0] = And.getOperand(i: 0); |
1637 | Ops[1] = And.getOperand(i: 1); |
1638 | MachineSDNode *Test = |
1639 | CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops); |
1640 | ReplaceUses(F: N, T: Test); |
1641 | MadeChange = true; |
1642 | continue; |
1643 | } |
1644 | CASE_ND(AND8rm) |
1645 | CASE_ND(AND16rm) |
1646 | CASE_ND(AND32rm) |
1647 | CASE_ND(AND64rm) { |
1648 | if (And->hasAnyUseOfValue(Value: 1)) |
1649 | continue; |
1650 | unsigned NewOpc; |
1651 | bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc); |
1652 | #define FROM_TO(A, B) \ |
1653 | CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \ |
1654 | break; |
1655 | switch (And.getMachineOpcode()) { |
1656 | FROM_TO(AND8rm, TEST8mr); |
1657 | FROM_TO(AND16rm, TEST16mr); |
1658 | FROM_TO(AND32rm, TEST32mr); |
1659 | FROM_TO(AND64rm, TEST64mr); |
1660 | } |
1661 | #undef FROM_TO |
1662 | #undef CASE_ND |
1663 | // Need to swap the memory and register operand. |
1664 | SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2), |
1665 | And.getOperand(i: 3), And.getOperand(i: 4), |
1666 | And.getOperand(i: 5), And.getOperand(i: 0)}; |
1667 | // CC, Cflags. |
1668 | if (IsCTESTCC) { |
1669 | Ops.push_back(Elt: N->getOperand(Num: 2)); |
1670 | Ops.push_back(Elt: N->getOperand(Num: 3)); |
1671 | } |
1672 | // Chain of memory load |
1673 | Ops.push_back(Elt: And.getOperand(i: 6)); |
1674 | // Glue |
1675 | if (IsCTESTCC) |
1676 | Ops.push_back(Elt: N->getOperand(Num: 4)); |
1677 | |
1678 | MachineSDNode *Test = CurDAG->getMachineNode( |
1679 | Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops); |
1680 | CurDAG->setNodeMemRefs( |
1681 | N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands()); |
1682 | ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1)); |
1683 | ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0)); |
1684 | MadeChange = true; |
1685 | continue; |
1686 | } |
1687 | } |
1688 | } |
1689 | // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is |
1690 | // used. We're doing this late so we can prefer to fold the AND into masked |
1691 | // comparisons. Doing that can be better for the live range of the mask |
1692 | // register. |
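// Illustrative example (sketch): a KANDWkk of k1 and k2 whose only use is a
// KORTESTWkk of that result with itself becomes KTESTWkk k1, k2 when only
// the zero flag is consumed.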
1693 | case X86::KORTESTBkk: |
1694 | case X86::KORTESTWkk: |
1695 | case X86::KORTESTDkk: |
1696 | case X86::KORTESTQkk: { |
1697 | SDValue Op0 = N->getOperand(Num: 0); |
1698 | if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) || |
1699 | !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0))) |
1700 | continue; |
1701 | #define CASE(A) \ |
1702 | case X86::A: \ |
1703 | break; |
1704 | switch (Op0.getMachineOpcode()) { |
1705 | default: |
1706 | continue; |
1707 | CASE(KANDBkk) |
1708 | CASE(KANDWkk) |
1709 | CASE(KANDDkk) |
1710 | CASE(KANDQkk) |
1711 | } |
1712 | unsigned NewOpc; |
1713 | #define FROM_TO(A, B) \ |
1714 | case X86::A: \ |
1715 | NewOpc = X86::B; \ |
1716 | break; |
1717 | switch (Opc) { |
1718 | FROM_TO(KORTESTBkk, KTESTBkk) |
1719 | FROM_TO(KORTESTWkk, KTESTWkk) |
1720 | FROM_TO(KORTESTDkk, KTESTDkk) |
1721 | FROM_TO(KORTESTQkk, KTESTQkk) |
1722 | } |
1723 | // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other |
1724 | // KAND instructions and KTEST use the same ISA feature. |
1725 | if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI()) |
1726 | continue; |
1727 | #undef FROM_TO |
1728 | MachineSDNode *KTest = CurDAG->getMachineNode( |
1729 | Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1)); |
1730 | ReplaceUses(F: N, T: KTest); |
1731 | MadeChange = true; |
1732 | continue; |
1733 | } |
// Attempt to remove vector moves that were inserted to zero upper bits.
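// Illustrative example (sketch): (SUBREG_TO_REG 0, (VMOVAPSrr x), sub_xmm)
// where x is produced by a VEX/EVEX/XOP-encoded instruction (which already
// zeroes the upper bits itself) can simply use x and drop the move.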
1735 | case TargetOpcode::SUBREG_TO_REG: { |
1736 | unsigned SubRegIdx = N->getConstantOperandVal(Num: 2); |
1737 | if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) |
1738 | continue; |
1739 | |
1740 | SDValue Move = N->getOperand(Num: 1); |
1741 | if (!Move.isMachineOpcode()) |
1742 | continue; |
1743 | |
// Make sure it's one of the move opcodes we recognize.
1745 | switch (Move.getMachineOpcode()) { |
1746 | default: |
1747 | continue; |
1748 | CASE(VMOVAPDrr) CASE(VMOVUPDrr) |
1749 | CASE(VMOVAPSrr) CASE(VMOVUPSrr) |
1750 | CASE(VMOVDQArr) CASE(VMOVDQUrr) |
1751 | CASE(VMOVAPDYrr) CASE(VMOVUPDYrr) |
1752 | CASE(VMOVAPSYrr) CASE(VMOVUPSYrr) |
1753 | CASE(VMOVDQAYrr) CASE(VMOVDQUYrr) |
1754 | CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr) |
1755 | CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr) |
1756 | CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr) |
1757 | CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr) |
1758 | CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr) |
1759 | CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr) |
1760 | CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr) |
1761 | CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr) |
1762 | } |
1763 | #undef CASE |
1764 | |
1765 | SDValue In = Move.getOperand(i: 0); |
1766 | if (!In.isMachineOpcode() || |
1767 | In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) |
1768 | continue; |
1769 | |
1770 | // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers |
1771 | // the SHA instructions which use a legacy encoding. |
1772 | uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags; |
1773 | if ((TSFlags & X86II::EncodingMask) != X86II::VEX && |
1774 | (TSFlags & X86II::EncodingMask) != X86II::EVEX && |
1775 | (TSFlags & X86II::EncodingMask) != X86II::XOP) |
1776 | continue; |
1777 | |
1778 | // Producing instruction is another vector instruction. We can drop the |
1779 | // move. |
1780 | CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2)); |
1781 | MadeChange = true; |
1782 | } |
1783 | } |
1784 | } |
1785 | |
1786 | if (MadeChange) |
1787 | CurDAG->RemoveDeadNodes(); |
1788 | } |
1789 | |
1790 | |
1791 | /// Emit any code that needs to be executed only in the main function. |
1792 | void X86DAGToDAGISel::emitSpecialCodeForMain() { |
1793 | if (Subtarget->isTargetCygMing()) { |
1794 | TargetLowering::ArgListTy Args; |
1795 | auto &DL = CurDAG->getDataLayout(); |
1796 | |
1797 | TargetLowering::CallLoweringInfo CLI(*CurDAG); |
1798 | CLI.setChain(CurDAG->getRoot()) |
1799 | .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()), |
1800 | Target: CurDAG->getExternalSymbol(Sym: "__main" , VT: TLI->getPointerTy(DL)), |
1801 | ArgsList: std::move(Args)); |
1802 | const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); |
1803 | std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); |
1804 | CurDAG->setRoot(Result.second); |
1805 | } |
1806 | } |
1807 | |
1808 | void X86DAGToDAGISel::emitFunctionEntryCode() { |
1809 | // If this is main, emit special code for main. |
1810 | const Function &F = MF->getFunction(); |
1811 | if (F.hasExternalLinkage() && F.getName() == "main" ) |
1812 | emitSpecialCodeForMain(); |
1813 | } |
1814 | |
1815 | static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { |
1816 | // We can run into an issue where a frame index or a register base |
1817 | // includes a displacement that, when added to the explicit displacement, |
1818 | // will overflow the displacement field. Assuming that the |
1819 | // displacement fits into a 31-bit integer (which is only slightly more |
1820 | // aggressive than the current fundamental assumption that it fits into |
1821 | // a 32-bit integer), a 31-bit disp should always be safe. |
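// Worked check (illustrative): two contributions that each fit in a signed
// 31-bit range sum to a value that still fits the signed 32-bit displacement
// field, which is why the isInt<31> test below is sufficient.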
1822 | return isInt<31>(x: Val); |
1823 | } |
1824 | |
1825 | bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, |
1826 | X86ISelAddressMode &AM) { |
1827 | // We may have already matched a displacement and the caller just added the |
1828 | // symbolic displacement. So we still need to do the checks even if Offset |
1829 | // is zero. |
1830 | |
1831 | int64_t Val = AM.Disp + Offset; |
1832 | |
1833 | // Cannot combine ExternalSymbol displacements with integer offsets. |
1834 | if (Val != 0 && (AM.ES || AM.MCSym)) |
1835 | return true; |
1836 | |
1837 | CodeModel::Model M = TM.getCodeModel(); |
1838 | if (Subtarget->is64Bit()) { |
1839 | if (Val != 0 && |
1840 | !X86::isOffsetSuitableForCodeModel(Offset: Val, M, |
1841 | hasSymbolicDisplacement: AM.hasSymbolicDisplacement())) |
1842 | return true; |
1843 | // In addition to the checks required for a register base, check that |
1844 | // we do not try to use an unsafe Disp with a frame index. |
1845 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && |
1846 | !isDispSafeForFrameIndexOrRegBase(Val)) |
1847 | return true; |
1848 | // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to |
1849 | // 64 bits. Instructions with 32-bit register addresses perform this zero |
1850 | // extension for us and we can safely ignore the high bits of Offset. |
1851 | // Instructions with only a 32-bit immediate address do not, though: they |
// sign extend instead. This means only the low 2GB of the address space is
// directly addressable; we need indirect addressing for the high 2GB of
// address space.
1855 | // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the |
1856 | // implicit zero extension of instructions would cover up any problem. |
1857 | // However, we have asserts elsewhere that get triggered if we do, so keep |
1858 | // the checks for now. |
1859 | // TODO: We would actually be able to accept these, as well as the same |
1860 | // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand |
1861 | // to get an address size override to be emitted. However, this |
1862 | // pseudo-register is not part of any register class and therefore causes |
1863 | // MIR verification to fail. |
1864 | if (Subtarget->isTarget64BitILP32() && |
1865 | !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) && |
1866 | !AM.hasBaseOrIndexReg()) |
1867 | return true; |
1868 | } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) |
1869 | // For 32-bit X86, make sure the displacement still isn't close to the |
1870 | // expressible limit. |
1871 | return true; |
1872 | AM.Disp = Val; |
1873 | return false; |
1874 | } |
1875 | |
1876 | bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
1877 | bool AllowSegmentRegForX32) { |
1878 | SDValue Address = N->getOperand(Num: 1); |
1879 | |
1880 | // load gs:0 -> GS segment register. |
1881 | // load fs:0 -> FS segment register. |
1882 | // |
1883 | // This optimization is generally valid because the GNU TLS model defines that |
1884 | // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode |
1885 | // with 32-bit registers, as we get in ILP32 mode, those registers are first |
// zero-extended to 64 bits and then added to the base address, which gives
1887 | // unwanted results when the register holds a negative value. |
1888 | // For more information see http://people.redhat.com/drepper/tls.pdf |
1889 | if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr && |
1890 | !IndirectTlsSegRefs && |
1891 | (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || |
1892 | Subtarget->isTargetFuchsia())) { |
1893 | if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) |
1894 | return true; |
1895 | switch (N->getPointerInfo().getAddrSpace()) { |
1896 | case X86AS::GS: |
1897 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
1898 | return false; |
1899 | case X86AS::FS: |
1900 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
1901 | return false; |
1902 | // Address space X86AS::SS is not handled here, because it is not used to |
1903 | // address TLS areas. |
1904 | } |
1905 | } |
1906 | |
1907 | return true; |
1908 | } |
1909 | |
1910 | /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing |
1911 | /// mode. These wrap things that will resolve down into a symbol reference. |
1912 | /// If no match is possible, this returns true, otherwise it returns false. |
1913 | bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { |
1914 | // If the addressing mode already has a symbol as the displacement, we can |
1915 | // never match another symbol. |
1916 | if (AM.hasSymbolicDisplacement()) |
1917 | return true; |
1918 | |
1919 | bool IsRIPRelTLS = false; |
1920 | bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; |
1921 | if (IsRIPRel) { |
1922 | SDValue Val = N.getOperand(i: 0); |
1923 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
1924 | IsRIPRelTLS = true; |
1925 | } |
1926 | |
1927 | // We can't use an addressing mode in the 64-bit large code model. |
1928 | // Global TLS addressing is an exception. In the medium code model, |
// we can use a mode when RIP wrappers are present.
1930 | // That signifies access to globals that are known to be "near", |
1931 | // such as the GOT itself. |
1932 | CodeModel::Model M = TM.getCodeModel(); |
1933 | if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS) |
1934 | return true; |
1935 | |
1936 | // Base and index reg must be 0 in order to use %rip as base. |
1937 | if (IsRIPRel && AM.hasBaseOrIndexReg()) |
1938 | return true; |
1939 | |
1940 | // Make a local copy in case we can't do this fold. |
1941 | X86ISelAddressMode Backup = AM; |
1942 | |
1943 | int64_t Offset = 0; |
1944 | SDValue N0 = N.getOperand(i: 0); |
1945 | if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) { |
1946 | AM.GV = G->getGlobal(); |
1947 | AM.SymbolFlags = G->getTargetFlags(); |
1948 | Offset = G->getOffset(); |
1949 | } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) { |
1950 | AM.CP = CP->getConstVal(); |
1951 | AM.Alignment = CP->getAlign(); |
1952 | AM.SymbolFlags = CP->getTargetFlags(); |
1953 | Offset = CP->getOffset(); |
1954 | } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) { |
1955 | AM.ES = S->getSymbol(); |
1956 | AM.SymbolFlags = S->getTargetFlags(); |
1957 | } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) { |
1958 | AM.MCSym = S->getMCSymbol(); |
1959 | } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) { |
1960 | AM.JT = J->getIndex(); |
1961 | AM.SymbolFlags = J->getTargetFlags(); |
1962 | } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) { |
1963 | AM.BlockAddr = BA->getBlockAddress(); |
1964 | AM.SymbolFlags = BA->getTargetFlags(); |
1965 | Offset = BA->getOffset(); |
1966 | } else |
1967 | llvm_unreachable("Unhandled symbol reference node." ); |
1968 | |
1969 | // Can't use an addressing mode with large globals. |
1970 | if (Subtarget->is64Bit() && !IsRIPRel && AM.GV && |
1971 | TM.isLargeGlobalValue(GV: AM.GV)) { |
1972 | AM = Backup; |
1973 | return true; |
1974 | } |
1975 | |
1976 | if (foldOffsetIntoAddress(Offset, AM)) { |
1977 | AM = Backup; |
1978 | return true; |
1979 | } |
1980 | |
1981 | if (IsRIPRel) |
1982 | AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64)); |
1983 | |
1984 | // Commit the changes now that we know this fold is safe. |
1985 | return false; |
1986 | } |
1987 | |
1988 | /// Add the specified node to the specified addressing mode, returning true if |
1989 | /// it cannot be done. This just pattern matches for the addressing mode. |
1990 | bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { |
1991 | if (matchAddressRecursively(N, AM, Depth: 0)) |
1992 | return true; |
1993 | |
1994 | // Post-processing: Make a second attempt to fold a load, if we now know |
1995 | // that there will not be any other register. This is only performed for |
1996 | // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded |
1997 | // any foldable load the first time. |
1998 | if (Subtarget->isTarget64BitILP32() && |
1999 | AM.BaseType == X86ISelAddressMode::RegBase && |
2000 | AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { |
2001 | SDValue Save_Base_Reg = AM.Base_Reg; |
2002 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) { |
2003 | AM.Base_Reg = SDValue(); |
2004 | if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true)) |
2005 | AM.Base_Reg = Save_Base_Reg; |
2006 | } |
2007 | } |
2008 | |
2009 | // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has |
2010 | // a smaller encoding and avoids a scaled-index. |
2011 | if (AM.Scale == 2 && |
2012 | AM.BaseType == X86ISelAddressMode::RegBase && |
2013 | AM.Base_Reg.getNode() == nullptr) { |
2014 | AM.Base_Reg = AM.IndexReg; |
2015 | AM.Scale = 1; |
2016 | } |
2017 | |
2018 | // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, |
2019 | // because it has a smaller encoding. |
2020 | if (TM.getCodeModel() != CodeModel::Large && |
2021 | (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() && |
2022 | AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && |
2023 | AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && |
2024 | AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) { |
2025 | // However, when GV is a local function symbol and in the same section as |
2026 | // the current instruction, and AM.Disp is negative and near INT32_MIN, |
2027 | // referencing GV+Disp generates a relocation referencing the section symbol |
2028 | // with an even smaller offset, which might underflow. We should bail out if |
2029 | // the negative offset is too close to INT32_MIN. Actually, we are more |
2030 | // conservative here, using a smaller magic number also used by |
2031 | // isOffsetSuitableForCodeModel. |
2032 | if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024) |
2033 | return true; |
2034 | |
2035 | AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64); |
2036 | } |
2037 | |
2038 | return false; |
2039 | } |
2040 | |
2041 | bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, |
2042 | unsigned Depth) { |
2043 | // Add an artificial use to this node so that we can keep track of |
2044 | // it if it gets CSE'd with a different node. |
2045 | HandleSDNode Handle(N); |
2046 | |
2047 | X86ISelAddressMode Backup = AM; |
2048 | if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) && |
2049 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1)) |
2050 | return false; |
2051 | AM = Backup; |
2052 | |
2053 | // Try again after commutating the operands. |
2054 | if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2055 | Depth: Depth + 1) && |
2056 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1)) |
2057 | return false; |
2058 | AM = Backup; |
2059 | |
2060 | // If we couldn't fold both operands into the address at the same time, |
2061 | // see if we can just put each operand into a register and fold at least |
2062 | // the add. |
2063 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2064 | !AM.Base_Reg.getNode() && |
2065 | !AM.IndexReg.getNode()) { |
2066 | N = Handle.getValue(); |
2067 | AM.Base_Reg = N.getOperand(i: 0); |
2068 | AM.IndexReg = N.getOperand(i: 1); |
2069 | AM.Scale = 1; |
2070 | return false; |
2071 | } |
2072 | N = Handle.getValue(); |
2073 | return true; |
2074 | } |
2075 | |
2076 | // Insert a node into the DAG at least before the Pos node's position. This |
2077 | // will reposition the node as needed, and will assign it a node ID that is <= |
2078 | // the Pos node's ID. Note that this does *not* preserve the uniqueness of node |
2079 | // IDs! The selection DAG must no longer depend on their uniqueness when this |
2080 | // is used. |
2081 | static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { |
2082 | if (N->getNodeId() == -1 || |
2083 | (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) > |
2084 | SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) { |
2085 | DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode()); |
// Mark N as invalid for pruning, as after this it may be a successor to a
// selected node but otherwise be in the same position as Pos.
2088 | // Conservatively mark it with the same -abs(Id) to assure node id |
2089 | // invariant is preserved. |
2090 | N->setNodeId(Pos->getNodeId()); |
2091 | SelectionDAGISel::InvalidateNodeId(N: N.getNode()); |
2092 | } |
2093 | } |
2094 | |
2095 | // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if |
2096 | // safe. This allows us to convert the shift and and into an h-register |
2097 | // extract and a scaled index. Returns false if the simplification is |
2098 | // performed. |
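// Illustrative instance (sketch): with C1 == 2, (X >> 6) & 0x3fc becomes
// ((X >> 8) & 0xff) << 2; the extract maps to an h-register read and the
// << 2 is absorbed as a scale of 4 in the addressing mode.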
2099 | static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, |
2100 | uint64_t Mask, |
2101 | SDValue Shift, SDValue X, |
2102 | X86ISelAddressMode &AM) { |
2103 | if (Shift.getOpcode() != ISD::SRL || |
2104 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2105 | !Shift.hasOneUse()) |
2106 | return true; |
2107 | |
2108 | int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1); |
2109 | if (ScaleLog <= 0 || ScaleLog >= 4 || |
2110 | Mask != (0xffu << ScaleLog)) |
2111 | return true; |
2112 | |
2113 | MVT XVT = X.getSimpleValueType(); |
2114 | MVT VT = N.getSimpleValueType(); |
2115 | SDLoc DL(N); |
2116 | SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8); |
2117 | SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT); |
2118 | SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight); |
2119 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask); |
2120 | SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT); |
2121 | SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8); |
2122 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount); |
2123 | |
2124 | // Insert the new nodes into the topological ordering. We must do this in |
2125 | // a valid topological ordering as nothing is going to go back and re-sort |
2126 | // these nodes. We continually insert before 'N' in sequence as this is |
2127 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2128 | // hierarchy left to express. |
2129 | insertDAGNode(DAG, Pos: N, N: Eight); |
2130 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2131 | insertDAGNode(DAG, Pos: N, N: Srl); |
2132 | insertDAGNode(DAG, Pos: N, N: And); |
2133 | insertDAGNode(DAG, Pos: N, N: Ext); |
2134 | insertDAGNode(DAG, Pos: N, N: ShlCount); |
2135 | insertDAGNode(DAG, Pos: N, N: Shl); |
2136 | DAG.ReplaceAllUsesWith(From: N, To: Shl); |
2137 | DAG.RemoveDeadNode(N: N.getNode()); |
2138 | AM.IndexReg = Ext; |
2139 | AM.Scale = (1 << ScaleLog); |
2140 | return false; |
2141 | } |
2142 | |
2143 | // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this |
2144 | // allows us to fold the shift into this addressing mode. Returns false if the |
2145 | // transform succeeded. |
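// Illustrative instance (sketch): (X << 2) & 0x3fc becomes (X & 0xff) << 2,
// so the shift is matched as a scale of 4 while the AND keeps a smaller
// immediate.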
2146 | static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, |
2147 | X86ISelAddressMode &AM) { |
2148 | SDValue Shift = N.getOperand(i: 0); |
2149 | |
2150 | // Use a signed mask so that shifting right will insert sign bits. These |
2151 | // bits will be removed when we shift the result left so it doesn't matter |
2152 | // what we use. This might allow a smaller immediate encoding. |
2153 | int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue(); |
2154 | |
2155 | // If we have an any_extend feeding the AND, look through it to see if there |
2156 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
2157 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
2158 | bool FoundAnyExtend = false; |
2159 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
2160 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
2161 | isUInt<32>(x: Mask)) { |
2162 | FoundAnyExtend = true; |
2163 | Shift = Shift.getOperand(i: 0); |
2164 | } |
2165 | |
2166 | if (Shift.getOpcode() != ISD::SHL || |
2167 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2168 | return true; |
2169 | |
2170 | SDValue X = Shift.getOperand(i: 0); |
2171 | |
2172 | // Not likely to be profitable if either the AND or SHIFT node has more |
2173 | // than one use (unless all uses are for address computation). Besides, |
2174 | // isel mechanism requires their node ids to be reused. |
2175 | if (!N.hasOneUse() || !Shift.hasOneUse()) |
2176 | return true; |
2177 | |
2178 | // Verify that the shift amount is something we can fold. |
2179 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2180 | if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) |
2181 | return true; |
2182 | |
2183 | MVT VT = N.getSimpleValueType(); |
2184 | SDLoc DL(N); |
2185 | if (FoundAnyExtend) { |
2186 | SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X); |
2187 | insertDAGNode(DAG, Pos: N, N: NewX); |
2188 | X = NewX; |
2189 | } |
2190 | |
2191 | SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT); |
2192 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask); |
2193 | SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1)); |
2194 | |
2195 | // Insert the new nodes into the topological ordering. We must do this in |
2196 | // a valid topological ordering as nothing is going to go back and re-sort |
2197 | // these nodes. We continually insert before 'N' in sequence as this is |
2198 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2199 | // hierarchy left to express. |
2200 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2201 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2202 | insertDAGNode(DAG, Pos: N, N: NewShift); |
2203 | DAG.ReplaceAllUsesWith(From: N, To: NewShift); |
2204 | DAG.RemoveDeadNode(N: N.getNode()); |
2205 | |
2206 | AM.Scale = 1 << ShiftAmt; |
2207 | AM.IndexReg = NewAnd; |
2208 | return false; |
2209 | } |
2210 | |
2211 | // Implement some heroics to detect shifts of masked values where the mask can |
2212 | // be replaced by extending the shift and undoing that in the addressing mode |
2213 | // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and |
2214 | // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in |
2215 | // the addressing mode. This results in code such as: |
2216 | // |
2217 | // int f(short *y, int *lookup_table) { |
2218 | // ... |
2219 | // return *y + lookup_table[*y >> 11]; |
2220 | // } |
2221 | // |
2222 | // Turning into: |
2223 | // movzwl (%rdi), %eax |
2224 | // movl %eax, %ecx |
2225 | // shrl $11, %ecx |
2226 | // addl (%rsi,%rcx,4), %eax |
2227 | // |
2228 | // Instead of: |
2229 | // movzwl (%rdi), %eax |
2230 | // movl %eax, %ecx |
2231 | // shrl $9, %ecx |
2232 | // andl $124, %rcx |
2233 | // addl (%rsi,%rcx), %eax |
2234 | // |
2235 | // Note that this function assumes the mask is provided as a mask *after* the |
2236 | // value is shifted. The input chain may or may not match that, but computing |
2237 | // such a mask is trivial. |
2238 | static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, |
2239 | uint64_t Mask, |
2240 | SDValue Shift, SDValue X, |
2241 | X86ISelAddressMode &AM) { |
2242 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || |
2243 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2244 | return true; |
2245 | |
2246 | // We need to ensure that mask is a continuous run of bits. |
2247 | unsigned MaskIdx, MaskLen; |
2248 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2249 | return true; |
2250 | unsigned MaskLZ = 64 - (MaskIdx + MaskLen); |
2251 | |
2252 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2253 | |
2254 | // The amount of shift we're trying to fit into the addressing mode is taken |
2255 | // from the shifted mask index (number of trailing zeros of the mask). |
2256 | unsigned AMShiftAmt = MaskIdx; |
2257 | |
2258 | // There is nothing we can do here unless the mask is removing some bits. |
2259 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2260 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2261 | |
2262 | // Scale the leading zero count down based on the actual size of the value. |
2263 | // Also scale it down based on the size of the shift. |
2264 | unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; |
2265 | if (MaskLZ < ScaleDown) |
2266 | return true; |
2267 | MaskLZ -= ScaleDown; |
2268 | |
2269 | // The final check is to ensure that any masked out high bits of X are |
2270 | // already known to be zero. Otherwise, the mask has a semantic impact |
2271 | // other than masking out a couple of low bits. Unfortunately, because of |
2272 | // the mask, zero extensions will be removed from operands in some cases. |
2273 | // This code works extra hard to look through extensions because we can |
2274 | // replace them with zero extensions cheaply if necessary. |
2275 | bool ReplacingAnyExtend = false; |
2276 | if (X.getOpcode() == ISD::ANY_EXTEND) { |
2277 | unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - |
2278 | X.getOperand(i: 0).getSimpleValueType().getSizeInBits(); |
2279 | // Assume that we'll replace the any-extend with a zero-extend, and |
2280 | // narrow the search to the extended value. |
2281 | X = X.getOperand(i: 0); |
2282 | MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; |
2283 | ReplacingAnyExtend = true; |
2284 | } |
2285 | APInt MaskedHighBits = |
2286 | APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ); |
2287 | if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits)) |
2288 | return true; |
2289 | |
2290 | // We've identified a pattern that can be transformed into a single shift |
2291 | // and an addressing mode. Make it so. |
2292 | MVT VT = N.getSimpleValueType(); |
2293 | if (ReplacingAnyExtend) { |
2294 | assert(X.getValueType() != VT); |
2295 | // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. |
2296 | SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X); |
2297 | insertDAGNode(DAG, Pos: N, N: NewX); |
2298 | X = NewX; |
2299 | } |
2300 | |
2301 | MVT XVT = X.getSimpleValueType(); |
2302 | SDLoc DL(N); |
2303 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2304 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2305 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT); |
2306 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2307 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2308 | |
2309 | // Insert the new nodes into the topological ordering. We must do this in |
2310 | // a valid topological ordering as nothing is going to go back and re-sort |
2311 | // these nodes. We continually insert before 'N' in sequence as this is |
2312 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2313 | // hierarchy left to express. |
2314 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2315 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2316 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2317 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2318 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2319 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2320 | DAG.RemoveDeadNode(N: N.getNode()); |
2321 | |
2322 | AM.Scale = 1 << AMShiftAmt; |
2323 | AM.IndexReg = NewExt; |
2324 | return false; |
2325 | } |
2326 | |
2327 | // Transform "(X >> SHIFT) & (MASK << C1)" to |
2328 | // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be |
2329 | // matched to a BEXTR later. Returns false if the simplification is performed. |
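// Illustrative instance (sketch): with SHIFT == 5 and C1 == 2,
// (X >> 5) & (0xff << 2) becomes ((X >> 7) & 0xff) << 2; the srl+and is then
// matched to BEXTR and the trailing shl becomes a scale of 4.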
2330 | static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, |
2331 | uint64_t Mask, |
2332 | SDValue Shift, SDValue X, |
2333 | X86ISelAddressMode &AM, |
2334 | const X86Subtarget &Subtarget) { |
2335 | if (Shift.getOpcode() != ISD::SRL || |
2336 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2337 | !Shift.hasOneUse() || !N.hasOneUse()) |
2338 | return true; |
2339 | |
2340 | // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. |
2341 | if (!Subtarget.hasTBM() && |
2342 | !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) |
2343 | return true; |
2344 | |
2345 | // We need to ensure that mask is a continuous run of bits. |
2346 | unsigned MaskIdx, MaskLen; |
2347 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2348 | return true; |
2349 | |
2350 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2351 | |
2352 | // The amount of shift we're trying to fit into the addressing mode is taken |
2353 | // from the shifted mask index (number of trailing zeros of the mask). |
2354 | unsigned AMShiftAmt = MaskIdx; |
2355 | |
2356 | // There is nothing we can do here unless the mask is removing some bits. |
2357 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2358 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2359 | |
2360 | MVT XVT = X.getSimpleValueType(); |
2361 | MVT VT = N.getSimpleValueType(); |
2362 | SDLoc DL(N); |
2363 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2364 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2365 | SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT); |
2366 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask); |
2367 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT); |
2368 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2369 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2370 | |
2371 | // Insert the new nodes into the topological ordering. We must do this in |
2372 | // a valid topological ordering as nothing is going to go back and re-sort |
2373 | // these nodes. We continually insert before 'N' in sequence as this is |
2374 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2375 | // hierarchy left to express. |
2376 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2377 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2378 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2379 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2380 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2381 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2382 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2383 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2384 | DAG.RemoveDeadNode(N: N.getNode()); |
2385 | |
2386 | AM.Scale = 1 << AMShiftAmt; |
2387 | AM.IndexReg = NewExt; |
2388 | return false; |
2389 | } |
2390 | |
2391 | // Attempt to peek further into a scaled index register, collecting additional |
// extensions / offsets / etc. Returns \p N if we can't peek any further.
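// Illustrative walk (sketch): with Scale == 2, an index of (add x, 8) is
// folded as Disp += 16 and the walk recurses on x, while an index of
// (add x, x) instead doubles Scale to 4 and recurses on x.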
2393 | SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N, |
2394 | X86ISelAddressMode &AM, |
2395 | unsigned Depth) { |
2396 | assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched" ); |
2397 | assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) && |
2398 | "Illegal index scale" ); |
2399 | |
2400 | // Limit recursion. |
2401 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2402 | return N; |
2403 | |
2404 | EVT VT = N.getValueType(); |
2405 | unsigned Opc = N.getOpcode(); |
2406 | |
2407 | // index: add(x,c) -> index: x, disp + c |
2408 | if (CurDAG->isBaseWithConstantOffset(Op: N)) { |
2409 | auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1)); |
2410 | uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale; |
2411 | if (!foldOffsetIntoAddress(Offset, AM)) |
2412 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2413 | } |
2414 | |
2415 | // index: add(x,x) -> index: x, scale * 2 |
2416 | if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) { |
2417 | if (AM.Scale <= 4) { |
2418 | AM.Scale *= 2; |
2419 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2420 | } |
2421 | } |
2422 | |
2423 | // index: shl(x,i) -> index: x, scale * (1 << i) |
2424 | if (Opc == X86ISD::VSHLI) { |
2425 | uint64_t ShiftAmt = N.getConstantOperandVal(i: 1); |
2426 | uint64_t ScaleAmt = 1ULL << ShiftAmt; |
2427 | if ((AM.Scale * ScaleAmt) <= 8) { |
2428 | AM.Scale *= ScaleAmt; |
2429 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2430 | } |
2431 | } |
2432 | |
2433 | // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c) |
2434 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
2435 | if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2436 | SDValue Src = N.getOperand(i: 0); |
2437 | if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() && |
2438 | Src.hasOneUse()) { |
2439 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2440 | SDValue AddSrc = Src.getOperand(i: 0); |
2441 | auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1)); |
2442 | int64_t Offset = AddVal->getSExtValue(); |
2443 | if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) { |
2444 | SDLoc DL(N); |
2445 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2446 | SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT); |
2447 | SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal); |
2448 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2449 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2450 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2451 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2452 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2453 | return ExtSrc; |
2454 | } |
2455 | } |
2456 | } |
2457 | } |
2458 | |
2459 | // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c) |
2460 | // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c) |
// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2462 | if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2463 | SDValue Src = N.getOperand(i: 0); |
2464 | unsigned SrcOpc = Src.getOpcode(); |
2465 | if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) || |
2466 | CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) && |
2467 | Src.hasOneUse()) { |
2468 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2469 | SDValue AddSrc = Src.getOperand(i: 0); |
2470 | uint64_t Offset = Src.getConstantOperandVal(i: 1); |
2471 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
2472 | SDLoc DL(N); |
2473 | SDValue Res; |
2474 | // If we're also scaling, see if we can use that as well. |
2475 | if (AddSrc.getOpcode() == ISD::SHL && |
2476 | isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) { |
2477 | SDValue ShVal = AddSrc.getOperand(i: 0); |
2478 | uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1); |
2479 | APInt HiBits = |
2480 | APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt); |
2481 | uint64_t ScaleAmt = 1ULL << ShAmt; |
2482 | if ((AM.Scale * ScaleAmt) <= 8 && |
2483 | (AddSrc->getFlags().hasNoUnsignedWrap() || |
2484 | CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) { |
2485 | AM.Scale *= ScaleAmt; |
2486 | SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal); |
2487 | SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal, |
2488 | N2: AddSrc.getOperand(i: 1)); |
2489 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal); |
2490 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift); |
2491 | AddSrc = ExtShift; |
2492 | Res = ExtShVal; |
2493 | } |
2494 | } |
2495 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2496 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
2497 | SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal); |
2498 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2499 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2500 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2501 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2502 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2503 | return Res ? Res : ExtSrc; |
2504 | } |
2505 | } |
2506 | } |
2507 | } |
2508 | |
2509 | // TODO: Handle extensions, shifted masks etc. |
2510 | return N; |
2511 | } |
2512 | |
2513 | bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
2514 | unsigned Depth) { |
2515 | LLVM_DEBUG({ |
2516 | dbgs() << "MatchAddress: " ; |
2517 | AM.dump(CurDAG); |
2518 | }); |
2519 | // Limit recursion. |
2520 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2521 | return matchAddressBase(N, AM); |
2522 | |
2523 | // If this is already a %rip relative address, we can only merge immediates |
2524 | // into it. Instead of handling this in every case, we handle it here. |
2525 | // RIP relative addressing: %rip + 32-bit displacement! |
2526 | if (AM.isRIPRelative()) { |
2527 | // FIXME: JumpTable and ExternalSymbol address currently don't like |
2528 | // displacements. It isn't very important, but this should be fixed for |
2529 | // consistency. |
2530 | if (!(AM.ES || AM.MCSym) && AM.JT != -1) |
2531 | return true; |
2532 | |
2533 | if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N)) |
2534 | if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM)) |
2535 | return false; |
2536 | return true; |
2537 | } |
2538 | |
2539 | switch (N.getOpcode()) { |
2540 | default: break; |
2541 | case ISD::LOCAL_RECOVER: { |
2542 | if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) |
2543 | if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) { |
2544 | // Use the symbol and don't prefix it. |
2545 | AM.MCSym = ESNode->getMCSymbol(); |
2546 | return false; |
2547 | } |
2548 | break; |
2549 | } |
2550 | case ISD::Constant: { |
2551 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2552 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2553 | return false; |
2554 | break; |
2555 | } |
2556 | |
2557 | case X86ISD::Wrapper: |
2558 | case X86ISD::WrapperRIP: |
2559 | if (!matchWrapper(N, AM)) |
2560 | return false; |
2561 | break; |
2562 | |
2563 | case ISD::LOAD: |
2564 | if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM)) |
2565 | return false; |
2566 | break; |
2567 | |
2568 | case ISD::FrameIndex: |
2569 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2570 | AM.Base_Reg.getNode() == nullptr && |
2571 | (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) { |
2572 | AM.BaseType = X86ISelAddressMode::FrameIndexBase; |
2573 | AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex(); |
2574 | return false; |
2575 | } |
2576 | break; |
2577 | |
2578 | case ISD::SHL: |
2579 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2580 | break; |
2581 | |
2582 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) { |
2583 | unsigned Val = CN->getZExtValue(); |
2584 | // Note that we handle x<<1 as (,x,2) rather than (x,x) here so |
2585 | // that the base operand remains free for further matching. If |
2586 | // the base doesn't end up getting used, a post-processing step |
2587 | // in MatchAddress turns (,x,2) into (x,x), which is cheaper. |
2588 | if (Val == 1 || Val == 2 || Val == 3) { |
2589 | SDValue ShVal = N.getOperand(i: 0); |
2590 | AM.Scale = 1 << Val; |
2591 | AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1); |
2592 | return false; |
2593 | } |
2594 | } |
2595 | break; |
2596 | |
2597 | case ISD::SRL: { |
2598 | // Scale must not be used already. |
2599 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2600 | |
2601 | // We only handle up to 64-bit values here as those are what matter for |
2602 | // addressing mode optimizations. |
2603 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2604 | "Unexpected value size!" ); |
2605 | |
2606 | SDValue And = N.getOperand(i: 0); |
2607 | if (And.getOpcode() != ISD::AND) break; |
2608 | SDValue X = And.getOperand(i: 0); |
2609 | |
2610 | // The mask used for the transform is expected to be post-shift, but we |
2611 | // found the shift first so just apply the shift to the mask before passing |
2612 | // it down. |
2613 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) || |
2614 | !isa<ConstantSDNode>(Val: And.getOperand(i: 1))) |
2615 | break; |
2616 | uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1); |
2617 | |
2618 | // Try to fold the mask and shift into the scale, and return false if we |
2619 | // succeed. |
2620 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM)) |
2621 | return false; |
2622 | break; |
2623 | } |
2624 | |
2625 | case ISD::SMUL_LOHI: |
2626 | case ISD::UMUL_LOHI: |
2627 | // A mul_lohi where we need the low part can be folded as a plain multiply. |
2628 | if (N.getResNo() != 0) break; |
2629 | [[fallthrough]]; |
2630 | case ISD::MUL: |
2631 | case X86ISD::MUL_IMM: |
2632 | // X*[3,5,9] -> X+X*[2,4,8] |
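// e.g. (illustrative) X*5 becomes Base = X, Index = X, Scale = 4, which
// matches lea (%reg,%reg,4).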
2633 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2634 | AM.Base_Reg.getNode() == nullptr && |
2635 | AM.IndexReg.getNode() == nullptr) { |
2636 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2637 | if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || |
2638 | CN->getZExtValue() == 9) { |
2639 | AM.Scale = unsigned(CN->getZExtValue())-1; |
2640 | |
2641 | SDValue MulVal = N.getOperand(i: 0); |
2642 | SDValue Reg; |
2643 | |
2644 | // Okay, we know that we have a scale by now. However, if the scaled |
2645 | // value is an add of something and a constant, we can fold the |
2646 | // constant into the disp field here. |
2647 | if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && |
2648 | isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) { |
2649 | Reg = MulVal.getOperand(i: 0); |
2650 | auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1)); |
2651 | uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); |
2652 | if (foldOffsetIntoAddress(Offset: Disp, AM)) |
2653 | Reg = N.getOperand(i: 0); |
2654 | } else { |
2655 | Reg = N.getOperand(i: 0); |
2656 | } |
2657 | |
2658 | AM.IndexReg = AM.Base_Reg = Reg; |
2659 | return false; |
2660 | } |
2661 | } |
2662 | break; |
2663 | |
2664 | case ISD::SUB: { |
// Given A-B, if A can be completely folded into the address, with the
// index field left unused, use -B as the index.
// This is a win if A has multiple parts that can be folded into
2668 | // the address. Also, this saves a mov if the base register has |
2669 | // other uses, since it avoids a two-address sub instruction, however |
2670 | // it costs an additional mov if the index register has other uses. |
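// For example, for (sub (add %g, 16), %i) the LHS can fold as Base = %g
// with Disp = 16, and %i is then negated into the index, provided the cost
// heuristic below considers it a win.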
2671 | |
2672 | // Add an artificial use to this node so that we can keep track of |
2673 | // it if it gets CSE'd with a different node. |
2674 | HandleSDNode Handle(N); |
2675 | |
2676 | // Test if the LHS of the sub can be folded. |
2677 | X86ISelAddressMode Backup = AM; |
2678 | if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) { |
2679 | N = Handle.getValue(); |
2680 | AM = Backup; |
2681 | break; |
2682 | } |
2683 | N = Handle.getValue(); |
2684 | // Test if the index field is free for use. |
2685 | if (AM.IndexReg.getNode() || AM.isRIPRelative()) { |
2686 | AM = Backup; |
2687 | break; |
2688 | } |
2689 | |
2690 | int Cost = 0; |
2691 | SDValue RHS = N.getOperand(i: 1); |
2692 | // If the RHS involves a register with multiple uses, this |
2693 | // transformation incurs an extra mov, due to the neg instruction |
2694 | // clobbering its operand. |
2695 | if (!RHS.getNode()->hasOneUse() || |
2696 | RHS.getNode()->getOpcode() == ISD::CopyFromReg || |
2697 | RHS.getNode()->getOpcode() == ISD::TRUNCATE || |
2698 | RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || |
2699 | (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && |
2700 | RHS.getOperand(i: 0).getValueType() == MVT::i32)) |
2701 | ++Cost; |
2702 | // If the base is a register with multiple uses, this |
2703 | // transformation may save a mov. |
2704 | if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && |
2705 | !AM.Base_Reg.getNode()->hasOneUse()) || |
2706 | AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
2707 | --Cost; |
2708 | // If the folded LHS was interesting, this transformation saves |
2709 | // address arithmetic. |
2710 | if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + |
2711 | ((AM.Disp != 0) && (Backup.Disp == 0)) + |
2712 | (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) |
2713 | --Cost; |
2714 | // If it doesn't look like it may be an overall win, don't do it. |
2715 | if (Cost >= 0) { |
2716 | AM = Backup; |
2717 | break; |
2718 | } |
2719 | |
2720 | // Ok, the transformation is legal and appears profitable. Go for it. |
2721 | // Negation will be emitted later to avoid creating dangling nodes if this |
2722 | // was an unprofitable LEA. |
2723 | AM.IndexReg = RHS; |
2724 | AM.NegateIndex = true; |
2725 | AM.Scale = 1; |
2726 | return false; |
2727 | } |
2728 | |
2729 | case ISD::OR: |
2730 | case ISD::XOR: |
2731 | // See if we can treat the OR/XOR node as an ADD node. |
2732 | if (!CurDAG->isADDLike(Op: N)) |
2733 | break; |
2734 | [[fallthrough]]; |
2735 | case ISD::ADD: |
2736 | if (!matchAdd(N, AM, Depth)) |
2737 | return false; |
2738 | break; |
2739 | |
2740 | case ISD::AND: { |
2741 | // Perform some heroic transforms on an and of a constant-count shift |
2742 | // with a constant to enable use of the scaled offset field. |
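// For instance, (and (srl %x, 1), 0x6) is equal to
// (shl (and (srl %x, 2), 0x3), 1), so the outer shift can be absorbed into
// the address as Scale = 2.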
2743 | |
2744 | // Scale must not be used already. |
2745 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2746 | |
2747 | // We only handle up to 64-bit values here as those are what matter for |
2748 | // addressing mode optimizations. |
2749 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2750 | "Unexpected value size!" ); |
2751 | |
2752 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2753 | break; |
2754 | |
2755 | if (N.getOperand(i: 0).getOpcode() == ISD::SRL) { |
2756 | SDValue Shift = N.getOperand(i: 0); |
2757 | SDValue X = Shift.getOperand(i: 0); |
2758 | |
2759 | uint64_t Mask = N.getConstantOperandVal(i: 1); |
2760 | |
2761 | // Try to fold the mask and shift into an extract and scale. |
2762 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2763 | return false; |
2764 | |
2765 | // Try to fold the mask and shift directly into the scale. |
2766 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2767 | return false; |
2768 | |
2769 | // Try to fold the mask and shift into BEXTR and scale. |
2770 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget)) |
2771 | return false; |
2772 | } |
2773 | |
2774 | // Try to swap the mask and shift to place shifts which can be done as |
2775 | // a scale on the outside of the mask. |
2776 | if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM)) |
2777 | return false; |
2778 | |
2779 | break; |
2780 | } |
2781 | case ISD::ZERO_EXTEND: { |
2782 | // Try to widen a zexted shift left to the same size as its use, so we can |
2783 | // match the shift as a scale factor. |
2784 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2785 | break; |
2786 | |
2787 | SDValue Src = N.getOperand(i: 0); |
2788 | |
2789 | // See if we can match a zext(addlike(x,c)). |
2790 | // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively. |
2791 | if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR) |
2792 | if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1)) |
2793 | if (Index != N) { |
2794 | AM.IndexReg = Index; |
2795 | return false; |
2796 | } |
2797 | |
2798 | // Peek through mask: zext(and(shl(x,c1),c2)) |
2799 | APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits()); |
2800 | if (Src.getOpcode() == ISD::AND && Src.hasOneUse()) |
2801 | if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) { |
2802 | Mask = MaskC->getAPIntValue(); |
2803 | Src = Src.getOperand(i: 0); |
2804 | } |
2805 | |
2806 | if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) { |
2807 | // Give up if the shift is not a valid scale factor [1,2,3]. |
2808 | SDValue ShlSrc = Src.getOperand(i: 0); |
2809 | SDValue ShlAmt = Src.getOperand(i: 1); |
2810 | auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt); |
2811 | if (!ShAmtC) |
2812 | break; |
2813 | unsigned ShAmtV = ShAmtC->getZExtValue(); |
2814 | if (ShAmtV > 3) |
2815 | break; |
2816 | |
2817 | // The narrow shift must only shift out zero bits (it must be 'nuw'). |
2818 | // That makes it safe to widen to the destination type. |
2819 | APInt HighZeros = |
2820 | APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV); |
2821 | if (!Src->getFlags().hasNoUnsignedWrap() && |
2822 | !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask)) |
2823 | break; |
2824 | |
2825 | // zext (shl nuw i8 %x, C1) to i32 |
2826 | // --> shl (zext i8 %x to i32), (zext C1) |
2827 | // zext (and (shl nuw i8 %x, C1), C2) to i32 |
2828 | // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1) |
2829 | MVT SrcVT = ShlSrc.getSimpleValueType(); |
2830 | MVT VT = N.getSimpleValueType(); |
2831 | SDLoc DL(N); |
2832 | |
2833 | SDValue Res = ShlSrc; |
2834 | if (!Mask.isAllOnes()) { |
2835 | Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT); |
2836 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2837 | Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res); |
2838 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2839 | } |
2840 | SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res); |
2841 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext); |
2842 | SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt); |
2843 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl); |
2844 | CurDAG->ReplaceAllUsesWith(From: N, To: NewShl); |
2845 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2846 | |
2847 | // Convert the shift to scale factor. |
2848 | AM.Scale = 1 << ShAmtV; |
// matchIndexRecursively must be called here; otherwise Zext may be
// replaced by other nodes but still be referenced later when calling a
// builder method.
2852 | AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1); |
2853 | return false; |
2854 | } |
2855 | |
2856 | if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) { |
2857 | // Try to fold the mask and shift into an extract and scale. |
2858 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2859 | X: Src.getOperand(i: 0), AM)) |
2860 | return false; |
2861 | |
2862 | // Try to fold the mask and shift directly into the scale. |
2863 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2864 | X: Src.getOperand(i: 0), AM)) |
2865 | return false; |
2866 | |
2867 | // Try to fold the mask and shift into BEXTR and scale. |
2868 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2869 | X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget)) |
2870 | return false; |
2871 | } |
2872 | |
2873 | break; |
2874 | } |
2875 | } |
2876 | |
2877 | return matchAddressBase(N, AM); |
2878 | } |
2879 | |
2880 | /// Helper for MatchAddress. Add the specified node to the |
2881 | /// specified addressing mode without any further recursion. |
2882 | bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { |
2883 | // Is the base register already occupied? |
2884 | if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { |
2885 | // If so, check to see if the scale index register is set. |
2886 | if (!AM.IndexReg.getNode()) { |
2887 | AM.IndexReg = N; |
2888 | AM.Scale = 1; |
2889 | return false; |
2890 | } |
2891 | |
2892 | // Otherwise, we cannot select it. |
2893 | return true; |
2894 | } |
2895 | |
2896 | // Default, generate it as a register. |
2897 | AM.BaseType = X86ISelAddressMode::RegBase; |
2898 | AM.Base_Reg = N; |
2899 | return false; |
2900 | } |
2901 | |
2902 | bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N, |
2903 | X86ISelAddressMode &AM, |
2904 | unsigned Depth) { |
2905 | LLVM_DEBUG({ |
2906 | dbgs() << "MatchVectorAddress: " ; |
2907 | AM.dump(CurDAG); |
2908 | }); |
2909 | // Limit recursion. |
2910 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2911 | return matchAddressBase(N, AM); |
2912 | |
2913 | // TODO: Support other operations. |
2914 | switch (N.getOpcode()) { |
2915 | case ISD::Constant: { |
2916 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2917 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2918 | return false; |
2919 | break; |
2920 | } |
2921 | case X86ISD::Wrapper: |
2922 | if (!matchWrapper(N, AM)) |
2923 | return false; |
2924 | break; |
2925 | case ISD::ADD: { |
2926 | // Add an artificial use to this node so that we can keep track of |
2927 | // it if it gets CSE'd with a different node. |
2928 | HandleSDNode Handle(N); |
2929 | |
2930 | X86ISelAddressMode Backup = AM; |
2931 | if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) && |
2932 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2933 | Depth: Depth + 1)) |
2934 | return false; |
2935 | AM = Backup; |
2936 | |
2937 | // Try again after commuting the operands. |
2938 | if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2939 | Depth: Depth + 1) && |
2940 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, |
2941 | Depth: Depth + 1)) |
2942 | return false; |
2943 | AM = Backup; |
2944 | |
2945 | N = Handle.getValue(); |
2946 | break; |
2947 | } |
2948 | } |
2949 | |
2950 | return matchAddressBase(N, AM); |
2951 | } |
2952 | |
2953 | /// Helper for selectVectorAddr. Handles things that can be folded into a |
2954 | /// gather/scatter address. The index register and scale should have already |
2955 | /// been handled. |
2956 | bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { |
2957 | return matchVectorAddressRecursively(N, AM, Depth: 0); |
2958 | } |
2959 | |
2960 | bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, |
2961 | SDValue IndexOp, SDValue ScaleOp, |
2962 | SDValue &Base, SDValue &Scale, |
2963 | SDValue &Index, SDValue &Disp, |
2964 | SDValue &Segment) { |
2965 | X86ISelAddressMode AM; |
2966 | AM.Scale = ScaleOp->getAsZExtVal(); |
2967 | |
2968 | // Attempt to match index patterns, as long as we're not relying on implicit |
2969 | // sign-extension, which is performed BEFORE scale. |
2970 | if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits()) |
2971 | AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0); |
2972 | else |
2973 | AM.IndexReg = IndexOp; |
2974 | |
2975 | unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); |
2976 | if (AddrSpace == X86AS::GS) |
2977 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
2978 | if (AddrSpace == X86AS::FS) |
2979 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
2980 | if (AddrSpace == X86AS::SS) |
2981 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
2982 | |
2983 | SDLoc DL(BasePtr); |
2984 | MVT VT = BasePtr.getSimpleValueType(); |
2985 | |
2986 | // Try to match into the base and displacement fields. |
2987 | if (matchVectorAddress(N: BasePtr, AM)) |
2988 | return false; |
2989 | |
2990 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
2991 | return true; |
2992 | } |
2993 | |
2994 | /// Returns true if it is able to pattern match an addressing mode. |
2995 | /// It returns the operands which make up the maximal addressing mode it can |
2996 | /// match by reference. |
2997 | /// |
2998 | /// Parent is the parent node of the addr operand that is being matched. It |
2999 | /// is always a load, store, atomic node, or null. It is only null when |
3000 | /// checking memory operands for inline asm nodes. |
3001 | bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
3002 | SDValue &Scale, SDValue &Index, |
3003 | SDValue &Disp, SDValue &Segment) { |
3004 | X86ISelAddressMode AM; |
3005 | |
3006 | if (Parent && |
// These opcodes are all the nodes that have an "addr:$ptr" operand but
// are not MemSDNodes, and thus don't have proper addrspace info.
3009 | Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme |
3010 | Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores |
3011 | Parent->getOpcode() != X86ISD::TLSCALL && // Fixme |
3012 | Parent->getOpcode() != X86ISD::ENQCMD && // Fixme |
3013 | Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme |
3014 | Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp |
3015 | Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp |
3016 | unsigned AddrSpace = |
3017 | cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace(); |
3018 | if (AddrSpace == X86AS::GS) |
3019 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
3020 | if (AddrSpace == X86AS::FS) |
3021 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
3022 | if (AddrSpace == X86AS::SS) |
3023 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
3024 | } |
3025 | |
3026 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
3027 | SDLoc DL(N); |
3028 | MVT VT = N.getSimpleValueType(); |
3029 | |
3030 | if (matchAddress(N, AM)) |
3031 | return false; |
3032 | |
3033 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
3034 | return true; |
3035 | } |
3036 | |
3037 | bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { |
3038 | // Cannot use 32 bit constants to reference objects in kernel/large code |
3039 | // model. |
3040 | if (TM.getCodeModel() == CodeModel::Kernel || |
3041 | TM.getCodeModel() == CodeModel::Large) |
3042 | return false; |
3043 | |
3044 | // In static codegen with small code model, we can get the address of a label |
3045 | // into a register with 'movl' |
3046 | if (N->getOpcode() != X86ISD::Wrapper) |
3047 | return false; |
3048 | |
3049 | N = N.getOperand(i: 0); |
3050 | |
3051 | // At least GNU as does not accept 'movl' for TPOFF relocations. |
3052 | // FIXME: We could use 'movl' when we know we are targeting MC. |
3053 | if (N->getOpcode() == ISD::TargetGlobalTLSAddress) |
3054 | return false; |
3055 | |
3056 | Imm = N; |
3057 | // Small/medium code model can reference non-TargetGlobalAddress objects with |
3058 | // 32 bit constants. |
3059 | if (N->getOpcode() != ISD::TargetGlobalAddress) { |
3060 | return TM.getCodeModel() == CodeModel::Small || |
3061 | TM.getCodeModel() == CodeModel::Medium; |
3062 | } |
3063 | |
3064 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal(); |
3065 | if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) |
3066 | return CR->getUnsignedMax().ult(RHS: 1ull << 32); |
3067 | |
3068 | return !TM.isLargeGlobalValue(GV); |
3069 | } |
3070 | |
3071 | bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale, |
3072 | SDValue &Index, SDValue &Disp, |
3073 | SDValue &Segment) { |
3074 | // Save the debug loc before calling selectLEAAddr, in case it invalidates N. |
3075 | SDLoc DL(N); |
3076 | |
3077 | if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) |
3078 | return false; |
3079 | |
3080 | EVT BaseType = Base.getValueType(); |
3081 | unsigned SubReg; |
3082 | if (BaseType == MVT::i8) |
3083 | SubReg = X86::sub_8bit; |
3084 | else if (BaseType == MVT::i16) |
3085 | SubReg = X86::sub_16bit; |
3086 | else |
3087 | SubReg = X86::sub_32bit; |
3088 | |
3089 | auto *RN = dyn_cast<RegisterSDNode>(Val&: Base); |
3090 | if (RN && RN->getReg() == 0) |
3091 | Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3092 | else if ((BaseType == MVT::i8 || BaseType == MVT::i16 || |
3093 | BaseType == MVT::i32) && |
3094 | !isa<FrameIndexSDNode>(Val: Base)) { |
3095 | // Base could already be %rip, particularly in the x32 ABI. |
3096 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3097 | VT: MVT::i64), 0); |
3098 | Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base); |
3099 | } |
3100 | |
3101 | [[maybe_unused]] EVT IndexType = Index.getValueType(); |
3102 | RN = dyn_cast<RegisterSDNode>(Val&: Index); |
3103 | if (RN && RN->getReg() == 0) |
3104 | Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3105 | else { |
3106 | assert((IndexType == BaseType) && |
3107 | "Expect to be extending 8/16/32-bit registers for use in LEA" ); |
3108 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3109 | VT: MVT::i64), 0); |
3110 | Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index); |
3111 | } |
3112 | |
3113 | return true; |
3114 | } |
3115 | |
3116 | /// Calls SelectAddr and determines if the maximal addressing |
3117 | /// mode it matches can be cost effectively emitted as an LEA instruction. |
3118 | bool X86DAGToDAGISel::selectLEAAddr(SDValue N, |
3119 | SDValue &Base, SDValue &Scale, |
3120 | SDValue &Index, SDValue &Disp, |
3121 | SDValue &Segment) { |
3122 | X86ISelAddressMode AM; |
3123 | |
3124 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
3125 | SDLoc DL(N); |
3126 | MVT VT = N.getSimpleValueType(); |
3127 | |
3128 | // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support |
3129 | // segments. |
3130 | SDValue Copy = AM.Segment; |
3131 | SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32); |
3132 | AM.Segment = T; |
3133 | if (matchAddress(N, AM)) |
3134 | return false; |
assert(T == AM.Segment);
3136 | AM.Segment = Copy; |
3137 | |
3138 | unsigned Complexity = 0; |
3139 | if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) |
3140 | Complexity = 1; |
3141 | else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
3142 | Complexity = 4; |
3143 | |
3144 | if (AM.IndexReg.getNode()) |
3145 | Complexity++; |
3146 | |
3147 | // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with |
3148 | // a simple shift. |
3149 | if (AM.Scale > 1) |
3150 | Complexity++; |
3151 | |
3152 | // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA |
3153 | // to a LEA. This is determined with some experimentation but is by no means |
3154 | // optimal (especially for code size consideration). LEA is nice because of |
3155 | // its three-address nature. Tweak the cost function again when we can run |
3156 | // convertToThreeAddress() at register allocation time. |
3157 | if (AM.hasSymbolicDisplacement()) { |
3158 | // For X86-64, always use LEA to materialize RIP-relative addresses. |
3159 | if (Subtarget->is64Bit()) |
3160 | Complexity = 4; |
3161 | else |
3162 | Complexity += 2; |
3163 | } |
3164 | |
3165 | // Heuristic: try harder to form an LEA from ADD if the operands set flags. |
3166 | // Unlike ADD, LEA does not affect flags, so we will be less likely to require |
3167 | // duplicating flag-producing instructions later in the pipeline. |
3168 | if (N.getOpcode() == ISD::ADD) { |
3169 | auto isMathWithFlags = [](SDValue V) { |
3170 | switch (V.getOpcode()) { |
3171 | case X86ISD::ADD: |
3172 | case X86ISD::SUB: |
3173 | case X86ISD::ADC: |
3174 | case X86ISD::SBB: |
3175 | case X86ISD::SMUL: |
3176 | case X86ISD::UMUL: |
3177 | /* TODO: These opcodes can be added safely, but we may want to justify |
3178 | their inclusion for different reasons (better for reg-alloc). |
3179 | case X86ISD::OR: |
3180 | case X86ISD::XOR: |
3181 | case X86ISD::AND: |
3182 | */ |
3183 | // Value 1 is the flag output of the node - verify it's not dead. |
3184 | return !SDValue(V.getNode(), 1).use_empty(); |
3185 | default: |
3186 | return false; |
3187 | } |
3188 | }; |
3189 | // TODO: We might want to factor in whether there's a load folding |
3190 | // opportunity for the math op that disappears with LEA. |
3191 | if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1))) |
3192 | Complexity++; |
3193 | } |
3194 | |
3195 | if (AM.Disp) |
3196 | Complexity++; |
3197 | |
3198 | // If it isn't worth using an LEA, reject it. |
3199 | if (Complexity <= 2) |
3200 | return false; |
3201 | |
3202 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
3203 | return true; |
3204 | } |
3205 | |
3206 | /// This is only run on TargetGlobalTLSAddress nodes. |
3207 | bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, |
3208 | SDValue &Scale, SDValue &Index, |
3209 | SDValue &Disp, SDValue &Segment) { |
3210 | assert(N.getOpcode() == ISD::TargetGlobalTLSAddress || |
3211 | N.getOpcode() == ISD::TargetExternalSymbol); |
3212 | |
3213 | X86ISelAddressMode AM; |
3214 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) { |
3215 | AM.GV = GA->getGlobal(); |
3216 | AM.Disp += GA->getOffset(); |
3217 | AM.SymbolFlags = GA->getTargetFlags(); |
3218 | } else { |
3219 | auto *SA = cast<ExternalSymbolSDNode>(Val&: N); |
3220 | AM.ES = SA->getSymbol(); |
3221 | AM.SymbolFlags = SA->getTargetFlags(); |
3222 | } |
3223 | |
3224 | if (Subtarget->is32Bit()) { |
3225 | AM.Scale = 1; |
3226 | AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32); |
3227 | } |
3228 | |
3229 | MVT VT = N.getSimpleValueType(); |
3230 | getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment); |
3231 | return true; |
3232 | } |
3233 | |
3234 | bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { |
3235 | // Keep track of the original value type and whether this value was |
3236 | // truncated. If we see a truncation from pointer type to VT that truncates |
3237 | // bits that are known to be zero, we can use a narrow reference. |
3238 | EVT VT = N.getValueType(); |
3239 | bool WasTruncated = false; |
3240 | if (N.getOpcode() == ISD::TRUNCATE) { |
3241 | WasTruncated = true; |
3242 | N = N.getOperand(i: 0); |
3243 | } |
3244 | |
3245 | if (N.getOpcode() != X86ISD::Wrapper) |
3246 | return false; |
3247 | |
3248 | // We can only use non-GlobalValues as immediates if they were not truncated, |
3249 | // as we do not have any range information. If we have a GlobalValue and the |
3250 | // address was not truncated, we can select it as an operand directly. |
3251 | unsigned Opc = N.getOperand(i: 0)->getOpcode(); |
3252 | if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { |
3253 | Op = N.getOperand(i: 0); |
3254 | // We can only select the operand directly if we didn't have to look past a |
3255 | // truncate. |
3256 | return !WasTruncated; |
3257 | } |
3258 | |
3259 | // Check that the global's range fits into VT. |
3260 | auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0)); |
3261 | std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); |
3262 | if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits())) |
3263 | return false; |
3264 | |
3265 | // Okay, we can use a narrow reference. |
3266 | Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT, |
3267 | offset: GA->getOffset(), TargetFlags: GA->getTargetFlags()); |
3268 | return true; |
3269 | } |
3270 | |
3271 | bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
3272 | SDValue &Base, SDValue &Scale, |
3273 | SDValue &Index, SDValue &Disp, |
3274 | SDValue &Segment) { |
3275 | assert(Root && P && "Unknown root/parent nodes" ); |
3276 | if (!ISD::isNON_EXTLoad(N: N.getNode()) || |
3277 | !IsProfitableToFold(N, U: P, Root) || |
3278 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3279 | return false; |
3280 | |
3281 | return selectAddr(Parent: N.getNode(), |
3282 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3283 | } |
3284 | |
3285 | bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
3286 | SDValue &Base, SDValue &Scale, |
3287 | SDValue &Index, SDValue &Disp, |
3288 | SDValue &Segment) { |
3289 | assert(Root && P && "Unknown root/parent nodes" ); |
3290 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || |
3291 | !IsProfitableToFold(N, U: P, Root) || |
3292 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3293 | return false; |
3294 | |
3295 | return selectAddr(Parent: N.getNode(), |
3296 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3297 | } |
3298 | |
3299 | /// Return an SDNode that returns the value of the global base register. |
3300 | /// Output instructions required to initialize the global base register, |
3301 | /// if necessary. |
3302 | SDNode *X86DAGToDAGISel::getGlobalBaseReg() { |
3303 | Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); |
3304 | auto &DL = MF->getDataLayout(); |
3305 | return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode(); |
3306 | } |
3307 | |
3308 | bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { |
3309 | if (N->getOpcode() == ISD::TRUNCATE) |
3310 | N = N->getOperand(Num: 0).getNode(); |
3311 | if (N->getOpcode() != X86ISD::Wrapper) |
3312 | return false; |
3313 | |
3314 | auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0)); |
3315 | if (!GA) |
3316 | return false; |
3317 | |
3318 | auto *GV = GA->getGlobal(); |
3319 | std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange(); |
3320 | if (CR) |
3321 | return CR->getSignedMin().sge(RHS: -1ull << Width) && |
3322 | CR->getSignedMax().slt(RHS: 1ull << Width); |
3323 | // In the kernel code model, globals are in the negative 2GB of the address |
3324 | // space, so globals can be a sign extended 32-bit immediate. |
3325 | // In other code models, small globals are in the low 2GB of the address |
3326 | // space, so sign extending them is equivalent to zero extending them. |
3327 | return Width == 32 && !TM.isLargeGlobalValue(GV); |
3328 | } |
3329 | |
3330 | X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { |
3331 | assert(N->isMachineOpcode() && "Unexpected node" ); |
3332 | unsigned Opc = N->getMachineOpcode(); |
3333 | const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc); |
3334 | int CondNo = X86::getCondSrcNoFromDesc(MCID); |
3335 | if (CondNo < 0) |
3336 | return X86::COND_INVALID; |
3337 | |
3338 | return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo)); |
3339 | } |
3340 | |
3341 | /// Test whether the given X86ISD::CMP node has any users that use a flag |
3342 | /// other than ZF. |
3343 | bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { |
3344 | // Examine each user of the node. |
3345 | for (SDUse &Use : Flags->uses()) { |
3346 | // Only check things that use the flags. |
3347 | if (Use.getResNo() != Flags.getResNo()) |
3348 | continue; |
3349 | SDNode *User = Use.getUser(); |
3350 | // Only examine CopyToReg uses that copy to EFLAGS. |
3351 | if (User->getOpcode() != ISD::CopyToReg || |
3352 | cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3353 | return false; |
3354 | // Examine each user of the CopyToReg use. |
3355 | for (SDUse &FlagUse : User->uses()) { |
3356 | // Only examine the Flag result. |
3357 | if (FlagUse.getResNo() != 1) |
3358 | continue; |
3359 | // Anything unusual: assume conservatively. |
3360 | if (!FlagUse.getUser()->isMachineOpcode()) |
3361 | return false; |
3362 | // Examine the condition code of the user. |
3363 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
3364 | |
3365 | switch (CC) { |
3366 | // Comparisons which only use the zero flag. |
3367 | case X86::COND_E: case X86::COND_NE: |
3368 | continue; |
3369 | // Anything else: assume conservatively. |
3370 | default: |
3371 | return false; |
3372 | } |
3373 | } |
3374 | } |
3375 | return true; |
3376 | } |
3377 | |
3378 | /// Test whether the given X86ISD::CMP node has any uses which require the SF |
3379 | /// flag to be accurate. |
3380 | bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { |
3381 | // Examine each user of the node. |
3382 | for (SDUse &Use : Flags->uses()) { |
3383 | // Only check things that use the flags. |
3384 | if (Use.getResNo() != Flags.getResNo()) |
3385 | continue; |
3386 | SDNode *User = Use.getUser(); |
3387 | // Only examine CopyToReg uses that copy to EFLAGS. |
3388 | if (User->getOpcode() != ISD::CopyToReg || |
3389 | cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3390 | return false; |
3391 | // Examine each user of the CopyToReg use. |
3392 | for (SDUse &FlagUse : User->uses()) { |
3393 | // Only examine the Flag result. |
3394 | if (FlagUse.getResNo() != 1) |
3395 | continue; |
3396 | // Anything unusual: assume conservatively. |
3397 | if (!FlagUse.getUser()->isMachineOpcode()) |
3398 | return false; |
3399 | // Examine the condition code of the user. |
3400 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
3401 | |
3402 | switch (CC) { |
3403 | // Comparisons which don't examine the SF flag. |
3404 | case X86::COND_A: case X86::COND_AE: |
3405 | case X86::COND_B: case X86::COND_BE: |
3406 | case X86::COND_E: case X86::COND_NE: |
3407 | case X86::COND_O: case X86::COND_NO: |
3408 | case X86::COND_P: case X86::COND_NP: |
3409 | continue; |
3410 | // Anything else: assume conservatively. |
3411 | default: |
3412 | return false; |
3413 | } |
3414 | } |
3415 | } |
3416 | return true; |
3417 | } |
3418 | |
3419 | static bool mayUseCarryFlag(X86::CondCode CC) { |
3420 | switch (CC) { |
3421 | // Comparisons which don't examine the CF flag. |
3422 | case X86::COND_O: case X86::COND_NO: |
3423 | case X86::COND_E: case X86::COND_NE: |
3424 | case X86::COND_S: case X86::COND_NS: |
3425 | case X86::COND_P: case X86::COND_NP: |
3426 | case X86::COND_L: case X86::COND_GE: |
3427 | case X86::COND_G: case X86::COND_LE: |
3428 | return false; |
3429 | // Anything else: assume conservatively. |
3430 | default: |
3431 | return true; |
3432 | } |
3433 | } |
3434 | |
3435 | /// Test whether the given node which sets flags has any uses which require the |
3436 | /// CF flag to be accurate. |
3437 | bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { |
3438 | // Examine each user of the node. |
3439 | for (SDUse &Use : Flags->uses()) { |
3440 | // Only check things that use the flags. |
3441 | if (Use.getResNo() != Flags.getResNo()) |
3442 | continue; |
3443 | |
3444 | SDNode *User = Use.getUser(); |
3445 | unsigned UserOpc = User->getOpcode(); |
3446 | |
3447 | if (UserOpc == ISD::CopyToReg) { |
3448 | // Only examine CopyToReg uses that copy to EFLAGS. |
3449 | if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3450 | return false; |
3451 | // Examine each user of the CopyToReg use. |
3452 | for (SDUse &FlagUse : User->uses()) { |
3453 | // Only examine the Flag result. |
3454 | if (FlagUse.getResNo() != 1) |
3455 | continue; |
3456 | // Anything unusual: assume conservatively. |
3457 | if (!FlagUse.getUser()->isMachineOpcode()) |
3458 | return false; |
3459 | // Examine the condition code of the user. |
3460 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
3461 | |
3462 | if (mayUseCarryFlag(CC)) |
3463 | return false; |
3464 | } |
3465 | |
3466 | // This CopyToReg is ok. Move on to the next user. |
3467 | continue; |
3468 | } |
3469 | |
3470 | // This might be an unselected node. So look for the pre-isel opcodes that |
3471 | // use flags. |
3472 | unsigned CCOpNo; |
3473 | switch (UserOpc) { |
3474 | default: |
3475 | // Something unusual. Be conservative. |
3476 | return false; |
3477 | case X86ISD::SETCC: CCOpNo = 0; break; |
3478 | case X86ISD::SETCC_CARRY: CCOpNo = 0; break; |
3479 | case X86ISD::CMOV: CCOpNo = 2; break; |
3480 | case X86ISD::BRCOND: CCOpNo = 2; break; |
3481 | } |
3482 | |
3483 | X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo); |
3484 | if (mayUseCarryFlag(CC)) |
3485 | return false; |
3486 | } |
3487 | return true; |
3488 | } |
3489 | |
3490 | /// Check whether or not the chain ending in StoreNode is suitable for doing |
3491 | /// the {load; op; store} to modify transformation. |
3492 | static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, |
3493 | SDValue StoredVal, SelectionDAG *CurDAG, |
3494 | unsigned LoadOpNo, |
3495 | LoadSDNode *&LoadNode, |
3496 | SDValue &InputChain) { |
3497 | // Is the stored value result 0 of the operation? |
3498 | if (StoredVal.getResNo() != 0) return false; |
3499 | |
3500 | // Are there other uses of the operation other than the store? |
3501 | if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false; |
3502 | |
3503 | // Is the store non-extending and non-indexed? |
3504 | if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal()) |
3505 | return false; |
3506 | |
3507 | SDValue Load = StoredVal->getOperand(Num: LoadOpNo); |
3508 | // Is the stored value a non-extending and non-indexed load? |
3509 | if (!ISD::isNormalLoad(N: Load.getNode())) return false; |
3510 | |
3511 | // Return LoadNode by reference. |
3512 | LoadNode = cast<LoadSDNode>(Val&: Load); |
3513 | |
// Is the store the only read of the loaded value?
3515 | if (!Load.hasOneUse()) |
3516 | return false; |
3517 | |
3518 | // Is the address of the store the same as the load? |
3519 | if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || |
3520 | LoadNode->getOffset() != StoreNode->getOffset()) |
3521 | return false; |
3522 | |
3523 | bool FoundLoad = false; |
3524 | SmallVector<SDValue, 4> ChainOps; |
3525 | SmallVector<const SDNode *, 4> LoopWorklist; |
3526 | SmallPtrSet<const SDNode *, 16> Visited; |
3527 | const unsigned int Max = 1024; |
3528 | |
3529 | // Visualization of Load-Op-Store fusion: |
3530 | // ------------------------- |
3531 | // Legend: |
3532 | // *-lines = Chain operand dependencies. |
3533 | // |-lines = Normal operand dependencies. |
3534 | // Dependencies flow down and right. n-suffix references multiple nodes. |
3535 | // |
3536 | // C Xn C |
3537 | // * * * |
3538 | // * * * |
3539 | // Xn A-LD Yn TF Yn |
3540 | // * * \ | * | |
3541 | // * * \ | * | |
3542 | // * * \ | => A--LD_OP_ST |
3543 | // * * \| \ |
3544 | // TF OP \ |
3545 | // * | \ Zn |
3546 | // * | \ |
3547 | // A-ST Zn |
3548 | // |
3549 | |
3550 | // This merge induced dependences from: #1: Xn -> LD, OP, Zn |
3551 | // #2: Yn -> LD |
3552 | // #3: ST -> Zn |
3553 | |
3554 | // Ensure the transform is safe by checking for the dual |
3555 | // dependencies to make sure we do not induce a loop. |
3556 | |
3557 | // As LD is a predecessor to both OP and ST we can do this by checking: |
3558 | // a). if LD is a predecessor to a member of Xn or Yn. |
3559 | // b). if a Zn is a predecessor to ST. |
3560 | |
3561 | // However, (b) can only occur through being a chain predecessor to |
3562 | // ST, which is the same as Zn being a member or predecessor of Xn, |
3563 | // which is a subset of LD being a predecessor of Xn. So it's |
3564 | // subsumed by check (a). |
3565 | |
3566 | SDValue Chain = StoreNode->getChain(); |
3567 | |
3568 | // Gather X elements in ChainOps. |
3569 | if (Chain == Load.getValue(R: 1)) { |
3570 | FoundLoad = true; |
3571 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3572 | } else if (Chain.getOpcode() == ISD::TokenFactor) { |
3573 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { |
3574 | SDValue Op = Chain.getOperand(i); |
3575 | if (Op == Load.getValue(R: 1)) { |
3576 | FoundLoad = true; |
3577 | // Drop Load, but keep its chain. No cycle check necessary. |
3578 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3579 | continue; |
3580 | } |
3581 | LoopWorklist.push_back(Elt: Op.getNode()); |
3582 | ChainOps.push_back(Elt: Op); |
3583 | } |
3584 | } |
3585 | |
3586 | if (!FoundLoad) |
3587 | return false; |
3588 | |
3589 | // Worklist is currently Xn. Add Yn to worklist. |
3590 | for (SDValue Op : StoredVal->ops()) |
3591 | if (Op.getNode() != LoadNode) |
3592 | LoopWorklist.push_back(Elt: Op.getNode()); |
3593 | |
3594 | // Check (a) if Load is a predecessor to Xn + Yn |
3595 | if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max, |
3596 | TopologicalPrune: true)) |
3597 | return false; |
3598 | |
3599 | InputChain = |
3600 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps); |
3601 | return true; |
3602 | } |
3603 | |
3604 | // Change a chain of {load; op; store} of the same value into a simple op |
3605 | // through memory of that value, if the uses of the modified value and its |
3606 | // address are suitable. |
3607 | // |
// The tablegen memory operand pattern is currently not able to match the
// case where the EFLAGS on the original operation are used.
3610 | // |
3611 | // To move this to tablegen, we'll need to improve tablegen to allow flags to |
3612 | // be transferred from a node in the pattern to the result node, probably with |
3613 | // a new keyword. For example, we have this |
3614 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3615 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>; |
3616 | // but maybe need something like this |
3617 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3618 | // [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst), |
3619 | // (transferrable EFLAGS)]>; |
3620 | // |
3621 | // Until then, we manually fold these and instruction select the operation |
3622 | // here. |
3623 | bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { |
3624 | auto *StoreNode = cast<StoreSDNode>(Val: Node); |
3625 | SDValue StoredVal = StoreNode->getOperand(Num: 1); |
3626 | unsigned Opc = StoredVal->getOpcode(); |
3627 | |
3628 | // Before we try to select anything, make sure this is memory operand size |
3629 | // and opcode we can handle. Note that this must match the code below that |
3630 | // actually lowers the opcodes. |
3631 | EVT MemVT = StoreNode->getMemoryVT(); |
3632 | if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && |
3633 | MemVT != MVT::i8) |
3634 | return false; |
3635 | |
3636 | bool IsCommutable = false; |
3637 | bool IsNegate = false; |
3638 | switch (Opc) { |
3639 | default: |
3640 | return false; |
3641 | case X86ISD::SUB: |
3642 | IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0)); |
3643 | break; |
3644 | case X86ISD::SBB: |
3645 | break; |
3646 | case X86ISD::ADD: |
3647 | case X86ISD::ADC: |
3648 | case X86ISD::AND: |
3649 | case X86ISD::OR: |
3650 | case X86ISD::XOR: |
3651 | IsCommutable = true; |
3652 | break; |
3653 | } |
3654 | |
3655 | unsigned LoadOpNo = IsNegate ? 1 : 0; |
3656 | LoadSDNode *LoadNode = nullptr; |
3657 | SDValue InputChain; |
3658 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3659 | LoadNode, InputChain)) { |
3660 | if (!IsCommutable) |
3661 | return false; |
3662 | |
3663 | // This operation is commutable, try the other operand. |
3664 | LoadOpNo = 1; |
3665 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3666 | LoadNode, InputChain)) |
3667 | return false; |
3668 | } |
3669 | |
3670 | SDValue Base, Scale, Index, Disp, Segment; |
3671 | if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp, |
3672 | Segment)) |
3673 | return false; |
3674 | |
3675 | auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, |
3676 | unsigned Opc8) { |
3677 | switch (MemVT.getSimpleVT().SimpleTy) { |
3678 | case MVT::i64: |
3679 | return Opc64; |
3680 | case MVT::i32: |
3681 | return Opc32; |
3682 | case MVT::i16: |
3683 | return Opc16; |
3684 | case MVT::i8: |
3685 | return Opc8; |
3686 | default: |
3687 | llvm_unreachable("Invalid size!" ); |
3688 | } |
3689 | }; |
3690 | |
3691 | MachineSDNode *Result; |
3692 | switch (Opc) { |
3693 | case X86ISD::SUB: |
3694 | // Handle negate. |
3695 | if (IsNegate) { |
3696 | unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, |
3697 | X86::NEG8m); |
3698 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3699 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3700 | VT2: MVT::Other, Ops); |
3701 | break; |
3702 | } |
3703 | [[fallthrough]]; |
3704 | case X86ISD::ADD: |
3705 | // Try to match inc/dec. |
3706 | if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { |
3707 | bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1)); |
3708 | bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1)); |
// An ADD/SUB by 1/-1 whose carry flag isn't used can be selected as inc/dec.
3710 | if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3711 | unsigned NewOpc = |
3712 | ((Opc == X86ISD::ADD) == IsOne) |
3713 | ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) |
3714 | : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); |
3715 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3716 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3717 | VT2: MVT::Other, Ops); |
3718 | break; |
3719 | } |
3720 | } |
3721 | [[fallthrough]]; |
3722 | case X86ISD::ADC: |
3723 | case X86ISD::SBB: |
3724 | case X86ISD::AND: |
3725 | case X86ISD::OR: |
3726 | case X86ISD::XOR: { |
3727 | auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { |
3728 | switch (Opc) { |
3729 | case X86ISD::ADD: |
3730 | return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, |
3731 | X86::ADD8mr); |
3732 | case X86ISD::ADC: |
3733 | return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, |
3734 | X86::ADC8mr); |
3735 | case X86ISD::SUB: |
3736 | return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, |
3737 | X86::SUB8mr); |
3738 | case X86ISD::SBB: |
3739 | return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, |
3740 | X86::SBB8mr); |
3741 | case X86ISD::AND: |
3742 | return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, |
3743 | X86::AND8mr); |
3744 | case X86ISD::OR: |
3745 | return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); |
3746 | case X86ISD::XOR: |
3747 | return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, |
3748 | X86::XOR8mr); |
3749 | default: |
3750 | llvm_unreachable("Invalid opcode!" ); |
3751 | } |
3752 | }; |
3753 | auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { |
3754 | switch (Opc) { |
3755 | case X86ISD::ADD: |
3756 | return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, |
3757 | X86::ADD8mi); |
3758 | case X86ISD::ADC: |
3759 | return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, |
3760 | X86::ADC8mi); |
3761 | case X86ISD::SUB: |
3762 | return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, |
3763 | X86::SUB8mi); |
3764 | case X86ISD::SBB: |
3765 | return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, |
3766 | X86::SBB8mi); |
3767 | case X86ISD::AND: |
3768 | return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, |
3769 | X86::AND8mi); |
3770 | case X86ISD::OR: |
3771 | return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, |
3772 | X86::OR8mi); |
3773 | case X86ISD::XOR: |
3774 | return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, |
3775 | X86::XOR8mi); |
3776 | default: |
3777 | llvm_unreachable("Invalid opcode!" ); |
3778 | } |
3779 | }; |
3780 | |
3781 | unsigned NewOpc = SelectRegOpcode(Opc); |
3782 | SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo); |
3783 | |
3784 | // See if the operand is a constant that we can fold into an immediate |
3785 | // operand. |
3786 | if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) { |
3787 | int64_t OperandV = OperandC->getSExtValue(); |
3788 | |
3789 | // Check if we can shrink the operand enough to fit in an immediate (or |
3790 | // fit into a smaller immediate) by negating it and switching the |
3791 | // operation. |
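// For example, on a 32-bit memory operand 'add $128' needs a 32-bit
// immediate (128 is not a signed 8-bit value), while the equivalent
// 'sub $-128' fits in a sign-extended 8-bit immediate.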
3792 | if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && |
3793 | ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) || |
3794 | (MemVT == MVT::i64 && !isInt<32>(x: OperandV) && |
3795 | isInt<32>(x: -OperandV))) && |
3796 | hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3797 | OperandV = -OperandV; |
3798 | Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; |
3799 | } |
3800 | |
3801 | if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) { |
3802 | Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT); |
3803 | NewOpc = SelectImmOpcode(Opc); |
3804 | } |
3805 | } |
3806 | |
3807 | if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { |
3808 | SDValue CopyTo = |
3809 | CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS, |
3810 | N: StoredVal.getOperand(i: 2), Glue: SDValue()); |
3811 | |
3812 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3813 | Segment, Operand, CopyTo, CopyTo.getValue(R: 1)}; |
3814 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3815 | Ops); |
3816 | } else { |
3817 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3818 | Segment, Operand, InputChain}; |
3819 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3820 | Ops); |
3821 | } |
3822 | break; |
3823 | } |
3824 | default: |
3825 | llvm_unreachable("Invalid opcode!" ); |
3826 | } |
3827 | |
3828 | MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), |
3829 | LoadNode->getMemOperand()}; |
3830 | CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps); |
3831 | |
3832 | // Update Load Chain uses as well. |
3833 | ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1)); |
3834 | ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1)); |
3835 | ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0)); |
3836 | CurDAG->RemoveDeadNode(N: Node); |
3837 | return true; |
3838 | } |
3839 | |
3840 | // See if this is an X & Mask that we can match to BEXTR/BZHI. |
3841 | // Where Mask is one of the following patterns: |
3842 | // a) x & (1 << nbits) - 1 |
3843 | // b) x & ~(-1 << nbits) |
3844 | // c) x & (-1 >> (32 - y)) |
3845 | // d) x << (32 - y) >> (32 - y) |
3846 | // e) (1 << nbits) - 1 |
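// For pattern (a), for instance, (and %x, (add (shl 1, %n), -1)) keeps the
// low %n bits of %x; with BMI2 this can select to BZHI, and with only BMI it
// becomes a BEXTR whose control word is (%n << 8) (start bit 0, length %n).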
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3848 | assert( |
3849 | (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND || |
3850 | Node->getOpcode() == ISD::SRL) && |
3851 | "Should be either an and-mask, or right-shift after clearing high bits." ); |
3852 | |
3853 | // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. |
3854 | if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) |
3855 | return false; |
3856 | |
3857 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
3858 | |
3859 | // Only supported for 32 and 64 bits. |
3860 | if (NVT != MVT::i32 && NVT != MVT::i64) |
3861 | return false; |
3862 | |
3863 | SDValue NBits; |
3864 | bool NegateNBits; |
3865 | |
// If we have BMI2's BZHI, we are ok with multi-use patterns.
// Else, if we only have BMI1's BEXTR, we require one-use.
const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3869 | auto checkUses = [AllowExtraUsesByDefault]( |
3870 | SDValue Op, unsigned NUses, |
std::optional<bool> AllowExtraUses) {
3872 | return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) || |
3873 | Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo()); |
3874 | }; |
3875 | auto checkOneUse = [checkUses](SDValue Op, |
std::optional<bool> AllowExtraUses =
3877 | std::nullopt) { |
3878 | return checkUses(Op, 1, AllowExtraUses); |
3879 | }; |
3880 | auto checkTwoUse = [checkUses](SDValue Op, |
std::optional<bool> AllowExtraUses =
3882 | std::nullopt) { |
3883 | return checkUses(Op, 2, AllowExtraUses); |
3884 | }; |
3885 | |
3886 | auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { |
3887 | if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { |
3888 | assert(V.getSimpleValueType() == MVT::i32 && |
3889 | V.getOperand(0).getSimpleValueType() == MVT::i64 && |
3890 | "Expected i64 -> i32 truncation" ); |
3891 | V = V.getOperand(i: 0); |
3892 | } |
3893 | return V; |
3894 | }; |
3895 | |
3896 | // a) x & ((1 << nbits) + (-1)) |
3897 | auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, |
3898 | &NegateNBits](SDValue Mask) -> bool { |
3899 | // Match `add`. Must only have one use! |
3900 | if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) |
3901 | return false; |
3902 | // We should be adding all-ones constant (i.e. subtracting one.) |
3903 | if (!isAllOnesConstant(V: Mask->getOperand(Num: 1))) |
3904 | return false; |
3905 | // Match `1 << nbits`. Might be truncated. Must only have one use! |
3906 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3907 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3908 | return false; |
3909 | if (!isOneConstant(V: M0->getOperand(Num: 0))) |
3910 | return false; |
3911 | NBits = M0->getOperand(Num: 1); |
3912 | NegateNBits = false; |
3913 | return true; |
3914 | }; |
3915 | |
3916 | auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { |
3917 | V = peekThroughOneUseTruncation(V); |
3918 | return CurDAG->MaskedValueIsAllOnes( |
3919 | Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(), |
3920 | loBitsSet: NVT.getSizeInBits())); |
3921 | }; |
3922 | |
3923 | // b) x & ~(-1 << nbits) |
3924 | auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, |
3925 | &NBits, &NegateNBits](SDValue Mask) -> bool { |
// Match the `not` (an xor with all-ones). Must only have one use!
3927 | if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) |
3928 | return false; |
3929 | // The -1 only has to be all-ones for the final Node's NVT. |
3930 | if (!isAllOnes(Mask->getOperand(Num: 1))) |
3931 | return false; |
3932 | // Match `-1 << nbits`. Might be truncated. Must only have one use! |
3933 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3934 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3935 | return false; |
3936 | // The -1 only has to be all-ones for the final Node's NVT. |
3937 | if (!isAllOnes(M0->getOperand(Num: 0))) |
3938 | return false; |
3939 | NBits = M0->getOperand(Num: 1); |
3940 | NegateNBits = false; |
3941 | return true; |
3942 | }; |
3943 | |
3944 | // Try to match potentially-truncated shift amount as `(bitwidth - y)`, |
3945 | // or leave the shift amount as-is, but then we'll have to negate it. |
3946 | auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, |
3947 | unsigned Bitwidth) { |
3948 | NBits = ShiftAmt; |
3949 | NegateNBits = true; |
3950 | // Skip over a truncate of the shift amount, if any. |
3951 | if (NBits.getOpcode() == ISD::TRUNCATE) |
3952 | NBits = NBits.getOperand(i: 0); |
3953 | // Try to match the shift amount as (bitwidth - y). It should go away, too. |
3954 | // If it doesn't match, that's fine, we'll just negate it ourselves. |
3955 | if (NBits.getOpcode() != ISD::SUB) |
3956 | return; |
3957 | auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0)); |
3958 | if (!V0 || V0->getZExtValue() != Bitwidth) |
3959 | return; |
3960 | NBits = NBits.getOperand(i: 1); |
3961 | NegateNBits = false; |
3962 | }; |
3963 | |
3964 | // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth |
3965 | // or |
3966 | // c) x & (-1 >> (32 - y)) |
3967 | auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, |
3968 | canonicalizeShiftAmt](SDValue Mask) -> bool { |
3969 | // The mask itself may be truncated. |
3970 | Mask = peekThroughOneUseTruncation(Mask); |
3971 | unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); |
3972 | // Match `l>>`. Must only have one use! |
3973 | if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) |
3974 | return false; |
3975 | // We should be shifting truly all-ones constant. |
3976 | if (!isAllOnesConstant(V: Mask.getOperand(i: 0))) |
3977 | return false; |
3978 | SDValue M1 = Mask.getOperand(i: 1); |
3979 | // The shift amount should not be used externally. |
3980 | if (!checkOneUse(M1)) |
3981 | return false; |
3982 | canonicalizeShiftAmt(M1, Bitwidth); |
3983 | // Pattern c. is non-canonical, and is expanded into pattern d. iff there |
3984 | // is no extra use of the mask. Clearly, there was one since we are here. |
3985 | // But at the same time, if we need to negate the shift amount, |
3986 | // then we don't want the mask to stick around, else it's unprofitable. |
3987 | return !NegateNBits; |
3988 | }; |
3989 | |
3990 | SDValue X; |
3991 | |
3992 | // d) x << z >> z but then we'll have to subtract z from bitwidth |
3993 | // or |
3994 | // d) x << (32 - y) >> (32 - y) |
3995 | auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, |
3996 | AllowExtraUsesByDefault, &NegateNBits, |
3997 | &X](SDNode *Node) -> bool { |
3998 | if (Node->getOpcode() != ISD::SRL) |
3999 | return false; |
4000 | SDValue N0 = Node->getOperand(Num: 0); |
4001 | if (N0->getOpcode() != ISD::SHL) |
4002 | return false; |
4003 | unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); |
4004 | SDValue N1 = Node->getOperand(Num: 1); |
4005 | SDValue N01 = N0->getOperand(Num: 1); |
4006 | // Both of the shifts must be by the exact same value. |
4007 | if (N1 != N01) |
4008 | return false; |
4009 | canonicalizeShiftAmt(N1, Bitwidth); |
4010 | // There should not be any external uses of the inner shift / shift amount. |
4011 | // Note that while we are generally okay with external uses given BMI2, |
4012 | // iff we need to negate the shift amount, we are not okay with extra uses. |
const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4014 | if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) |
4015 | return false; |
4016 | X = N0->getOperand(Num: 0); |
4017 | return true; |
4018 | }; |
4019 | |
4020 | auto matchLowBitMask = [matchPatternA, matchPatternB, |
4021 | matchPatternC](SDValue Mask) -> bool { |
4022 | return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); |
4023 | }; |
4024 | |
4025 | if (Node->getOpcode() == ISD::AND) { |
4026 | X = Node->getOperand(Num: 0); |
4027 | SDValue Mask = Node->getOperand(Num: 1); |
4028 | |
4029 | if (matchLowBitMask(Mask)) { |
4030 | // Great. |
4031 | } else { |
4032 | std::swap(a&: X, b&: Mask); |
4033 | if (!matchLowBitMask(Mask)) |
4034 | return false; |
4035 | } |
4036 | } else if (matchLowBitMask(SDValue(Node, 0))) { |
4037 | X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT); |
4038 | } else if (!matchPatternD(Node)) |
4039 | return false; |
4040 | |
4041 | // If we need to negate the shift amount, require BMI2 BZHI support. |
4042 | // It's just too unprofitable for BMI1 BEXTR. |
4043 | if (NegateNBits && !Subtarget->hasBMI2()) |
4044 | return false; |
4045 | |
4046 | SDLoc DL(Node); |
4047 | |
4048 | // Truncate the shift amount. |
4049 | NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits); |
4050 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4051 | |
4052 | // Insert 8-bit NBits into lowest 8 bits of 32-bit register. |
4053 | // All the other bits are undefined, we do not care about them. |
4054 | SDValue ImplDef = SDValue( |
4055 | CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0); |
4056 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef); |
4057 | |
4058 | SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32); |
4059 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal); |
4060 | NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL, |
4061 | VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal), |
4062 | 0); |
4063 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4064 | |
4065 | // We might have matched the amount of high bits to be cleared, |
4066 | // but we want the amount of low bits to be kept, so negate it then. |
4067 | if (NegateNBits) { |
4068 | SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32); |
4069 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC); |
4070 | |
4071 | NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits); |
4072 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4073 | } |
4074 | |
4075 | if (Subtarget->hasBMI2()) { |
4076 | // Great, just emit the BZHI.. |
4077 | if (NVT != MVT::i32) { |
4078 |       // But we have to place the bit count into a wide-enough register first.
4079 | NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits); |
4080 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4081 | } |
4082 | |
4083 |     SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4084 | ReplaceNode(F: Node, T: Extract.getNode()); |
4085 | SelectCode(N: Extract.getNode()); |
4086 | return true; |
4087 | } |
4088 | |
4089 |   // Else, if we do *NOT* have BMI2, let's find out if 'X' is
4090 |   // *logically* shifted (potentially with a one-use trunc in between),
4091 | // and the truncation was the only use of the shift, |
4092 | // and if so look past one-use truncation. |
4093 | { |
4094 | SDValue RealX = peekThroughOneUseTruncation(X); |
4095 | // FIXME: only if the shift is one-use? |
4096 | if (RealX != X && RealX.getOpcode() == ISD::SRL) |
4097 | X = RealX; |
4098 | } |
4099 | |
4100 | MVT XVT = X.getSimpleValueType(); |
4101 | |
4102 | // Else, emitting BEXTR requires one more step. |
4103 | // The 'control' of BEXTR has the pattern of: |
4104 | // [15...8 bit][ 7...0 bit] location |
4105 | // [ bit count][ shift] name |
4106 |   // I.e. 0b00000010'00000001 means  (x >> 0b1) & 0b11
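  // For example, if NBits turns out to be 4 and no shift of 'X' gets folded
  // in below, the control ends up as (4 << 8) == 0x0400, i.e. "extract 4 bits
  // starting at bit 0".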
4107 | |
4108 | // Shift NBits left by 8 bits, thus producing 'control'. |
4109 |   // This makes the low 8 bits zero.
4110 | SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8); |
4111 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8); |
4112 | SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8); |
4113 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4114 | |
4115 | // If the 'X' is *logically* shifted, we can fold that shift into 'control'. |
4116 | // FIXME: only if the shift is one-use? |
4117 | if (X.getOpcode() == ISD::SRL) { |
4118 | SDValue ShiftAmt = X.getOperand(i: 1); |
4119 | X = X.getOperand(i: 0); |
4120 | |
4121 | assert(ShiftAmt.getValueType() == MVT::i8 && |
4122 | "Expected shift amount to be i8" ); |
4123 | |
4124 | // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! |
4125 | // We could zext to i16 in some form, but we intentionally don't do that. |
4126 | SDValue OrigShiftAmt = ShiftAmt; |
4127 | ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt); |
4128 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt); |
4129 | |
4130 | // And now 'or' these low 8 bits of shift amount into the 'control'. |
4131 | Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt); |
4132 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4133 | } |
4134 | |
4135 |   // But we have to place the 'control' into a wide-enough register first.
4136 | if (XVT != MVT::i32) { |
4137 | Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control); |
4138 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4139 | } |
4140 | |
4141 | // And finally, form the BEXTR itself. |
4142 |   SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4143 | |
4144 | // The 'X' was originally truncated. Do that now. |
4145 | if (XVT != NVT) { |
4146 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract); |
4147 | Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract); |
4148 | } |
4149 | |
4150 | ReplaceNode(F: Node, T: Extract.getNode()); |
4151 | SelectCode(N: Extract.getNode()); |
4152 | |
4153 | return true; |
4154 | } |
4155 | |
4156 | // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. |
4157 | MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { |
4158 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
4159 | SDLoc dl(Node); |
4160 | |
4161 | SDValue N0 = Node->getOperand(Num: 0); |
4162 | SDValue N1 = Node->getOperand(Num: 1); |
4163 | |
4164 | // If we have TBM we can use an immediate for the control. If we have BMI |
4165 | // we should only do this if the BEXTR instruction is implemented well. |
4166 | // Otherwise moving the control into a register makes this more costly. |
4167 | // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM |
4168 | // hoisting the move immediate would make it worthwhile with a less optimal |
4169 | // BEXTR? |
4170 | bool PreferBEXTR = |
4171 | Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); |
4172 | if (!PreferBEXTR && !Subtarget->hasBMI2()) |
4173 | return nullptr; |
4174 | |
4175 | // Must have a shift right. |
4176 | if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) |
4177 | return nullptr; |
4178 | |
4179 | // Shift can't have additional users. |
4180 | if (!N0->hasOneUse()) |
4181 | return nullptr; |
4182 | |
4183 | // Only supported for 32 and 64 bits. |
4184 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4185 | return nullptr; |
4186 | |
4187 | // Shift amount and RHS of and must be constant. |
4188 | auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1); |
4189 | auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1)); |
4190 | if (!MaskCst || !ShiftCst) |
4191 | return nullptr; |
4192 | |
4193 | // And RHS must be a mask. |
4194 | uint64_t Mask = MaskCst->getZExtValue(); |
4195 | if (!isMask_64(Value: Mask)) |
4196 | return nullptr; |
4197 | |
4198 | uint64_t Shift = ShiftCst->getZExtValue(); |
4199 | uint64_t MaskSize = llvm::popcount(Value: Mask); |
4200 | |
4201 | // Don't interfere with something that can be handled by extracting AH. |
4202 | // TODO: If we are able to fold a load, BEXTR might still be better than AH. |
4203 | if (Shift == 8 && MaskSize == 8) |
4204 | return nullptr; |
4205 | |
4206 | // Make sure we are only using bits that were in the original value, not |
4207 | // shifted in. |
4208 | if (Shift + MaskSize > NVT.getSizeInBits()) |
4209 | return nullptr; |
4210 | |
4211 | // BZHI, if available, is always fast, unlike BEXTR. But even if we decide |
4212 | // that we can't use BEXTR, it is only worthwhile using BZHI if the mask |
4213 | // does not fit into 32 bits. Load folding is not a sufficient reason. |
4214 | if (!PreferBEXTR && MaskSize <= 32) |
4215 | return nullptr; |
4216 | |
4217 | SDValue Control; |
4218 | unsigned ROpc, MOpc; |
4219 | |
4220 | #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC) |
4221 | if (!PreferBEXTR) { |
4222 | assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then." ); |
4223 | // If we can't make use of BEXTR then we can't fuse shift+mask stages. |
4224 | // Let's perform the mask first, and apply shift later. Note that we need to |
4225 | // widen the mask to account for the fact that we'll apply shift afterwards! |
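    // For example, (x >> 8) & ((1ULL << 40) - 1) gives Shift == 8 and
    // MaskSize == 40, so BZHI keeps the low 48 bits of x and the SHR emitted
    // below then drops the bottom 8 of them.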
4226 | Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT); |
4227 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr) |
4228 | : GET_EGPR_IF_ENABLED(X86::BZHI32rr); |
4229 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm) |
4230 | : GET_EGPR_IF_ENABLED(X86::BZHI32rm); |
4231 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4232 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4233 | } else { |
4234 | // The 'control' of BEXTR has the pattern of: |
4235 | // [15...8 bit][ 7...0 bit] location |
4236 | // [ bit count][ shift] name |
4237 |     // I.e. 0b00000010'00000001 means  (x >> 0b1) & 0b11
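    // For example, (x >> 4) & 0xff gives Shift == 4 and MaskSize == 8, so the
    // control constant is 0x0804.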
4238 | Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT); |
4239 | if (Subtarget->hasTBM()) { |
4240 | ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; |
4241 | MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; |
4242 | } else { |
4243 | assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then." ); |
4244 |       // BMI requires the immediate to be placed in a register.
4245 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr) |
4246 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rr); |
4247 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm) |
4248 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rm); |
4249 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4250 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4251 | } |
4252 | } |
4253 | |
4254 | MachineSDNode *NewNode; |
4255 | SDValue Input = N0->getOperand(Num: 0); |
4256 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4257 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4258 | SDValue Ops[] = { |
4259 | Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)}; |
4260 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
4261 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4262 | // Update the chain. |
4263 | ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2)); |
4264 | // Record the mem-refs |
4265 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()}); |
4266 | } else { |
4267 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control); |
4268 | } |
4269 | |
4270 | if (!PreferBEXTR) { |
4271 | // We still need to apply the shift. |
4272 | SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT); |
4273 | unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri) |
4274 | : GET_ND_IF_ENABLED(X86::SHR32ri); |
4275 | NewNode = |
4276 | CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt); |
4277 | } |
4278 | |
4279 | return NewNode; |
4280 | } |
4281 | |
4282 | // Emit a PCMISTR(I/M) instruction. |
4283 | MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, |
4284 | bool MayFoldLoad, const SDLoc &dl, |
4285 | MVT VT, SDNode *Node) { |
4286 | SDValue N0 = Node->getOperand(Num: 0); |
4287 | SDValue N1 = Node->getOperand(Num: 1); |
4288 | SDValue Imm = Node->getOperand(Num: 2); |
4289 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4290 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4291 | |
4292 | // Try to fold a load. No need to check alignment. |
4293 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4294 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4295 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4296 | N1.getOperand(i: 0) }; |
4297 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other); |
4298 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4299 | // Update the chain. |
4300 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2)); |
4301 | // Record the mem-refs |
4302 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
4303 | return CNode; |
4304 | } |
4305 | |
4306 | SDValue Ops[] = { N0, N1, Imm }; |
4307 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32); |
4308 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4309 | return CNode; |
4310 | } |
4311 | |
4312 | // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need |
4313 | // to emit a second instruction after this one. This is needed since we have two |
4314 | // copyToReg nodes glued before this and we need to continue that glue through. |
4315 | MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, |
4316 | bool MayFoldLoad, const SDLoc &dl, |
4317 | MVT VT, SDNode *Node, |
4318 | SDValue &InGlue) { |
4319 | SDValue N0 = Node->getOperand(Num: 0); |
4320 | SDValue N2 = Node->getOperand(Num: 2); |
4321 | SDValue Imm = Node->getOperand(Num: 4); |
4322 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4323 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4324 | |
4325 | // Try to fold a load. No need to check alignment. |
4326 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4327 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4328 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4329 | N2.getOperand(i: 0), InGlue }; |
4330 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); |
4331 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4332 | InGlue = SDValue(CNode, 3); |
4333 | // Update the chain. |
4334 | ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2)); |
4335 | // Record the mem-refs |
4336 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()}); |
4337 | return CNode; |
4338 | } |
4339 | |
4340 | SDValue Ops[] = { N0, N2, Imm, InGlue }; |
4341 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue); |
4342 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4343 | InGlue = SDValue(CNode, 2); |
4344 | return CNode; |
4345 | } |
4346 | |
4347 | bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { |
4348 | EVT VT = N->getValueType(ResNo: 0); |
4349 | |
4350 | // Only handle scalar shifts. |
4351 | if (VT.isVector()) |
4352 | return false; |
4353 | |
4354 | // Narrower shifts only mask to 5 bits in hardware. |
4355 | unsigned Size = VT == MVT::i64 ? 64 : 32; |
4356 | |
4357 | SDValue OrigShiftAmt = N->getOperand(Num: 1); |
4358 | SDValue ShiftAmt = OrigShiftAmt; |
4359 | SDLoc DL(N); |
4360 | |
4361 | // Skip over a truncate of the shift amount. |
4362 | if (ShiftAmt->getOpcode() == ISD::TRUNCATE) |
4363 | ShiftAmt = ShiftAmt->getOperand(Num: 0); |
4364 | |
4365 | // This function is called after X86DAGToDAGISel::matchBitExtract(), |
4366 | // so we are not afraid that we might mess up BZHI/BEXTR pattern. |
4367 | |
4368 | SDValue NewShiftAmt; |
4369 | if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB || |
4370 | ShiftAmt->getOpcode() == ISD::XOR) { |
4371 | SDValue Add0 = ShiftAmt->getOperand(Num: 0); |
4372 | SDValue Add1 = ShiftAmt->getOperand(Num: 1); |
4373 | auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0); |
4374 | auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1); |
4375 | // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X |
4376 | // to avoid the ADD/SUB/XOR. |
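    // For example, a 64-bit shift by (y + 64) can just shift by y, since the
    // hardware only looks at the low 6 bits of the count anyway.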
4377 | if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) { |
4378 | NewShiftAmt = Add0; |
4379 | |
4380 | } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() && |
4381 | ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) || |
4382 | (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) { |
4383 | // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X |
4384 | // we can replace it with a NOT. In the XOR case it may save some code |
4385 | // size, in the SUB case it also may save a move. |
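      // For example, with Size == 32, both (31 - y) and (31 ^ y) agree with
      // ~y in their low 5 bits, which is all the shift looks at.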
4386 | assert(Add0C == nullptr || Add1C == nullptr); |
4387 | |
4388 | // We can only do N-X, not X-N |
4389 | if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr) |
4390 | return false; |
4391 | |
4392 | EVT OpVT = ShiftAmt.getValueType(); |
4393 | |
4394 | SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT); |
4395 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT, |
4396 | N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes); |
4397 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes); |
4398 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4399 | // If we are shifting by N-X where N == 0 mod Size, then just shift by |
4400 | // -X to generate a NEG instead of a SUB of a constant. |
4401 | } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C && |
4402 | Add0C->getZExtValue() != 0) { |
4403 | EVT SubVT = ShiftAmt.getValueType(); |
4404 | SDValue X; |
4405 | if (Add0C->getZExtValue() % Size == 0) |
4406 | X = Add1; |
4407 | else if (ShiftAmt.hasOneUse() && Size == 64 && |
4408 | Add0C->getZExtValue() % 32 == 0) { |
4409 | // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32). |
4410 | // This is mainly beneficial if we already compute (x+n*32). |
4411 | if (Add1.getOpcode() == ISD::TRUNCATE) { |
4412 | Add1 = Add1.getOperand(i: 0); |
4413 | SubVT = Add1.getValueType(); |
4414 | } |
4415 | if (Add0.getValueType() != SubVT) { |
4416 | Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT); |
4417 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0); |
4418 | } |
4419 | |
4420 | X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0); |
4421 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X); |
4422 | } else |
4423 | return false; |
4424 | // Insert a negate op. |
4425 | // TODO: This isn't guaranteed to replace the sub if there is a logic cone |
4426 | // that uses it that's not a shift. |
4427 | SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT); |
4428 | SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X); |
4429 | NewShiftAmt = Neg; |
4430 | |
4431 | // Insert these operands into a valid topological order so they can |
4432 | // get selected independently. |
4433 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero); |
4434 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg); |
4435 | } else |
4436 | return false; |
4437 | } else |
4438 | return false; |
4439 | |
4440 | if (NewShiftAmt.getValueType() != MVT::i8) { |
4441 | // Need to truncate the shift amount. |
4442 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt); |
4443 | // Add to a correct topological ordering. |
4444 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4445 | } |
4446 | |
4447 | // Insert a new mask to keep the shift amount legal. This should be removed |
4448 | // by isel patterns. |
4449 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt, |
4450 | N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8)); |
4451 | // Place in a correct topological ordering. |
4452 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4453 | |
4454 | SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), |
4455 | Op2: NewShiftAmt); |
4456 | if (UpdatedNode != N) { |
4457 | // If we found an existing node, we should replace ourselves with that node |
4458 | // and wait for it to be selected after its other users. |
4459 | ReplaceNode(F: N, T: UpdatedNode); |
4460 | return true; |
4461 | } |
4462 | |
4463 | // If the original shift amount is now dead, delete it so that we don't run |
4464 | // it through isel. |
4465 | if (OrigShiftAmt.getNode()->use_empty()) |
4466 | CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode()); |
4467 | |
4468 | // Now that we've optimized the shift amount, defer to normal isel to get |
4469 | // load folding and legacy vs BMI2 selection without repeating it here. |
4470 | SelectCode(N); |
4471 | return true; |
4472 | } |
4473 | |
4474 | bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { |
4475 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4476 | unsigned Opcode = N->getOpcode(); |
4477 | SDLoc dl(N); |
4478 | |
4479 | // For operations of the form (x << C1) op C2, check if we can use a smaller |
4480 | // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. |
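  // For example, (x << 8) | 0x1000 can be rewritten as (x | 0x10) << 8,
  // shrinking the OR immediate from 32 bits down to 8.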
4481 | SDValue Shift = N->getOperand(Num: 0); |
4482 | SDValue N1 = N->getOperand(Num: 1); |
4483 | |
4484 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
4485 | if (!Cst) |
4486 | return false; |
4487 | |
4488 | int64_t Val = Cst->getSExtValue(); |
4489 | |
4490 | // If we have an any_extend feeding the AND, look through it to see if there |
4491 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
4492 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
4493 | bool FoundAnyExtend = false; |
4494 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
4495 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
4496 | isUInt<32>(x: Val)) { |
4497 | FoundAnyExtend = true; |
4498 | Shift = Shift.getOperand(i: 0); |
4499 | } |
4500 | |
4501 | if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) |
4502 | return false; |
4503 | |
4504 | // i8 is unshrinkable, i16 should be promoted to i32. |
4505 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4506 | return false; |
4507 | |
4508 | auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)); |
4509 | if (!ShlCst) |
4510 | return false; |
4511 | |
4512 | uint64_t ShAmt = ShlCst->getZExtValue(); |
4513 | |
4514 | // Make sure that we don't change the operation by removing bits. |
4515 | // This only matters for OR and XOR, AND is unaffected. |
4516 | uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; |
4517 | if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) |
4518 | return false; |
4519 | |
4520 | // Check the minimum bitwidth for the new constant. |
4521 | // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. |
4522 | auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { |
4523 | if (Opcode == ISD::AND) { |
4524 | // AND32ri is the same as AND64ri32 with zext imm. |
4525 | // Try this before sign extended immediates below. |
4526 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4527 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4528 | return true; |
4529 | // Also swap order when the AND can become MOVZX. |
4530 | if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) |
4531 | return true; |
4532 | } |
4533 | ShiftedVal = Val >> ShAmt; |
4534 | if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) || |
4535 | (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal))) |
4536 | return true; |
4537 | if (Opcode != ISD::AND) { |
4538 | // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr |
4539 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4540 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4541 | return true; |
4542 | } |
4543 | return false; |
4544 | }; |
4545 | |
4546 | int64_t ShiftedVal; |
4547 | if (!CanShrinkImmediate(ShiftedVal)) |
4548 | return false; |
4549 | |
4550 | // Ok, we can reorder to get a smaller immediate. |
4551 | |
4552 |   // But, it's possible the original immediate allowed an AND to become MOVZX.
4553 |   // Do this check late so the MaskedValueIsZero call happens as late as
4554 |   // possible.
4555 | if (Opcode == ISD::AND) { |
4556 | // Find the smallest zext this could possibly be. |
4557 | unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); |
4558 | ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U)); |
4559 | |
4560 | // Figure out which bits need to be zero to achieve that mask. |
4561 | APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(), |
4562 | loBitsSet: ZExtWidth); |
4563 | NeededMask &= ~Cst->getAPIntValue(); |
4564 | |
4565 | if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask)) |
4566 | return false; |
4567 | } |
4568 | |
4569 | SDValue X = Shift.getOperand(i: 0); |
4570 | if (FoundAnyExtend) { |
4571 | SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X); |
4572 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX); |
4573 | X = NewX; |
4574 | } |
4575 | |
4576 | SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT); |
4577 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst); |
4578 | SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst); |
4579 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp); |
4580 | SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp, |
4581 | N2: Shift.getOperand(i: 1)); |
4582 | ReplaceNode(F: N, T: NewSHL.getNode()); |
4583 | SelectCode(N: NewSHL.getNode()); |
4584 | return true; |
4585 | } |
4586 | |
4587 | bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, |
4588 | SDNode *ParentB, SDNode *ParentC, |
4589 | SDValue A, SDValue B, SDValue C, |
4590 | uint8_t Imm) { |
4591 | assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) && |
4592 | C.isOperandOf(ParentC) && "Incorrect parent node" ); |
4593 | |
4594 | auto tryFoldLoadOrBCast = |
4595 | [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, |
4596 | SDValue &Index, SDValue &Disp, SDValue &Segment) { |
4597 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4598 | return true; |
4599 | |
4600 | // Not a load, check for broadcast which may be behind a bitcast. |
4601 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4602 | P = L.getNode(); |
4603 | L = L.getOperand(i: 0); |
4604 | } |
4605 | |
4606 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4607 | return false; |
4608 | |
4609 | // Only 32 and 64 bit broadcasts are supported. |
4610 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4611 | unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); |
4612 | if (Size != 32 && Size != 64) |
4613 | return false; |
4614 | |
4615 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
4616 | }; |
4617 | |
4618 | bool FoldedLoad = false; |
4619 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4620 | if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { |
4621 | FoldedLoad = true; |
4622 | } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, |
4623 | Tmp4)) { |
4624 | FoldedLoad = true; |
4625 | std::swap(a&: A, b&: C); |
4626 | // Swap bits 1/4 and 3/6. |
4627 | uint8_t OldImm = Imm; |
4628 | Imm = OldImm & 0xa5; |
4629 | if (OldImm & 0x02) Imm |= 0x10; |
4630 | if (OldImm & 0x10) Imm |= 0x02; |
4631 | if (OldImm & 0x08) Imm |= 0x40; |
4632 | if (OldImm & 0x40) Imm |= 0x08; |
4633 | } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3, |
4634 | Tmp4)) { |
4635 | FoldedLoad = true; |
4636 | std::swap(a&: B, b&: C); |
4637 | // Swap bits 1/2 and 5/6. |
4638 | uint8_t OldImm = Imm; |
4639 | Imm = OldImm & 0x99; |
4640 | if (OldImm & 0x02) Imm |= 0x04; |
4641 | if (OldImm & 0x04) Imm |= 0x02; |
4642 | if (OldImm & 0x20) Imm |= 0x40; |
4643 | if (OldImm & 0x40) Imm |= 0x20; |
4644 | } |
4645 | |
4646 | SDLoc DL(Root); |
4647 | |
4648 | SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
4649 | |
4650 | MVT NVT = Root->getSimpleValueType(ResNo: 0); |
4651 | |
4652 | MachineSDNode *MNode; |
4653 | if (FoldedLoad) { |
4654 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
4655 | |
4656 | unsigned Opc; |
4657 | if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { |
4658 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C); |
4659 | unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); |
4660 | assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!" ); |
4661 | |
4662 | bool UseD = EltSize == 32; |
4663 | if (NVT.is128BitVector()) |
4664 | Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; |
4665 | else if (NVT.is256BitVector()) |
4666 | Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; |
4667 | else if (NVT.is512BitVector()) |
4668 | Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; |
4669 | else |
4670 | llvm_unreachable("Unexpected vector size!" ); |
4671 | } else { |
4672 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4673 | if (NVT.is128BitVector()) |
4674 | Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; |
4675 | else if (NVT.is256BitVector()) |
4676 | Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; |
4677 | else if (NVT.is512BitVector()) |
4678 | Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; |
4679 | else |
4680 | llvm_unreachable("Unexpected vector size!" ); |
4681 | } |
4682 | |
4683 | SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)}; |
4684 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops); |
4685 | |
4686 | // Update the chain. |
4687 | ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1)); |
4688 | // Record the mem-refs |
4689 | CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()}); |
4690 | } else { |
4691 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4692 | unsigned Opc; |
4693 | if (NVT.is128BitVector()) |
4694 | Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; |
4695 | else if (NVT.is256BitVector()) |
4696 | Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; |
4697 | else if (NVT.is512BitVector()) |
4698 | Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; |
4699 | else |
4700 | llvm_unreachable("Unexpected vector size!" ); |
4701 | |
4702 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm}); |
4703 | } |
4704 | |
4705 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0)); |
4706 | CurDAG->RemoveDeadNode(N: Root); |
4707 | return true; |
4708 | } |
4709 | |
4710 | // Try to match two logic ops to a VPTERNLOG. |
4711 | // FIXME: Handle more complex patterns that use an operand more than once? |
4712 | bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { |
4713 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4714 | |
4715 | // Make sure we support VPTERNLOG. |
4716 | if (!NVT.isVector() || !Subtarget->hasAVX512() || |
4717 | NVT.getVectorElementType() == MVT::i1) |
4718 | return false; |
4719 | |
4720 | // We need VLX for 128/256-bit. |
4721 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
4722 | return false; |
4723 | |
4724 | SDValue N0 = N->getOperand(Num: 0); |
4725 | SDValue N1 = N->getOperand(Num: 1); |
4726 | |
4727 | auto getFoldableLogicOp = [](SDValue Op) { |
4728 | // Peek through single use bitcast. |
4729 | if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) |
4730 | Op = Op.getOperand(i: 0); |
4731 | |
4732 | if (!Op.hasOneUse()) |
4733 | return SDValue(); |
4734 | |
4735 | unsigned Opc = Op.getOpcode(); |
4736 | if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || |
4737 | Opc == X86ISD::ANDNP) |
4738 | return Op; |
4739 | |
4740 | return SDValue(); |
4741 | }; |
4742 | |
4743 | SDValue A, FoldableOp; |
4744 | if ((FoldableOp = getFoldableLogicOp(N1))) { |
4745 | A = N0; |
4746 | } else if ((FoldableOp = getFoldableLogicOp(N0))) { |
4747 | A = N1; |
4748 | } else |
4749 | return false; |
4750 | |
4751 | SDValue B = FoldableOp.getOperand(i: 0); |
4752 | SDValue C = FoldableOp.getOperand(i: 1); |
4753 | SDNode *ParentA = N; |
4754 | SDNode *ParentB = FoldableOp.getNode(); |
4755 | SDNode *ParentC = FoldableOp.getNode(); |
4756 | |
4757 | // We can build the appropriate control immediate by performing the logic |
4758 | // operation we're matching using these constants for A, B, and C. |
4759 | uint8_t TernlogMagicA = 0xf0; |
4760 | uint8_t TernlogMagicB = 0xcc; |
4761 | uint8_t TernlogMagicC = 0xaa; |
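  // For example, matching A & (B | C) builds the immediate as
  // 0xf0 & (0xcc | 0xaa) == 0xe0.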
4762 | |
4763 | // Some of the inputs may be inverted, peek through them and invert the |
4764 | // magic values accordingly. |
4765 | // TODO: There may be a bitcast before the xor that we should peek through. |
4766 | auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { |
4767 | if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && |
4768 | ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) { |
4769 | Magic = ~Magic; |
4770 | Parent = Op.getNode(); |
4771 | Op = Op.getOperand(i: 0); |
4772 | } |
4773 | }; |
4774 | |
4775 | PeekThroughNot(A, ParentA, TernlogMagicA); |
4776 | PeekThroughNot(B, ParentB, TernlogMagicB); |
4777 | PeekThroughNot(C, ParentC, TernlogMagicC); |
4778 | |
4779 | uint8_t Imm; |
4780 | switch (FoldableOp.getOpcode()) { |
4781 | default: llvm_unreachable("Unexpected opcode!" ); |
4782 | case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; |
4783 | case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; |
4784 | case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; |
4785 | case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; |
4786 | } |
4787 | |
4788 | switch (N->getOpcode()) { |
4789 | default: llvm_unreachable("Unexpected opcode!" ); |
4790 | case X86ISD::ANDNP: |
4791 | if (A == N0) |
4792 | Imm &= ~TernlogMagicA; |
4793 | else |
4794 | Imm = ~(Imm) & TernlogMagicA; |
4795 | break; |
4796 | case ISD::AND: Imm &= TernlogMagicA; break; |
4797 | case ISD::OR: Imm |= TernlogMagicA; break; |
4798 | case ISD::XOR: Imm ^= TernlogMagicA; break; |
4799 | } |
4800 | |
4801 | return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm); |
4802 | } |
4803 | |
4804 | /// If the high bits of an 'and' operand are known zero, try setting the |
4805 | /// high bits of an 'and' constant operand to produce a smaller encoding by |
4806 | /// creating a small, sign-extended negative immediate rather than a large |
4807 | /// positive one. This reverses a transform in SimplifyDemandedBits that |
4808 | /// shrinks mask constants by clearing bits. There is also a possibility that |
4809 | /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that |
4810 | /// case, just replace the 'and'. Return 'true' if the node is replaced. |
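/// For example, masking a value whose bits above bit 7 are known zero with
/// 0x000000F0 can instead use the sign-extended mask -16 (0xFFFFFFF0),
/// shrinking the immediate from four bytes to one.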
4811 | bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { |
4812 | // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't |
4813 | // have immediate operands. |
4814 | MVT VT = And->getSimpleValueType(ResNo: 0); |
4815 | if (VT != MVT::i32 && VT != MVT::i64) |
4816 | return false; |
4817 | |
4818 | auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1)); |
4819 | if (!And1C) |
4820 | return false; |
4821 | |
4822 |   // Bail out if the mask constant is already negative. It can't shrink any more.
4823 | // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel |
4824 | // patterns to use a 32-bit and instead of a 64-bit and by relying on the |
4825 | // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits |
4826 | // are negative too. |
4827 | APInt MaskVal = And1C->getAPIntValue(); |
4828 | unsigned MaskLZ = MaskVal.countl_zero(); |
4829 | if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) |
4830 | return false; |
4831 | |
4832 | // Don't extend into the upper 32 bits of a 64 bit mask. |
4833 | if (VT == MVT::i64 && MaskLZ >= 32) { |
4834 | MaskLZ -= 32; |
4835 | MaskVal = MaskVal.trunc(width: 32); |
4836 | } |
4837 | |
4838 | SDValue And0 = And->getOperand(Num: 0); |
4839 | APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ); |
4840 | APInt NegMaskVal = MaskVal | HighZeros; |
4841 | |
4842 | // If a negative constant would not allow a smaller encoding, there's no need |
4843 | // to continue. Only change the constant when we know it's a win. |
4844 | unsigned MinWidth = NegMaskVal.getSignificantBits(); |
4845 | if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32)) |
4846 | return false; |
4847 | |
4848 | // Extend masks if we truncated above. |
4849 | if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { |
4850 | NegMaskVal = NegMaskVal.zext(width: 64); |
4851 | HighZeros = HighZeros.zext(width: 64); |
4852 | } |
4853 | |
4854 | // The variable operand must be all zeros in the top bits to allow using the |
4855 | // new, negative constant as the mask. |
4856 | // TODO: Handle constant folding? |
4857 | KnownBits Known0 = CurDAG->computeKnownBits(Op: And0); |
4858 | if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero)) |
4859 | return false; |
4860 | |
4861 | // Check if the mask is -1. In that case, this is an unnecessary instruction |
4862 | // that escaped earlier analysis. |
4863 | if (NegMaskVal.isAllOnes()) { |
4864 | ReplaceNode(F: And, T: And0.getNode()); |
4865 | return true; |
4866 | } |
4867 | |
4868 | // A negative mask allows a smaller encoding. Create a new 'and' node. |
4869 | SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT); |
4870 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask); |
4871 | SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask); |
4872 | ReplaceNode(F: And, T: NewAnd.getNode()); |
4873 | SelectCode(N: NewAnd.getNode()); |
4874 | return true; |
4875 | } |
4876 | |
4877 | static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, |
4878 | bool FoldedBCast, bool Masked) { |
4879 | #define VPTESTM_CASE(VT, SUFFIX) \ |
4880 | case MVT::VT: \ |
4881 | if (Masked) \ |
4882 | return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ |
4883 | return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; |
4884 | |
4885 | |
4886 | #define VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4887 | default: llvm_unreachable("Unexpected VT!"); \ |
4888 | VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ |
4889 | VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ |
4890 | VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ |
4891 | VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ |
4892 | VPTESTM_CASE(v16i32, DZ##SUFFIX) \ |
4893 | VPTESTM_CASE(v8i64, QZ##SUFFIX) |
4894 | |
4895 | #define VPTESTM_FULL_CASES(SUFFIX) \ |
4896 | VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4897 | VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ |
4898 | VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ |
4899 | VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ |
4900 | VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ |
4901 | VPTESTM_CASE(v64i8, BZ##SUFFIX) \ |
4902 | VPTESTM_CASE(v32i16, WZ##SUFFIX) |
4903 | |
4904 | if (FoldedBCast) { |
4905 | switch (TestVT.SimpleTy) { |
4906 | VPTESTM_BROADCAST_CASES(rmb) |
4907 | } |
4908 | } |
4909 | |
4910 | if (FoldedLoad) { |
4911 | switch (TestVT.SimpleTy) { |
4912 | VPTESTM_FULL_CASES(rm) |
4913 | } |
4914 | } |
4915 | |
4916 | switch (TestVT.SimpleTy) { |
4917 | VPTESTM_FULL_CASES(rr) |
4918 | } |
4919 | |
4920 | #undef VPTESTM_FULL_CASES |
4921 | #undef VPTESTM_BROADCAST_CASES |
4922 | #undef VPTESTM_CASE |
4923 | } |
4924 | |
4925 | // Try to create VPTESTM instruction. If InMask is not null, it will be used |
4926 | // to form a masked operation. |
4927 | bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, |
4928 | SDValue InMask) { |
4929 | assert(Subtarget->hasAVX512() && "Expected AVX512!" ); |
4930 | assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && |
4931 | "Unexpected VT!" ); |
4932 | |
4933 | // Look for equal and not equal compares. |
4934 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get(); |
4935 | if (CC != ISD::SETEQ && CC != ISD::SETNE) |
4936 | return false; |
4937 | |
4938 | SDValue SetccOp0 = Setcc.getOperand(i: 0); |
4939 | SDValue SetccOp1 = Setcc.getOperand(i: 1); |
4940 | |
4941 | // Canonicalize the all zero vector to the RHS. |
4942 | if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode())) |
4943 | std::swap(a&: SetccOp0, b&: SetccOp1); |
4944 | |
4945 | // See if we're comparing against zero. |
4946 | if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode())) |
4947 | return false; |
4948 | |
4949 | SDValue N0 = SetccOp0; |
4950 | |
4951 | MVT CmpVT = N0.getSimpleValueType(); |
4952 | MVT CmpSVT = CmpVT.getVectorElementType(); |
4953 | |
4954 | // Start with both operands the same. We'll try to refine this. |
4955 | SDValue Src0 = N0; |
4956 | SDValue Src1 = N0; |
4957 | |
4958 | { |
4959 | // Look through single use bitcasts. |
4960 | SDValue N0Temp = N0; |
4961 | if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) |
4962 | N0Temp = N0.getOperand(i: 0); |
4963 | |
4964 | // Look for single use AND. |
4965 | if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { |
4966 | Src0 = N0Temp.getOperand(i: 0); |
4967 | Src1 = N0Temp.getOperand(i: 1); |
4968 | } |
4969 | } |
4970 | |
4971 | // Without VLX we need to widen the operation. |
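  // For example, a v4i32 compare gets widened to v16i32 so the 512-bit
  // VPTESTM form can be used, and the v16i1 result is shrunk back to v4i1
  // afterwards.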
4972 | bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); |
4973 | |
4974 | auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, |
4975 | SDValue &Base, SDValue &Scale, SDValue &Index, |
4976 | SDValue &Disp, SDValue &Segment) { |
4977 | // If we need to widen, we can't fold the load. |
4978 | if (!Widen) |
4979 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4980 | return true; |
4981 | |
4982 | // If we didn't fold a load, try to match broadcast. No widening limitation |
4983 | // for this. But only 32 and 64 bit types are supported. |
4984 | if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) |
4985 | return false; |
4986 | |
4987 | // Look through single use bitcasts. |
4988 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4989 | P = L.getNode(); |
4990 | L = L.getOperand(i: 0); |
4991 | } |
4992 | |
4993 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4994 | return false; |
4995 | |
4996 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4997 | if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) |
4998 | return false; |
4999 | |
5000 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
5001 | }; |
5002 | |
5003 | // We can only fold loads if the sources are unique. |
5004 | bool CanFoldLoads = Src0 != Src1; |
5005 | |
5006 | bool FoldedLoad = false; |
5007 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5008 | if (CanFoldLoads) { |
5009 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, |
5010 | Tmp3, Tmp4); |
5011 | if (!FoldedLoad) { |
5012 | // And is commutative. |
5013 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, |
5014 | Tmp2, Tmp3, Tmp4); |
5015 | if (FoldedLoad) |
5016 | std::swap(a&: Src0, b&: Src1); |
5017 | } |
5018 | } |
5019 | |
5020 | bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; |
5021 | |
5022 | bool IsMasked = InMask.getNode() != nullptr; |
5023 | |
5024 | SDLoc dl(Root); |
5025 | |
5026 | MVT ResVT = Setcc.getSimpleValueType(); |
5027 | MVT MaskVT = ResVT; |
5028 | if (Widen) { |
5029 | // Widen the inputs using insert_subreg or copy_to_regclass. |
5030 | unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; |
5031 | unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; |
5032 | unsigned NumElts = CmpVT.getVectorNumElements() * Scale; |
5033 | CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts); |
5034 | MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts); |
5035 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl, |
5036 | VT: CmpVT), 0); |
5037 | Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0); |
5038 | |
5039 | if (!FoldedBCast) |
5040 | Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1); |
5041 | |
5042 | if (IsMasked) { |
5043 | // Widen the mask. |
5044 | unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID(); |
5045 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
5046 | InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
5047 | dl, VT: MaskVT, Op1: InMask, Op2: RC), 0); |
5048 | } |
5049 | } |
5050 | |
5051 | bool IsTestN = CC == ISD::SETEQ; |
5052 | unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast, |
5053 | Masked: IsMasked); |
5054 | |
5055 | MachineSDNode *CNode; |
5056 | if (FoldedLoad) { |
5057 | SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other); |
5058 | |
5059 | if (IsMasked) { |
5060 | SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
5061 | Src1.getOperand(i: 0) }; |
5062 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5063 | } else { |
5064 | SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
5065 | Src1.getOperand(i: 0) }; |
5066 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5067 | } |
5068 | |
5069 | // Update the chain. |
5070 | ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1)); |
5071 | // Record the mem-refs |
5072 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()}); |
5073 | } else { |
5074 | if (IsMasked) |
5075 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1); |
5076 | else |
5077 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1); |
5078 | } |
5079 | |
5080 | // If we widened, we need to shrink the mask VT. |
5081 | if (Widen) { |
5082 | unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID(); |
5083 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
5084 | CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
5085 | dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC); |
5086 | } |
5087 | |
5088 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0)); |
5089 | CurDAG->RemoveDeadNode(N: Root); |
5090 | return true; |
5091 | } |
5092 | |
5093 | // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it |
5094 | // into vpternlog. |
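// The 0xCA immediate used below encodes exactly that select: with the usual
// ternlog magic values, (0xf0 & 0xcc) | (~0xf0 & 0xaa) == 0xCA.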
5095 | bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { |
5096 | assert(N->getOpcode() == ISD::OR && "Unexpected opcode!" ); |
5097 | |
5098 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
5099 | |
5100 | // Make sure we support VPTERNLOG. |
5101 | if (!NVT.isVector() || !Subtarget->hasAVX512()) |
5102 | return false; |
5103 | |
5104 | // We need VLX for 128/256-bit. |
5105 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
5106 | return false; |
5107 | |
5108 | SDValue N0 = N->getOperand(Num: 0); |
5109 | SDValue N1 = N->getOperand(Num: 1); |
5110 | |
5111 | // Canonicalize AND to LHS. |
5112 | if (N1.getOpcode() == ISD::AND) |
5113 | std::swap(a&: N0, b&: N1); |
5114 | |
5115 | if (N0.getOpcode() != ISD::AND || |
5116 | N1.getOpcode() != X86ISD::ANDNP || |
5117 | !N0.hasOneUse() || !N1.hasOneUse()) |
5118 | return false; |
5119 | |
5120 |   // ANDN is not commutable, so use it to pick out A and C.
5121 | SDValue A = N1.getOperand(i: 0); |
5122 | SDValue C = N1.getOperand(i: 1); |
5123 | |
5124 | // AND is commutable, if one operand matches A, the other operand is B. |
5125 | // Otherwise this isn't a match. |
5126 | SDValue B; |
5127 | if (N0.getOperand(i: 0) == A) |
5128 | B = N0.getOperand(i: 1); |
5129 | else if (N0.getOperand(i: 1) == A) |
5130 | B = N0.getOperand(i: 0); |
5131 | else |
5132 | return false; |
5133 | |
5134 | SDLoc dl(N); |
5135 | SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8); |
5136 | SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm); |
5137 | ReplaceNode(F: N, T: Ternlog.getNode()); |
5138 | |
5139 | return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(), |
5140 | ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA); |
5141 | } |
5142 | |
5143 | void X86DAGToDAGISel::Select(SDNode *Node) { |
5144 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
5145 | unsigned Opcode = Node->getOpcode(); |
5146 | SDLoc dl(Node); |
5147 | |
5148 | if (Node->isMachineOpcode()) { |
5149 | LLVM_DEBUG(dbgs() << "== " ; Node->dump(CurDAG); dbgs() << '\n'); |
5150 | Node->setNodeId(-1); |
5151 | return; // Already selected. |
5152 | } |
5153 | |
5154 | switch (Opcode) { |
5155 | default: break; |
5156 | case ISD::INTRINSIC_W_CHAIN: { |
5157 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5158 | switch (IntNo) { |
5159 | default: break; |
5160 | case Intrinsic::x86_encodekey128: |
5161 | case Intrinsic::x86_encodekey256: { |
5162 | if (!Subtarget->hasKL()) |
5163 | break; |
5164 | |
5165 | unsigned Opcode; |
5166 | switch (IntNo) { |
5167 | default: llvm_unreachable("Impossible intrinsic" ); |
5168 | case Intrinsic::x86_encodekey128: |
5169 | Opcode = X86::ENCODEKEY128; |
5170 | break; |
5171 | case Intrinsic::x86_encodekey256: |
5172 | Opcode = X86::ENCODEKEY256; |
5173 | break; |
5174 | } |
5175 | |
5176 | SDValue Chain = Node->getOperand(Num: 0); |
5177 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3), |
5178 | Glue: SDValue()); |
5179 | if (Opcode == X86::ENCODEKEY256) |
5180 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4), |
5181 | Glue: Chain.getValue(R: 1)); |
5182 | |
5183 | MachineSDNode *Res = CurDAG->getMachineNode( |
5184 | Opcode, dl, VTs: Node->getVTList(), |
5185 | Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)}); |
5186 | ReplaceNode(F: Node, T: Res); |
5187 | return; |
5188 | } |
5189 | case Intrinsic::x86_tileloaddrs64_internal: |
5190 | case Intrinsic::x86_tileloaddrst164_internal: |
5191 | if (!Subtarget->hasAMXMOVRS()) |
5192 | break; |
5193 | [[fallthrough]]; |
5194 | case Intrinsic::x86_tileloadd64_internal: |
5195 | case Intrinsic::x86_tileloaddt164_internal: { |
5196 | if (!Subtarget->hasAMXTILE()) |
5197 | break; |
5198 | auto *MFI = |
5199 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5200 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5201 | unsigned Opc; |
5202 | switch (IntNo) { |
5203 | default: |
5204 | llvm_unreachable("Unexpected intrinsic!" ); |
5205 | case Intrinsic::x86_tileloaddrs64_internal: |
5206 | Opc = X86::PTILELOADDRSV; |
5207 | break; |
5208 | case Intrinsic::x86_tileloaddrst164_internal: |
5209 | Opc = X86::PTILELOADDRST1V; |
5210 | break; |
5211 | case Intrinsic::x86_tileloadd64_internal: |
5212 | Opc = X86::PTILELOADDV; |
5213 | break; |
5214 | case Intrinsic::x86_tileloaddt164_internal: |
5215 | Opc = X86::PTILELOADDT1V; |
5216 | break; |
5217 | } |
5218 | // _tile_loadd_internal(row, col, buf, STRIDE) |
5219 | SDValue Base = Node->getOperand(Num: 4); |
5220 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5221 | SDValue Index = Node->getOperand(Num: 5); |
5222 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5223 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5224 | SDValue Chain = Node->getOperand(Num: 0); |
5225 | MachineSDNode *CNode; |
5226 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5227 | Node->getOperand(Num: 3), |
5228 | Base, |
5229 | Scale, |
5230 | Index, |
5231 | Disp, |
5232 | Segment, |
5233 | Chain}; |
5234 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops); |
5235 | ReplaceNode(F: Node, T: CNode); |
5236 | return; |
5237 | } |
5238 | } |
5239 | break; |
5240 | } |
5241 | case ISD::INTRINSIC_VOID: { |
5242 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5243 | switch (IntNo) { |
5244 | default: break; |
5245 | case Intrinsic::x86_sse3_monitor: |
5246 | case Intrinsic::x86_monitorx: |
5247 | case Intrinsic::x86_clzero: { |
5248 | bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64; |
5249 | |
5250 | unsigned Opc = 0; |
5251 | switch (IntNo) { |
5252 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5253 | case Intrinsic::x86_sse3_monitor: |
5254 | if (!Subtarget->hasSSE3()) |
5255 | break; |
5256 | Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; |
5257 | break; |
5258 | case Intrinsic::x86_monitorx: |
5259 | if (!Subtarget->hasMWAITX()) |
5260 | break; |
5261 | Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; |
5262 | break; |
5263 | case Intrinsic::x86_clzero: |
5264 | if (!Subtarget->hasCLZERO()) |
5265 | break; |
5266 | Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; |
5267 | break; |
5268 | } |
5269 | |
5270 | if (Opc) { |
5271 | unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; |
5272 | SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg, |
5273 | N: Node->getOperand(Num: 2), Glue: SDValue()); |
5274 | SDValue InGlue = Chain.getValue(R: 1); |
5275 | |
5276 | if (IntNo == Intrinsic::x86_sse3_monitor || |
5277 | IntNo == Intrinsic::x86_monitorx) { |
5278 | // Copy the other two operands to ECX and EDX. |
5279 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3), |
5280 | Glue: InGlue); |
5281 | InGlue = Chain.getValue(R: 1); |
5282 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4), |
5283 | Glue: InGlue); |
5284 | InGlue = Chain.getValue(R: 1); |
5285 | } |
5286 | |
5287 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, |
5288 | Ops: { Chain, InGlue}); |
5289 | ReplaceNode(F: Node, T: CNode); |
5290 | return; |
5291 | } |
5292 | |
5293 | break; |
5294 | } |
5295 | case Intrinsic::x86_tilestored64_internal: { |
5296 | auto *MFI = |
5297 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5298 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5299 | unsigned Opc = X86::PTILESTOREDV; |
5300 | // _tile_stored_internal(row, col, buf, STRIDE, c) |
5301 | SDValue Base = Node->getOperand(Num: 4); |
5302 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5303 | SDValue Index = Node->getOperand(Num: 5); |
5304 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5305 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5306 | SDValue Chain = Node->getOperand(Num: 0); |
5307 | MachineSDNode *CNode; |
5308 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5309 | Node->getOperand(Num: 3), |
5310 | Base, |
5311 | Scale, |
5312 | Index, |
5313 | Disp, |
5314 | Segment, |
5315 | Node->getOperand(Num: 6), |
5316 | Chain}; |
5317 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5318 | ReplaceNode(F: Node, T: CNode); |
5319 | return; |
5320 | } |
5321 | case Intrinsic::x86_tileloaddrs64: |
5322 | case Intrinsic::x86_tileloaddrst164: |
5323 | if (!Subtarget->hasAMXMOVRS()) |
5324 | break; |
5325 | [[fallthrough]]; |
5326 | case Intrinsic::x86_tileloadd64: |
5327 | case Intrinsic::x86_tileloaddt164: |
5328 | case Intrinsic::x86_tilestored64: { |
5329 | if (!Subtarget->hasAMXTILE()) |
5330 | break; |
5331 | auto *MFI = |
5332 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5333 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
5334 | unsigned Opc; |
5335 | switch (IntNo) { |
5336 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5337 | case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; |
5338 | case Intrinsic::x86_tileloaddrs64: |
5339 | Opc = X86::PTILELOADDRS; |
5340 | break; |
5341 | case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; |
5342 | case Intrinsic::x86_tileloaddrst164: |
5343 | Opc = X86::PTILELOADDRST1; |
5344 | break; |
5345 | case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; |
5346 | } |
5347 | // FIXME: Match displacement and scale. |
5348 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5349 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5350 | SDValue Base = Node->getOperand(Num: 3); |
5351 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5352 | SDValue Index = Node->getOperand(Num: 4); |
5353 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5354 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5355 | SDValue Chain = Node->getOperand(Num: 0); |
5356 | MachineSDNode *CNode; |
5357 | if (Opc == X86::PTILESTORED) { |
5358 | SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; |
5359 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5360 | } else { |
5361 | SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; |
5362 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5363 | } |
5364 | ReplaceNode(F: Node, T: CNode); |
5365 | return; |
5366 | } |
5367 | case Intrinsic::x86_t2rpntlvwz0rs: |
5368 | case Intrinsic::x86_t2rpntlvwz0rst1: |
5369 | case Intrinsic::x86_t2rpntlvwz1rs: |
5370 | case Intrinsic::x86_t2rpntlvwz1rst1: |
5371 | if (!Subtarget->hasAMXMOVRS()) |
5372 | break; |
5373 | [[fallthrough]]; |
5374 | case Intrinsic::x86_t2rpntlvwz0: |
5375 | case Intrinsic::x86_t2rpntlvwz0t1: |
5376 | case Intrinsic::x86_t2rpntlvwz1: |
5377 | case Intrinsic::x86_t2rpntlvwz1t1: { |
5378 | if (!Subtarget->hasAMXTRANSPOSE()) |
5379 | break; |
5380 | auto *MFI = |
5381 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5382 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
5383 | unsigned Opc; |
5384 | switch (IntNo) { |
5385 | default: |
5386 | llvm_unreachable("Unexpected intrinsic!" ); |
5387 | case Intrinsic::x86_t2rpntlvwz0: |
5388 | Opc = X86::PT2RPNTLVWZ0; |
5389 | break; |
5390 | case Intrinsic::x86_t2rpntlvwz0t1: |
5391 | Opc = X86::PT2RPNTLVWZ0T1; |
5392 | break; |
5393 | case Intrinsic::x86_t2rpntlvwz1: |
5394 | Opc = X86::PT2RPNTLVWZ1; |
5395 | break; |
5396 | case Intrinsic::x86_t2rpntlvwz1t1: |
5397 | Opc = X86::PT2RPNTLVWZ1T1; |
5398 | break; |
5399 | case Intrinsic::x86_t2rpntlvwz0rs: |
5400 | Opc = X86::PT2RPNTLVWZ0RS; |
5401 | break; |
5402 | case Intrinsic::x86_t2rpntlvwz0rst1: |
5403 | Opc = X86::PT2RPNTLVWZ0RST1; |
5404 | break; |
5405 | case Intrinsic::x86_t2rpntlvwz1rs: |
5406 | Opc = X86::PT2RPNTLVWZ1RS; |
5407 | break; |
5408 | case Intrinsic::x86_t2rpntlvwz1rst1: |
5409 | Opc = X86::PT2RPNTLVWZ1RST1; |
5410 | break; |
5411 | } |
5412 | // FIXME: Match displacement and scale. |
5413 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5414 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5415 | SDValue Base = Node->getOperand(Num: 3); |
5416 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5417 | SDValue Index = Node->getOperand(Num: 4); |
5418 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5419 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5420 | SDValue Chain = Node->getOperand(Num: 0); |
5421 | SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; |
5422 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5423 | ReplaceNode(F: Node, T: CNode); |
5424 | return; |
5425 | } |
5426 | } |
5427 | break; |
5428 | } |
5429 | case ISD::BRIND: |
5430 | case X86ISD::NT_BRIND: { |
5431 | if (Subtarget->isTargetNaCl()) |
// NaCl has its own pass where jmp %r32 instructions are converted to jmp
// %r64. We leave the instruction alone.
5434 | break; |
5435 | if (Subtarget->isTarget64BitILP32()) { |
5436 | // Converts a 32-bit register to a 64-bit, zero-extended version of |
5437 | // it. This is needed because x86-64 can do many things, but jmp %r32 |
5438 | // ain't one of them. |
5439 | SDValue Target = Node->getOperand(Num: 1); |
5440 | assert(Target.getValueType() == MVT::i32 && "Unexpected VT!" ); |
5441 | SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64); |
5442 | SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other, |
5443 | N1: Node->getOperand(Num: 0), N2: ZextTarget); |
5444 | ReplaceNode(F: Node, T: Brind.getNode()); |
5445 | SelectCode(N: ZextTarget.getNode()); |
5446 | SelectCode(N: Brind.getNode()); |
5447 | return; |
5448 | } |
5449 | break; |
5450 | } |
5451 | case X86ISD::GlobalBaseReg: |
5452 | ReplaceNode(F: Node, T: getGlobalBaseReg()); |
5453 | return; |
5454 | |
5455 | case ISD::BITCAST: |
5456 | // Just drop all 128/256/512-bit bitcasts. |
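// Same-width vector (and f128) bitcasts are free at the register level, so
// forward the source operand and delete the node.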
5457 | if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || |
5458 | NVT == MVT::f128) { |
5459 | ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0)); |
5460 | CurDAG->RemoveDeadNode(N: Node); |
5461 | return; |
5462 | } |
5463 | break; |
5464 | |
5465 | case ISD::SRL: |
5466 | if (matchBitExtract(Node)) |
5467 | return; |
5468 | [[fallthrough]]; |
5469 | case ISD::SRA: |
5470 | case ISD::SHL: |
5471 | if (tryShiftAmountMod(N: Node)) |
5472 | return; |
5473 | break; |
5474 | |
5475 | case X86ISD::VPTERNLOG: { |
5476 | uint8_t Imm = Node->getConstantOperandVal(Num: 3); |
5477 | if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0), |
5478 | B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm)) |
5479 | return; |
5480 | break; |
5481 | } |
5482 | |
5483 | case X86ISD::ANDNP: |
5484 | if (tryVPTERNLOG(N: Node)) |
5485 | return; |
5486 | break; |
5487 | |
5488 | case ISD::AND: |
5489 | if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { |
5490 | // Try to form a masked VPTESTM. Operands can be in either order. |
5491 | SDValue N0 = Node->getOperand(Num: 0); |
5492 | SDValue N1 = Node->getOperand(Num: 1); |
5493 | if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && |
5494 | tryVPTESTM(Root: Node, Setcc: N0, InMask: N1)) |
5495 | return; |
5496 | if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && |
5497 | tryVPTESTM(Root: Node, Setcc: N1, InMask: N0)) |
5498 | return; |
5499 | } |
5500 | |
5501 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { |
5502 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
5503 | CurDAG->RemoveDeadNode(N: Node); |
5504 | return; |
5505 | } |
5506 | if (matchBitExtract(Node)) |
5507 | return; |
5508 | if (AndImmShrink && shrinkAndImmediate(And: Node)) |
5509 | return; |
5510 | |
5511 | [[fallthrough]]; |
5512 | case ISD::OR: |
5513 | case ISD::XOR: |
5514 | if (tryShrinkShlLogicImm(N: Node)) |
5515 | return; |
5516 | if (Opcode == ISD::OR && tryMatchBitSelect(N: Node)) |
5517 | return; |
5518 | if (tryVPTERNLOG(N: Node)) |
5519 | return; |
5520 | |
5521 | [[fallthrough]]; |
5522 | case ISD::ADD: |
5523 | if (Opcode == ISD::ADD && matchBitExtract(Node)) |
5524 | return; |
5525 | [[fallthrough]]; |
5526 | case ISD::SUB: { |
5527 | // Try to avoid folding immediates with multiple uses for optsize. |
5528 | // This code tries to select to register form directly to avoid going |
5529 | // through the isel table which might fold the immediate. We can't change |
// the add/sub/and/or/xor-with-immediate patterns in the tablegen files
// to check the immediate use count without making the patterns
5532 | // unavailable to the fast-isel table. |
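// E.g. under optsize, if the same 32-bit immediate feeds several nodes,
// materializing it once in a register and using the rr form avoids
// re-encoding the 4-byte immediate in every instruction.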
5533 | if (!CurDAG->shouldOptForSize()) |
5534 | break; |
5535 | |
5536 | // Only handle i8/i16/i32/i64. |
5537 | if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) |
5538 | break; |
5539 | |
5540 | SDValue N0 = Node->getOperand(Num: 0); |
5541 | SDValue N1 = Node->getOperand(Num: 1); |
5542 | |
5543 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
5544 | if (!Cst) |
5545 | break; |
5546 | |
5547 | int64_t Val = Cst->getSExtValue(); |
5548 | |
// Make sure it's an immediate that is considered foldable.
// FIXME: Handle unsigned 32-bit immediates for 64-bit AND.
5551 | if (!isInt<8>(x: Val) && !isInt<32>(x: Val)) |
5552 | break; |
5553 | |
5554 | // If this can match to INC/DEC, let it go. |
5555 | if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) |
5556 | break; |
5557 | |
5558 | // Check if we should avoid folding this immediate. |
5559 | if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode())) |
5560 | break; |
5561 | |
5562 | // We should not fold the immediate. So we need a register form instead. |
5563 | unsigned ROpc, MOpc; |
5564 | switch (NVT.SimpleTy) { |
5565 | default: llvm_unreachable("Unexpected VT!" ); |
5566 | case MVT::i8: |
5567 | switch (Opcode) { |
5568 | default: llvm_unreachable("Unexpected opcode!" ); |
5569 | case ISD::ADD: |
5570 | ROpc = GET_ND_IF_ENABLED(X86::ADD8rr); |
5571 | MOpc = GET_ND_IF_ENABLED(X86::ADD8rm); |
5572 | break; |
5573 | case ISD::SUB: |
5574 | ROpc = GET_ND_IF_ENABLED(X86::SUB8rr); |
5575 | MOpc = GET_ND_IF_ENABLED(X86::SUB8rm); |
5576 | break; |
5577 | case ISD::AND: |
5578 | ROpc = GET_ND_IF_ENABLED(X86::AND8rr); |
5579 | MOpc = GET_ND_IF_ENABLED(X86::AND8rm); |
5580 | break; |
5581 | case ISD::OR: |
5582 | ROpc = GET_ND_IF_ENABLED(X86::OR8rr); |
5583 | MOpc = GET_ND_IF_ENABLED(X86::OR8rm); |
5584 | break; |
5585 | case ISD::XOR: |
5586 | ROpc = GET_ND_IF_ENABLED(X86::XOR8rr); |
5587 | MOpc = GET_ND_IF_ENABLED(X86::XOR8rm); |
5588 | break; |
5589 | } |
5590 | break; |
5591 | case MVT::i16: |
5592 | switch (Opcode) { |
5593 | default: llvm_unreachable("Unexpected opcode!" ); |
5594 | case ISD::ADD: |
5595 | ROpc = GET_ND_IF_ENABLED(X86::ADD16rr); |
5596 | MOpc = GET_ND_IF_ENABLED(X86::ADD16rm); |
5597 | break; |
5598 | case ISD::SUB: |
5599 | ROpc = GET_ND_IF_ENABLED(X86::SUB16rr); |
5600 | MOpc = GET_ND_IF_ENABLED(X86::SUB16rm); |
5601 | break; |
5602 | case ISD::AND: |
5603 | ROpc = GET_ND_IF_ENABLED(X86::AND16rr); |
5604 | MOpc = GET_ND_IF_ENABLED(X86::AND16rm); |
5605 | break; |
5606 | case ISD::OR: |
5607 | ROpc = GET_ND_IF_ENABLED(X86::OR16rr); |
5608 | MOpc = GET_ND_IF_ENABLED(X86::OR16rm); |
5609 | break; |
5610 | case ISD::XOR: |
5611 | ROpc = GET_ND_IF_ENABLED(X86::XOR16rr); |
5612 | MOpc = GET_ND_IF_ENABLED(X86::XOR16rm); |
5613 | break; |
5614 | } |
5615 | break; |
5616 | case MVT::i32: |
5617 | switch (Opcode) { |
5618 | default: llvm_unreachable("Unexpected opcode!" ); |
5619 | case ISD::ADD: |
5620 | ROpc = GET_ND_IF_ENABLED(X86::ADD32rr); |
5621 | MOpc = GET_ND_IF_ENABLED(X86::ADD32rm); |
5622 | break; |
5623 | case ISD::SUB: |
5624 | ROpc = GET_ND_IF_ENABLED(X86::SUB32rr); |
5625 | MOpc = GET_ND_IF_ENABLED(X86::SUB32rm); |
5626 | break; |
5627 | case ISD::AND: |
5628 | ROpc = GET_ND_IF_ENABLED(X86::AND32rr); |
5629 | MOpc = GET_ND_IF_ENABLED(X86::AND32rm); |
5630 | break; |
5631 | case ISD::OR: |
5632 | ROpc = GET_ND_IF_ENABLED(X86::OR32rr); |
5633 | MOpc = GET_ND_IF_ENABLED(X86::OR32rm); |
5634 | break; |
5635 | case ISD::XOR: |
5636 | ROpc = GET_ND_IF_ENABLED(X86::XOR32rr); |
5637 | MOpc = GET_ND_IF_ENABLED(X86::XOR32rm); |
5638 | break; |
5639 | } |
5640 | break; |
5641 | case MVT::i64: |
5642 | switch (Opcode) { |
5643 | default: llvm_unreachable("Unexpected opcode!" ); |
5644 | case ISD::ADD: |
5645 | ROpc = GET_ND_IF_ENABLED(X86::ADD64rr); |
5646 | MOpc = GET_ND_IF_ENABLED(X86::ADD64rm); |
5647 | break; |
5648 | case ISD::SUB: |
5649 | ROpc = GET_ND_IF_ENABLED(X86::SUB64rr); |
5650 | MOpc = GET_ND_IF_ENABLED(X86::SUB64rm); |
5651 | break; |
5652 | case ISD::AND: |
5653 | ROpc = GET_ND_IF_ENABLED(X86::AND64rr); |
5654 | MOpc = GET_ND_IF_ENABLED(X86::AND64rm); |
5655 | break; |
5656 | case ISD::OR: |
5657 | ROpc = GET_ND_IF_ENABLED(X86::OR64rr); |
5658 | MOpc = GET_ND_IF_ENABLED(X86::OR64rm); |
5659 | break; |
5660 | case ISD::XOR: |
5661 | ROpc = GET_ND_IF_ENABLED(X86::XOR64rr); |
5662 | MOpc = GET_ND_IF_ENABLED(X86::XOR64rm); |
5663 | break; |
5664 | } |
5665 | break; |
5666 | } |
5667 | |
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5669 | |
// If this is not a subtract, we can still try to fold a load.
5671 | if (Opcode != ISD::SUB) { |
5672 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5673 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5674 | SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5675 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5676 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5677 | // Update the chain. |
5678 | ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2)); |
5679 | // Record the mem-refs |
5680 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5681 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5682 | CurDAG->RemoveDeadNode(N: Node); |
5683 | return; |
5684 | } |
5685 | } |
5686 | |
5687 | CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1); |
5688 | return; |
5689 | } |
5690 | |
5691 | case X86ISD::SMUL: |
5692 | // i16/i32/i64 are handled with isel patterns. |
5693 | if (NVT != MVT::i8) |
5694 | break; |
5695 | [[fallthrough]]; |
5696 | case X86ISD::UMUL: { |
5697 | SDValue N0 = Node->getOperand(Num: 0); |
5698 | SDValue N1 = Node->getOperand(Num: 1); |
5699 | |
5700 | unsigned LoReg, ROpc, MOpc; |
5701 | switch (NVT.SimpleTy) { |
5702 | default: llvm_unreachable("Unsupported VT!" ); |
5703 | case MVT::i8: |
5704 | LoReg = X86::AL; |
5705 | ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; |
5706 | MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; |
5707 | break; |
5708 | case MVT::i16: |
5709 | LoReg = X86::AX; |
5710 | ROpc = X86::MUL16r; |
5711 | MOpc = X86::MUL16m; |
5712 | break; |
5713 | case MVT::i32: |
5714 | LoReg = X86::EAX; |
5715 | ROpc = X86::MUL32r; |
5716 | MOpc = X86::MUL32m; |
5717 | break; |
5718 | case MVT::i64: |
5719 | LoReg = X86::RAX; |
5720 | ROpc = X86::MUL64r; |
5721 | MOpc = X86::MUL64m; |
5722 | break; |
5723 | } |
5724 | |
5725 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5726 | bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5727 | // Multiply is commutative. |
5728 | if (!FoldedLoad) { |
5729 | FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5730 | if (FoldedLoad) |
5731 | std::swap(a&: N0, b&: N1); |
5732 | } |
5733 | |
5734 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5735 | N: N0, Glue: SDValue()).getValue(R: 1); |
5736 | |
5737 | MachineSDNode *CNode; |
5738 | if (FoldedLoad) { |
5739 | // i16/i32/i64 use an instruction that produces a low and high result even |
5740 | // though only the low result is used. |
5741 | SDVTList VTs; |
5742 | if (NVT == MVT::i8) |
5743 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5744 | else |
5745 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other); |
5746 | |
5747 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5748 | InGlue }; |
5749 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5750 | |
5751 | // Update the chain. |
5752 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); |
5753 | // Record the mem-refs |
5754 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5755 | } else { |
5756 | // i16/i32/i64 use an instruction that produces a low and high result even |
5757 | // though only the low result is used. |
5758 | SDVTList VTs; |
5759 | if (NVT == MVT::i8) |
5760 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32); |
5761 | else |
5762 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32); |
5763 | |
5764 | CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue}); |
5765 | } |
5766 | |
5767 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5768 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); |
5769 | CurDAG->RemoveDeadNode(N: Node); |
5770 | return; |
5771 | } |
5772 | |
5773 | case ISD::SMUL_LOHI: |
5774 | case ISD::UMUL_LOHI: { |
5775 | SDValue N0 = Node->getOperand(Num: 0); |
5776 | SDValue N1 = Node->getOperand(Num: 1); |
5777 | |
5778 | unsigned Opc, MOpc; |
5779 | unsigned LoReg, HiReg; |
5780 | bool IsSigned = Opcode == ISD::SMUL_LOHI; |
5781 | bool UseMULX = !IsSigned && Subtarget->hasBMI2(); |
5782 | bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); |
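// MULX (BMI2) takes one source implicitly in EDX/RDX, writes both product
// halves to explicit registers and does not touch EFLAGS; the Hi-only forms
// are used when the low half of the result is dead.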
5783 | switch (NVT.SimpleTy) { |
5784 | default: llvm_unreachable("Unsupported VT!" ); |
5785 | case MVT::i32: |
5786 | Opc = UseMULXHi ? X86::MULX32Hrr |
5787 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr) |
5788 | : IsSigned ? X86::IMUL32r |
5789 | : X86::MUL32r; |
5790 | MOpc = UseMULXHi ? X86::MULX32Hrm |
5791 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm) |
5792 | : IsSigned ? X86::IMUL32m |
5793 | : X86::MUL32m; |
5794 | LoReg = UseMULX ? X86::EDX : X86::EAX; |
5795 | HiReg = X86::EDX; |
5796 | break; |
5797 | case MVT::i64: |
5798 | Opc = UseMULXHi ? X86::MULX64Hrr |
5799 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr) |
5800 | : IsSigned ? X86::IMUL64r |
5801 | : X86::MUL64r; |
5802 | MOpc = UseMULXHi ? X86::MULX64Hrm |
5803 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm) |
5804 | : IsSigned ? X86::IMUL64m |
5805 | : X86::MUL64m; |
5806 | LoReg = UseMULX ? X86::RDX : X86::RAX; |
5807 | HiReg = X86::RDX; |
5808 | break; |
5809 | } |
5810 | |
5811 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5812 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5813 | // Multiply is commutative. |
5814 | if (!foldedLoad) { |
5815 | foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5816 | if (foldedLoad) |
5817 | std::swap(a&: N0, b&: N1); |
5818 | } |
5819 | |
5820 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5821 | N: N0, Glue: SDValue()).getValue(R: 1); |
5822 | SDValue ResHi, ResLo; |
5823 | if (foldedLoad) { |
5824 | SDValue Chain; |
5825 | MachineSDNode *CNode = nullptr; |
5826 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5827 | InGlue }; |
5828 | if (UseMULXHi) { |
5829 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
5830 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5831 | ResHi = SDValue(CNode, 0); |
5832 | Chain = SDValue(CNode, 1); |
5833 | } else if (UseMULX) { |
5834 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other); |
5835 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5836 | ResHi = SDValue(CNode, 0); |
5837 | ResLo = SDValue(CNode, 1); |
5838 | Chain = SDValue(CNode, 2); |
5839 | } else { |
5840 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
5841 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5842 | Chain = SDValue(CNode, 0); |
5843 | InGlue = SDValue(CNode, 1); |
5844 | } |
5845 | |
5846 | // Update the chain. |
5847 | ReplaceUses(F: N1.getValue(R: 1), T: Chain); |
5848 | // Record the mem-refs |
5849 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5850 | } else { |
5851 | SDValue Ops[] = { N1, InGlue }; |
5852 | if (UseMULXHi) { |
5853 | SDVTList VTs = CurDAG->getVTList(VT: NVT); |
5854 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5855 | ResHi = SDValue(CNode, 0); |
5856 | } else if (UseMULX) { |
5857 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT); |
5858 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5859 | ResHi = SDValue(CNode, 0); |
5860 | ResLo = SDValue(CNode, 1); |
5861 | } else { |
5862 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue); |
5863 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5864 | InGlue = SDValue(CNode, 0); |
5865 | } |
5866 | } |
5867 | |
5868 | // Copy the low half of the result, if it is needed. |
5869 | if (!SDValue(Node, 0).use_empty()) { |
5870 | if (!ResLo) { |
5871 | assert(LoReg && "Register for low half is not defined!" ); |
5872 | ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5873 | VT: NVT, Glue: InGlue); |
5874 | InGlue = ResLo.getValue(R: 2); |
5875 | } |
5876 | ReplaceUses(F: SDValue(Node, 0), T: ResLo); |
5877 | LLVM_DEBUG(dbgs() << "=> " ; ResLo.getNode()->dump(CurDAG); |
5878 | dbgs() << '\n'); |
5879 | } |
5880 | // Copy the high half of the result, if it is needed. |
5881 | if (!SDValue(Node, 1).use_empty()) { |
5882 | if (!ResHi) { |
5883 | assert(HiReg && "Register for high half is not defined!" ); |
5884 | ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg, |
5885 | VT: NVT, Glue: InGlue); |
5886 | InGlue = ResHi.getValue(R: 2); |
5887 | } |
5888 | ReplaceUses(F: SDValue(Node, 1), T: ResHi); |
5889 | LLVM_DEBUG(dbgs() << "=> " ; ResHi.getNode()->dump(CurDAG); |
5890 | dbgs() << '\n'); |
5891 | } |
5892 | |
5893 | CurDAG->RemoveDeadNode(N: Node); |
5894 | return; |
5895 | } |
5896 | |
5897 | case ISD::SDIVREM: |
5898 | case ISD::UDIVREM: { |
5899 | SDValue N0 = Node->getOperand(Num: 0); |
5900 | SDValue N1 = Node->getOperand(Num: 1); |
5901 | |
5902 | unsigned ROpc, MOpc; |
5903 | bool isSigned = Opcode == ISD::SDIVREM; |
5904 | if (!isSigned) { |
5905 | switch (NVT.SimpleTy) { |
5906 | default: llvm_unreachable("Unsupported VT!" ); |
5907 | case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; |
5908 | case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; |
5909 | case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; |
5910 | case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; |
5911 | } |
5912 | } else { |
5913 | switch (NVT.SimpleTy) { |
5914 | default: llvm_unreachable("Unsupported VT!" ); |
5915 | case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; |
5916 | case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; |
5917 | case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; |
5918 | case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; |
5919 | } |
5920 | } |
5921 | |
5922 | unsigned LoReg, HiReg, ClrReg; |
5923 | unsigned SExtOpcode; |
5924 | switch (NVT.SimpleTy) { |
5925 | default: llvm_unreachable("Unsupported VT!" ); |
5926 | case MVT::i8: |
5927 | LoReg = X86::AL; ClrReg = HiReg = X86::AH; |
5928 | SExtOpcode = 0; // Not used. |
5929 | break; |
5930 | case MVT::i16: |
5931 | LoReg = X86::AX; HiReg = X86::DX; |
5932 | ClrReg = X86::DX; |
5933 | SExtOpcode = X86::CWD; |
5934 | break; |
5935 | case MVT::i32: |
5936 | LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; |
5937 | SExtOpcode = X86::CDQ; |
5938 | break; |
5939 | case MVT::i64: |
5940 | LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; |
5941 | SExtOpcode = X86::CQO; |
5942 | break; |
5943 | } |
5944 | |
5945 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5946 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5947 | bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0); |
5948 | |
5949 | SDValue InGlue; |
5950 | if (NVT == MVT::i8) { |
// Special case for div8: just use a sign- or zero-extending move into AX
// so the upper 8 bits (AH) hold the correct high part of the dividend.
5953 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; |
5954 | MachineSDNode *Move; |
5955 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5956 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5957 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 |
5958 | : X86::MOVZX16rm8; |
5959 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops); |
5960 | Chain = SDValue(Move, 1); |
5961 | ReplaceUses(F: N0.getValue(R: 1), T: Chain); |
5962 | // Record the mem-refs |
5963 | CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5964 | } else { |
5965 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 |
5966 | : X86::MOVZX16rr8; |
5967 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0); |
5968 | Chain = CurDAG->getEntryNode(); |
5969 | } |
5970 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0), |
5971 | Glue: SDValue()); |
5972 | InGlue = Chain.getValue(R: 1); |
5973 | } else { |
5974 | InGlue = |
5975 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, |
5976 | Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1); |
5977 | if (isSigned && !signBitIsZero) { |
5978 | // Sign extend the low part into the high part. |
5979 | InGlue = |
5980 | SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0); |
5981 | } else { |
5982 | // Zero out the high part, effectively zero extending the input. |
5983 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
5984 | SDValue ClrNode = |
5985 | SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0); |
5986 | switch (NVT.SimpleTy) { |
5987 | case MVT::i16: |
5988 | ClrNode = |
5989 | SDValue(CurDAG->getMachineNode( |
5990 | Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode, |
5991 | Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl, |
5992 | VT: MVT::i32)), |
5993 | 0); |
5994 | break; |
5995 | case MVT::i32: |
5996 | break; |
5997 | case MVT::i64: |
5998 | ClrNode = |
5999 | SDValue(CurDAG->getMachineNode( |
6000 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
6001 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode, |
6002 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, |
6003 | VT: MVT::i32)), |
6004 | 0); |
6005 | break; |
6006 | default: |
6007 | llvm_unreachable("Unexpected division source" ); |
6008 | } |
6009 | |
6010 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg, |
6011 | N: ClrNode, Glue: InGlue).getValue(R: 1); |
6012 | } |
6013 | } |
6014 | |
6015 | if (foldedLoad) { |
6016 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
6017 | InGlue }; |
6018 | MachineSDNode *CNode = |
6019 | CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops); |
6020 | InGlue = SDValue(CNode, 1); |
6021 | // Update the chain. |
6022 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0)); |
6023 | // Record the mem-refs |
6024 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
6025 | } else { |
6026 | InGlue = |
6027 | SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0); |
6028 | } |
6029 | |
6030 | // Prevent use of AH in a REX instruction by explicitly copying it to |
6031 | // an ABCD_L register. |
6032 | // |
6033 | // The current assumption of the register allocator is that isel |
6034 | // won't generate explicit references to the GR8_ABCD_H registers. If |
6035 | // the allocator and/or the backend get enhanced to be more robust in |
6036 | // that regard, this can be, and should be, removed. |
6037 | if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { |
6038 | SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8); |
6039 | unsigned AHExtOpcode = |
6040 | isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; |
6041 | |
6042 | SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32, |
6043 | VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue); |
6044 | SDValue Result(RNode, 0); |
6045 | InGlue = SDValue(RNode, 1); |
6046 | |
6047 | Result = |
6048 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result); |
6049 | |
6050 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
6051 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
6052 | dbgs() << '\n'); |
6053 | } |
6054 | // Copy the division (low) result, if it is needed. |
6055 | if (!SDValue(Node, 0).use_empty()) { |
6056 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
6057 | Reg: LoReg, VT: NVT, Glue: InGlue); |
6058 | InGlue = Result.getValue(R: 2); |
6059 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6060 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
6061 | dbgs() << '\n'); |
6062 | } |
6063 | // Copy the remainder (high) result, if it is needed. |
6064 | if (!SDValue(Node, 1).use_empty()) { |
6065 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
6066 | Reg: HiReg, VT: NVT, Glue: InGlue); |
6067 | InGlue = Result.getValue(R: 2); |
6068 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
6069 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
6070 | dbgs() << '\n'); |
6071 | } |
6072 | CurDAG->RemoveDeadNode(N: Node); |
6073 | return; |
6074 | } |
6075 | |
6076 | case X86ISD::FCMP: |
6077 | case X86ISD::STRICT_FCMP: |
6078 | case X86ISD::STRICT_FCMPS: { |
6079 | bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || |
6080 | Node->getOpcode() == X86ISD::STRICT_FCMPS; |
6081 | SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0); |
6082 | SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1); |
6083 | |
6084 | // Save the original VT of the compare. |
6085 | MVT CmpVT = N0.getSimpleValueType(); |
6086 | |
6087 | // Floating point needs special handling if we don't have FCOMI. |
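// In that case the condition codes live in the x87 status word: compare,
// store FPSW to AX with FNSTSW, then move AH into EFLAGS with SAHF.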
6088 | if (Subtarget->canUseCMOV()) |
6089 | break; |
6090 | |
6091 | bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; |
6092 | |
6093 | unsigned Opc; |
6094 | switch (CmpVT.SimpleTy) { |
6095 | default: llvm_unreachable("Unexpected type!" ); |
6096 | case MVT::f32: |
6097 | Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; |
6098 | break; |
6099 | case MVT::f64: |
6100 | Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; |
6101 | break; |
6102 | case MVT::f80: |
6103 | Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; |
6104 | break; |
6105 | } |
6106 | |
6107 | SDValue Chain = |
6108 | IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode(); |
6109 | SDValue Glue; |
6110 | if (IsStrictCmp) { |
6111 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
6112 | Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0); |
6113 | Glue = Chain.getValue(R: 1); |
6114 | } else { |
6115 | Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0); |
6116 | } |
6117 | |
6118 | // Move FPSW to AX. |
6119 | SDValue FNSTSW = |
6120 | SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0); |
6121 | |
// Extract the upper 8 bits of AX.
SDValue Extract =
6124 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW); |
6125 | |
6126 | // Move AH into flags. |
6127 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
6128 | assert(Subtarget->canUseLAHFSAHF() && |
6129 | "Target doesn't support SAHF or FCOMI?" ); |
6130 | SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue()); |
6131 | Chain = AH; |
6132 | SDValue SAHF = SDValue( |
6133 | CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0); |
6134 | |
6135 | if (IsStrictCmp) |
6136 | ReplaceUses(F: SDValue(Node, 1), T: Chain); |
6137 | |
6138 | ReplaceUses(F: SDValue(Node, 0), T: SAHF); |
6139 | CurDAG->RemoveDeadNode(N: Node); |
6140 | return; |
6141 | } |
6142 | |
6143 | case X86ISD::CMP: { |
6144 | SDValue N0 = Node->getOperand(Num: 0); |
6145 | SDValue N1 = Node->getOperand(Num: 1); |
6146 | |
6147 | // Optimizations for TEST compares. |
6148 | if (!isNullConstant(V: N1)) |
6149 | break; |
6150 | |
6151 | // Save the original VT of the compare. |
6152 | MVT CmpVT = N0.getSimpleValueType(); |
6153 | |
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6155 | // by a test instruction. The test should be removed later by |
6156 | // analyzeCompare if we are using only the zero flag. |
6157 | // TODO: Should we check the users and use the BEXTR flags directly? |
6158 | if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { |
6159 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) { |
6160 | unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr |
6161 | : X86::TEST32rr; |
6162 | SDValue BEXTR = SDValue(NewNode, 0); |
6163 | NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR); |
6164 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6165 | CurDAG->RemoveDeadNode(N: Node); |
6166 | return; |
6167 | } |
6168 | } |
6169 | |
6170 | // We can peek through truncates, but we need to be careful below. |
6171 | if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) |
6172 | N0 = N0.getOperand(i: 0); |
6173 | |
6174 | // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to |
6175 | // use a smaller encoding. |
6176 | // Look past the truncate if CMP is the only use of it. |
6177 | if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && |
6178 | N0.getValueType() != MVT::i8) { |
6179 | auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
6180 | if (!MaskC) |
6181 | break; |
6182 | |
6183 | // We may have looked through a truncate so mask off any bits that |
6184 | // shouldn't be part of the compare. |
6185 | uint64_t Mask = MaskC->getZExtValue(); |
6186 | Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits()); |
6187 | |
6188 | // Check if we can replace AND+IMM{32,64} with a shift. This is possible |
6189 | // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the |
6190 | // zero flag. |
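// E.g. testing (and X, 0x0000FFFF00000000) against zero becomes a
// shrq $32 followed by a 16-bit register TEST, avoiding a movabsq of the
// mask.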
6191 | if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) && |
6192 | onlyUsesZeroFlag(Flags: SDValue(Node, 0))) { |
6193 | unsigned ShiftOpcode = ISD::DELETED_NODE; |
6194 | unsigned ShiftAmt; |
6195 | unsigned SubRegIdx; |
6196 | MVT SubRegVT; |
6197 | unsigned TestOpcode; |
6198 | unsigned LeadingZeros = llvm::countl_zero(Val: Mask); |
6199 | unsigned TrailingZeros = llvm::countr_zero(Val: Mask); |
6200 | |
6201 | // With leading/trailing zeros, the transform is profitable if we can |
6202 | // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without |
6203 | // incurring any extra register moves. |
6204 | bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse(); |
6205 | if (LeadingZeros == 0 && SavesBytes) { |
6206 | // If the mask covers the most significant bit, then we can replace |
6207 | // TEST+AND with a SHR and check eflags. |
6208 | // This emits a redundant TEST which is subsequently eliminated. |
6209 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6210 | ShiftAmt = TrailingZeros; |
6211 | SubRegIdx = 0; |
6212 | TestOpcode = X86::TEST64rr; |
6213 | } else if (TrailingZeros == 0 && SavesBytes) { |
6214 | // If the mask covers the least significant bit, then we can replace |
6215 | // TEST+AND with a SHL and check eflags. |
6216 | // This emits a redundant TEST which is subsequently eliminated. |
6217 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri); |
6218 | ShiftAmt = LeadingZeros; |
6219 | SubRegIdx = 0; |
6220 | TestOpcode = X86::TEST64rr; |
6221 | } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) { |
6222 | // If the shifted mask extends into the high half and is 8/16/32 bits |
6223 | // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. |
6224 | unsigned PopCount = 64 - LeadingZeros - TrailingZeros; |
6225 | if (PopCount == 8) { |
6226 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6227 | ShiftAmt = TrailingZeros; |
6228 | SubRegIdx = X86::sub_8bit; |
6229 | SubRegVT = MVT::i8; |
6230 | TestOpcode = X86::TEST8rr; |
6231 | } else if (PopCount == 16) { |
6232 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6233 | ShiftAmt = TrailingZeros; |
6234 | SubRegIdx = X86::sub_16bit; |
6235 | SubRegVT = MVT::i16; |
6236 | TestOpcode = X86::TEST16rr; |
6237 | } else if (PopCount == 32) { |
6238 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6239 | ShiftAmt = TrailingZeros; |
6240 | SubRegIdx = X86::sub_32bit; |
6241 | SubRegVT = MVT::i32; |
6242 | TestOpcode = X86::TEST32rr; |
6243 | } |
6244 | } |
6245 | if (ShiftOpcode != ISD::DELETED_NODE) { |
6246 | SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64); |
6247 | SDValue Shift = SDValue( |
6248 | CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32, |
6249 | Op1: N0.getOperand(i: 0), Op2: ShiftC), |
6250 | 0); |
6251 | if (SubRegIdx != 0) { |
6252 | Shift = |
6253 | CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift); |
6254 | } |
6255 | MachineSDNode *Test = |
6256 | CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift); |
6257 | ReplaceNode(F: Node, T: Test); |
6258 | return; |
6259 | } |
6260 | } |
6261 | |
6262 | MVT VT; |
6263 | int SubRegOp; |
6264 | unsigned ROpc, MOpc; |
6265 | |
6266 | // For each of these checks we need to be careful if the sign flag is |
// being used. It is only safe to use the sign flag in two cases: either
// the sign bit in the shrunken mask is zero, or the final test
6269 | // size is equal to the original compare size. |
6270 | |
6271 | if (isUInt<8>(x: Mask) && |
6272 | (!(Mask & 0x80) || CmpVT == MVT::i8 || |
6273 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6274 | // For example, convert "testl %eax, $8" to "testb %al, $8" |
6275 | VT = MVT::i8; |
6276 | SubRegOp = X86::sub_8bit; |
6277 | ROpc = X86::TEST8ri; |
6278 | MOpc = X86::TEST8mi; |
6279 | } else if (OptForMinSize && isUInt<16>(x: Mask) && |
6280 | (!(Mask & 0x8000) || CmpVT == MVT::i16 || |
6281 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6282 | // For example, "testl %eax, $32776" to "testw %ax, $32776". |
6283 | // NOTE: We only want to form TESTW instructions if optimizing for |
6284 | // min size. Otherwise we only save one byte and possibly get a length |
6285 | // changing prefix penalty in the decoders. |
6286 | VT = MVT::i16; |
6287 | SubRegOp = X86::sub_16bit; |
6288 | ROpc = X86::TEST16ri; |
6289 | MOpc = X86::TEST16mi; |
6290 | } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 && |
6291 | ((!(Mask & 0x80000000) && |
// Without minsize, 16-bit compares can get here, so we need to
6293 | // be sure we calculate the correct sign flag if needed. |
6294 | (CmpVT != MVT::i16 || !(Mask & 0x8000))) || |
6295 | CmpVT == MVT::i32 || |
6296 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6297 | // For example, "testq %rax, $268468232" to "testl %eax, $268468232". |
6298 | // NOTE: We only want to run that transform if N0 is 32 or 64 bits. |
// Otherwise, we find ourselves in a position where we have to do
6300 | // promotion. If previous passes did not promote the and, we assume |
6301 | // they had a good reason not to and do not promote here. |
6302 | VT = MVT::i32; |
6303 | SubRegOp = X86::sub_32bit; |
6304 | ROpc = X86::TEST32ri; |
6305 | MOpc = X86::TEST32mi; |
6306 | } else { |
6307 | // No eligible transformation was found. |
6308 | break; |
6309 | } |
6310 | |
6311 | SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT); |
6312 | SDValue Reg = N0.getOperand(i: 0); |
6313 | |
6314 | // Emit a testl or testw. |
6315 | MachineSDNode *NewNode; |
6316 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
6317 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
6318 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) { |
6319 | if (!LoadN->isSimple()) { |
6320 | unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits(); |
6321 | if ((MOpc == X86::TEST8mi && NumVolBits != 8) || |
6322 | (MOpc == X86::TEST16mi && NumVolBits != 16) || |
6323 | (MOpc == X86::TEST32mi && NumVolBits != 32)) |
6324 | break; |
6325 | } |
6326 | } |
6327 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
6328 | Reg.getOperand(i: 0) }; |
6329 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops); |
6330 | // Update the chain. |
6331 | ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1)); |
6332 | // Record the mem-refs |
6333 | CurDAG->setNodeMemRefs(N: NewNode, |
6334 | NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()}); |
6335 | } else { |
6336 | // Extract the subregister if necessary. |
6337 | if (N0.getValueType() != VT) |
6338 | Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg); |
6339 | |
6340 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm); |
6341 | } |
6342 | // Replace CMP with TEST. |
6343 | ReplaceNode(F: Node, T: NewNode); |
6344 | return; |
6345 | } |
6346 | break; |
6347 | } |
6348 | case X86ISD::PCMPISTR: { |
6349 | if (!Subtarget->hasSSE42()) |
6350 | break; |
6351 | |
6352 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6353 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6354 | // We can't fold a load if we are going to make two instructions. |
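// (If both the index and the mask are needed, PCMPISTRI and PCMPISTRM are
// both emitted, and a folded load would then be read twice.)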
6355 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6356 | |
6357 | MachineSDNode *CNode; |
6358 | if (NeedMask) { |
6359 | unsigned ROpc = |
6360 | Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri; |
6361 | unsigned MOpc = |
6362 | Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi; |
6363 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node); |
6364 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6365 | } |
6366 | if (NeedIndex || !NeedMask) { |
6367 | unsigned ROpc = |
6368 | Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri; |
6369 | unsigned MOpc = |
6370 | Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi; |
6371 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node); |
6372 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6373 | } |
6374 | |
6375 | // Connect the flag usage to the last instruction created. |
6376 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6377 | CurDAG->RemoveDeadNode(N: Node); |
6378 | return; |
6379 | } |
6380 | case X86ISD::PCMPESTR: { |
6381 | if (!Subtarget->hasSSE42()) |
6382 | break; |
6383 | |
6384 | // Copy the two implicit register inputs. |
6385 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX, |
6386 | N: Node->getOperand(Num: 1), |
6387 | Glue: SDValue()).getValue(R: 1); |
6388 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX, |
6389 | N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1); |
6390 | |
6391 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6392 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6393 | // We can't fold a load if we are going to make two instructions. |
6394 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6395 | |
6396 | MachineSDNode *CNode; |
6397 | if (NeedMask) { |
6398 | unsigned ROpc = |
6399 | Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri; |
6400 | unsigned MOpc = |
6401 | Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi; |
6402 | CNode = |
6403 | emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue); |
6404 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6405 | } |
6406 | if (NeedIndex || !NeedMask) { |
6407 | unsigned ROpc = |
6408 | Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri; |
6409 | unsigned MOpc = |
6410 | Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi; |
6411 | CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue); |
6412 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6413 | } |
6414 | // Connect the flag usage to the last instruction created. |
6415 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6416 | CurDAG->RemoveDeadNode(N: Node); |
6417 | return; |
6418 | } |
6419 | |
6420 | case ISD::SETCC: { |
6421 | if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue())) |
6422 | return; |
6423 | |
6424 | break; |
6425 | } |
6426 | |
6427 | case ISD::STORE: |
6428 | if (foldLoadStoreIntoMemOperand(Node)) |
6429 | return; |
6430 | break; |
6431 | |
6432 | case X86ISD::SETCC_CARRY: { |
6433 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6434 | SDValue Result; |
6435 | if (Subtarget->hasSBBDepBreaking()) { |
6436 | // We have to do this manually because tblgen will put the eflags copy in |
6437 | // the wrong place if we use an extract_subreg in the pattern. |
// Copy flags to the EFLAGS register and glue it to the next node.
6439 | SDValue EFLAGS = |
6440 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
6441 | N: Node->getOperand(Num: 1), Glue: SDValue()); |
6442 | |
// Create a 64-bit instruction if the result is 64 bits; otherwise use the
// 32-bit version.
6445 | unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; |
6446 | MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
6447 | Result = SDValue( |
6448 | CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)), |
6449 | 0); |
6450 | } else { |
6451 | // The target does not recognize sbb with the same reg operand as a |
6452 | // no-source idiom, so we explicitly zero the input values. |
6453 | Result = getSBBZero(N: Node); |
6454 | } |
6455 | |
6456 | // For less than 32-bits we need to extract from the 32-bit node. |
6457 | if (VT == MVT::i8 || VT == MVT::i16) { |
6458 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6459 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6460 | } |
6461 | |
6462 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6463 | CurDAG->RemoveDeadNode(N: Node); |
6464 | return; |
6465 | } |
6466 | case X86ISD::SBB: { |
6467 | if (isNullConstant(V: Node->getOperand(Num: 0)) && |
6468 | isNullConstant(V: Node->getOperand(Num: 1))) { |
6469 | SDValue Result = getSBBZero(N: Node); |
6470 | |
6471 | // Replace the flag use. |
6472 | ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1)); |
6473 | |
6474 | // Replace the result use. |
6475 | if (!SDValue(Node, 0).use_empty()) { |
6476 | // For less than 32-bits we need to extract from the 32-bit node. |
6477 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6478 | if (VT == MVT::i8 || VT == MVT::i16) { |
6479 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6480 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6481 | } |
6482 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6483 | } |
6484 | |
6485 | CurDAG->RemoveDeadNode(N: Node); |
6486 | return; |
6487 | } |
6488 | break; |
6489 | } |
6490 | case X86ISD::MGATHER: { |
6491 | auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node); |
6492 | SDValue IndexOp = Mgt->getIndex(); |
6493 | SDValue Mask = Mgt->getMask(); |
6494 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6495 | MVT ValueVT = Node->getSimpleValueType(ResNo: 0); |
6496 | MVT MaskVT = Mask.getSimpleValueType(); |
6497 | |
6498 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking here, based on what a type
// constraint would say, just like table-based isel.
6501 | if (!ValueVT.isVector() || !MaskVT.isVector()) |
6502 | break; |
6503 | |
6504 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6505 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6506 | |
6507 | bool IsFP = ValueSVT.isFloatingPoint(); |
6508 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6509 | |
6510 | unsigned Opc = 0; |
6511 | bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; |
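// AVX512 gathers take a k-register (v*i1) mask; the AVX/AVX2 forms below
// take an integer vector mask of the same type as the data (see the assert
// below).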
6512 | if (AVX512Gather) { |
6513 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6514 | Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; |
6515 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6516 | Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; |
6517 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6518 | Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; |
6519 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6520 | Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; |
6521 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6522 | Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; |
6523 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6524 | Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; |
6525 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6526 | Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; |
6527 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6528 | Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; |
6529 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6530 | Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; |
6531 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6532 | Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; |
6533 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6534 | Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; |
6535 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6536 | Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; |
6537 | } else { |
6538 | assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && |
6539 | "Unexpected mask VT!" ); |
6540 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6541 | Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; |
6542 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6543 | Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; |
6544 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6545 | Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; |
6546 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6547 | Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; |
6548 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6549 | Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; |
6550 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6551 | Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; |
6552 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6553 | Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; |
6554 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6555 | Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; |
6556 | } |
6557 | |
6558 | if (!Opc) |
6559 | break; |
6560 | |
6561 | SDValue Base, Scale, Index, Disp, Segment; |
6562 | if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(), |
6563 | Base, Scale, Index, Disp, Segment)) |
6564 | break; |
6565 | |
6566 | SDValue PassThru = Mgt->getPassThru(); |
6567 | SDValue Chain = Mgt->getChain(); |
6568 | // Gather instructions have a mask output not in the ISD node. |
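// The machine node's results are (data, mask, chain) while the ISD node
// only exposes (data, chain), so node result 1 is mapped to machine result
// 2 below.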
6569 | SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other); |
6570 | |
6571 | MachineSDNode *NewNode; |
6572 | if (AVX512Gather) { |
6573 | SDValue Ops[] = {PassThru, Mask, Base, Scale, |
6574 | Index, Disp, Segment, Chain}; |
6575 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6576 | } else { |
6577 | SDValue Ops[] = {PassThru, Base, Scale, Index, |
6578 | Disp, Segment, Mask, Chain}; |
6579 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6580 | } |
6581 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()}); |
6582 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6583 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2)); |
6584 | CurDAG->RemoveDeadNode(N: Node); |
6585 | return; |
6586 | } |
6587 | case X86ISD::MSCATTER: { |
6588 | auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node); |
6589 | SDValue Value = Sc->getValue(); |
6590 | SDValue IndexOp = Sc->getIndex(); |
6591 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6592 | MVT ValueVT = Value.getSimpleValueType(); |
6593 | |
6594 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking here, based on what a type
// constraint would say, just like table-based isel.
6597 | if (!ValueVT.isVector()) |
6598 | break; |
6599 | |
6600 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6601 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6602 | |
6603 | bool IsFP = ValueSVT.isFloatingPoint(); |
6604 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6605 | |
6606 | unsigned Opc; |
6607 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6608 | Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; |
6609 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6610 | Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; |
6611 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6612 | Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; |
6613 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6614 | Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; |
6615 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6616 | Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; |
6617 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6618 | Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; |
6619 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6620 | Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; |
6621 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6622 | Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; |
6623 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6624 | Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; |
6625 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6626 | Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; |
6627 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6628 | Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; |
6629 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6630 | Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; |
6631 | else |
6632 | break; |
6633 | |
6634 | SDValue Base, Scale, Index, Disp, Segment; |
6635 | if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(), |
6636 | Base, Scale, Index, Disp, Segment)) |
6637 | break; |
6638 | |
6639 | SDValue Mask = Sc->getMask(); |
6640 | SDValue Chain = Sc->getChain(); |
6641 | // Scatter instructions have a mask output not in the ISD node. |
6642 | SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other); |
6643 | SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; |
6644 | |
6645 | MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6646 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()}); |
6647 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1)); |
6648 | CurDAG->RemoveDeadNode(N: Node); |
6649 | return; |
6650 | } |
6651 | case ISD::PREALLOCATED_SETUP: { |
6652 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6653 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6654 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6655 | SDValue Chain = Node->getOperand(Num: 0); |
6656 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6657 | MachineSDNode *New = CurDAG->getMachineNode( |
6658 | Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain); |
6659 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain |
6660 | CurDAG->RemoveDeadNode(N: Node); |
6661 | return; |
6662 | } |
6663 | case ISD::PREALLOCATED_ARG: { |
6664 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6665 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6666 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6667 | SDValue Chain = Node->getOperand(Num: 0); |
6668 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6669 | SDValue ArgIndex = Node->getOperand(Num: 2); |
6670 | SDValue Ops[3]; |
6671 | Ops[0] = CallIdValue; |
6672 | Ops[1] = ArgIndex; |
6673 | Ops[2] = Chain; |
6674 | MachineSDNode *New = CurDAG->getMachineNode( |
6675 | Opcode: TargetOpcode::PREALLOCATED_ARG, dl, |
6676 | VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()), |
6677 | VT2: MVT::Other), |
6678 | Ops); |
6679 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer |
6680 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain |
6681 | CurDAG->RemoveDeadNode(N: Node); |
6682 | return; |
6683 | } |
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

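    // The wide forms implicitly read and write XMM0-XMM7, so copy the eight
    // data operands into those physical registers and glue the copies
    // together so they stay attached to the instruction.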
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
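  // POP_FROM_X87_REG is just a CopyFromReg from the given x87 register, with
  // an optional glue operand threaded through.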
  case X86ISD::POP_FROM_X87_REG: {
    SDValue Chain = Node->getOperand(0);
    Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
    SDValue Glue;
    if (Node->getNumValues() == 3)
      Glue = Node->getOperand(2);
    SDValue Copy =
        CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
    ReplaceNode(Node, Copy.getNode());
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable        ??
  case InlineAsm::ConstraintCode::v: // not offsetable    ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

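  // Every X86 memory constraint expands to the full 5-operand address form:
  // base, scale, index, displacement, segment.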
  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}