1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
16#include "X86MachineFunctionInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/KnownBits.h"
33#include "llvm/Support/MathExtras.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

// If enabled, AND mask immediates may have extra (known-irrelevant) bits set
// so the constant fits a shorter encoding (e.g. a sign-extended imm8).
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(Val: true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

// If enabled, an aligned any-extending load may be replaced by a wider load
// of the extended type.
static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(Val: true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

// NOTE(review): declared here, defined elsewhere in the backend — shared
// option controlling CET indirect-branch-tracking lowering.
extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;        // Valid when BaseType == RegBase.
    int Base_FrameIndex = 0; // Valid when BaseType == FrameIndexBase.

    unsigned Scale = 1; // Index scale factor (1/2/4/8 on x86).
    SDValue IndexReg;
    int32_t Disp = 0;   // Constant displacement.
    SDValue Segment;    // Optional segment-register override.
    // At most one of the symbolic-displacement fields below may be set;
    // see hasSymbolicDisplacement().
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment; // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
    // If set, the index register value must be negated before use (the
    // matcher saw a subtraction of the index).
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    /// True if any symbolic displacement (global, constant pool, external
    /// symbol, MC symbol, jump table, or block address) has been matched.
    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    /// True if a base (register or frame index) or an index register is set.
    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    /// Set the base to a register (also switches BaseType to RegBase).
    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug dump of all matched fields; fields without a value print "nul".
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
154}
155
156namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             Kind: "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      return SelectionDAGISel::runOnMachineFunction(mf&: MF);
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    // Addressing-mode matching: the match* routines incrementally fill in an
    // X86ISelAddressMode; the select* routines are the ComplexPattern entry
    // points that convert a matched mode into the five memory operands.
    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
                    SDValue &Index, SDValue &Disp, SDValue &Segment,
                    bool HasNDDM = true);
    bool selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
                       SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    /// Convert a fully-matched X86ISelAddressMode into the five operands
    /// (Base, Scale, Index, Disp, Segment) every X86 memory instruction uses,
    /// emitting a NEG of the index register first when the matcher recorded
    /// a negated index.
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(Reg: 0, VT); // Register 0 means "no base".

      Scale = getI8Imm(Imm: AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
#define GET_NDM_IF_ENABLED(OPC) \
  (Subtarget->hasNDD() && Subtarget->hasNDDM() ? OPC##_ND : OPC)
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc;
        switch (VT.SimpleTy) {
        default:
          llvm_unreachable("Unsupported VT!");
        case MVT::i64:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
          break;
        case MVT::i32:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
          break;
        case MVT::i16:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
          break;
        case MVT::i8:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
          break;
        }
        SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32,
                                                     Ops: AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(Reg: 0, VT); // Register 0 means "no index".

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(),
                                              VT: MVT::i32, offset: AM.Disp,
                                              TargetFlags: AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment,
                                             Offset: AM.Disp, TargetFlags: AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp,
                                             TargetFlags: AM.SymbolFlags);
      else
        Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); // No segment override.
    }

    // Utility function to determine whether it is AMX SDNode right after
    // lowering but before ISEL.
    bool isAMXSDNode(SDNode *N) const {
      // Check if N is AMX SDNode:
      // 1. check result type;
      // 2. check operand type;
      for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
        if (N->getValueType(ResNo: Idx) == MVT::x86amx)
          return true;
      }
      for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
        SDValue Op = N->getOperand(Num: Idx);
        if (Op.getValueType() == MVT::x86amx)
          return true;
      }
      return false;
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->users()) {
        // Two qualifying uses are enough to recommend hoisting; stop early.
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(Num: 1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above)
        // Those instruction won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(Val: N);
        if (C && isInt<8>(x: C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(Num: 0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(Num: 1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 Val: OtherOp->getOperand(Num: 1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64);
    }

    /// Compute the lane immediate for VEXTRACTF128/VEXTRACTI128-style nodes
    /// from the element-granular extract index in operand 1.
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 1);
      MVT VecVT = N->getOperand(Num: 0).getSimpleValueType();
      return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Compute the lane immediate for VINSERTF128/VINSERTI128-style nodes
    /// from the element-granular insert index in operand 2.
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 2);
      MVT VecVT = N->getSimpleValueType(ResNo: 0);
      return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Compute the VPERM2x128 immediate that realizes a commuted insertf128.
    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(Num: 2);
      MVT VecVT = N->getSimpleValueType(ResNo: 0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL);
    }

    /// Materialize (0 - 0 - carry) for an SBB/SETCC_CARRY node N: moves the
    /// incoming flags into EFLAGS and emits SBB of a zeroed register against
    /// itself, yielding all-ones or all-zeros depending on the carry.
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(ResNo: 0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
      SDValue Zero =
          SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
      if (VT == MVT::i64) {
        // Widen the 32-bit zero to 64 bits via SUBREG_TO_REG.
        Zero = SDValue(
            CurDAG->getMachineNode(
                Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, Op1: Zero,
                Op2: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
                               N: N->getOperand(Num: FlagOpIndex), Glue: SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opcode: Opc, dl, VTs,
                                 Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}),
          0);
    }

    // Helper to detect unneeded and instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = N->getConstantOperandAPInt(Num: 1);

      // Mask already keeps the low Width bits: the AND is a no-op.
      if (Val.countr_one() >= Width)
        return true;

      // Also a no-op if bits the mask clears are already known zero.
      APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
      return Mask.countr_one() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Return a condition code of the given SDNode
    X86::CondCode getCondFromNode(SDNode *N) const;

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      // Non-temporal instructions require natural alignment.
      if (N->getAlign().value() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false; // No scalar non-temporal load instruction.
      case 16:
        return Subtarget->hasSSE41();   // MOVNTDQA
      case 32:
        return Subtarget->hasAVX2();    // VMOVNTDQA ymm
      case 64:
        return Subtarget->hasAVX512();  // VMOVNTDQA zmm
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InGlue);

    bool tryOptimizeRem8Extend(SDNode *N);

    // Queries about how the EFLAGS result of a node is consumed.
    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
    bool checkTCRetEnoughRegs(SDNode *N) const;
  };
611
  /// Legacy PassManager wrapper that owns the actual X86DAGToDAGISel
  /// implementation.
  class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
  public:
    static char ID; // Pass identification.
    explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                   CodeGenOptLevel OptLevel)
        : SelectionDAGISelLegacy(
              ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {}
  };
620}
621
char X86DAGToDAGISelLegacy::ID = 0;

// Register the pass with the legacy pass manager.
INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
625
626// Returns true if this masked compare can be implemented legally with this
627// type.
628static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
629 unsigned Opcode = N->getOpcode();
630 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
631 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
632 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
633 // We can get 256-bit 8 element types here without VLX being enabled. When
634 // this happens we will use 512-bit operations and the mask will not be
635 // zero extended.
636 EVT OpVT = N->getOperand(Num: 0).getValueType();
637 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
638 // second operand.
639 if (Opcode == X86ISD::STRICT_CMPM)
640 OpVT = N->getOperand(Num: 1).getValueType();
641 if (OpVT.is256BitVector() || OpVT.is128BitVector())
642 return Subtarget->hasVLX();
643
644 return true;
645 }
646 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
647 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
648 Opcode == X86ISD::FSETCCM_SAE)
649 return true;
650
651 return false;
652}
653
654// Returns true if we can assume the writer of the mask has zero extended it
655// for us.
656bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
657 // If this is an AND, check if we have a compare on either side. As long as
658 // one side guarantees the mask is zero extended, the AND will preserve those
659 // zeros.
660 if (N->getOpcode() == ISD::AND)
661 return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) ||
662 isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget);
663
664 return isLegalMaskCompare(N, Subtarget);
665}
666
// Decide whether folding operand N (typically a load) into its user U is
// profitable while matching the pattern rooted at Root. Returning false
// keeps N as a separate instruction.
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  // Folding would duplicate work if N has other users.
  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::UADDO_CARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(Num: 1);

      // If the other operand is a 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(N: 8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(N: 32))
          return false;

        // If this really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB with can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(N: 8))
          return false;

        // Same trick for the flag-producing X86 nodes, but only when no user
        // of the flags depends on the carry (negation inverts the carry).
        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(N: 8) &&
            hasNoCarryFlagUses(Flags: SDValue(U, 1)))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(i: 0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(Num: 0).getOpcode() == ISD::SHL &&
            isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0)))
          return false;

        if (U->getOperand(Num: 1).getOpcode() == ISD::SHL &&
            isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        // Check both operands for the BTR pattern (rotl -2, n).
        SDValue U0 = U->getOperand(Num: 0);
        SDValue U1 = U->getOperand(Num: 1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(V: Root->getOperand(Num: 2)) &&
      (Root->getOperand(Num: 0).isUndef() ||
       ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode())))
    return false;

  return true;
}
816
817// Indicates it is profitable to form an AVX512 masked operation. Returning
818// false will favor a masked register-register masked move or vblendm and the
819// operation will be selected separately.
820bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
821 assert(
822 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
823 "Unexpected opcode!");
824
825 // If the operation has additional users, the operation will be duplicated.
826 // Check the use count to prevent that.
827 // FIXME: Are there cheap opcodes we might want to duplicate?
828 return N->getOperand(Num: 1).hasOneUse();
829}
830
831/// Replace the original chain operand of the call with
832/// load's chain operand and move load below the call's chain operand.
833static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
834 SDValue Call, SDValue OrigChain) {
835 SmallVector<SDValue, 8> Ops;
836 SDValue Chain = OrigChain.getOperand(i: 0);
837 if (Chain.getNode() == Load.getNode())
838 Ops.push_back(Elt: Load.getOperand(i: 0));
839 else {
840 assert(Chain.getOpcode() == ISD::TokenFactor &&
841 "Unexpected chain operand");
842 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
843 if (Chain.getOperand(i).getNode() == Load.getNode())
844 Ops.push_back(Elt: Load.getOperand(i: 0));
845 else
846 Ops.push_back(Elt: Chain.getOperand(i));
847 SDValue NewChain =
848 CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops);
849 Ops.clear();
850 Ops.push_back(Elt: NewChain);
851 }
852 Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
853 CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
854 CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
855 Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));
856
857 Ops.clear();
858 Ops.push_back(Elt: SDValue(Load.getNode(), 1));
859 Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
860 CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
861}
862
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  // Only simple, unindexed, non-extending loads qualify.
  auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // If the load's outgoing chain has more than one use, we can't (currently)
  // move the load since we'd most likely create a loop. TODO: Maybe it could
  // work if moveBelowOrigChain() updated *all* the chain users.
  if (!Callee.getValue(R: 1).hasOneUse())
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(i: 0);
  }

  // Walk up the chain from the call toward the load, making sure nothing
  // between them could be clobbered by moving the load.
  while (true) {
    if (!Chain.getNumOperands())
      return false;

    // It's not safe to move the callee (a load) across e.g. a store.
    // Conservatively abort if the chain contains a node other than the ones
    // below.
    switch (Chain.getNode()->getOpcode()) {
    case ISD::CALLSEQ_START:
    case ISD::CopyToReg:
    case ISD::LOAD:
      break;
    default:
      return false;
    }

    // Reached the load itself, or a single-use TokenFactor that merges the
    // load's chain: the move is safe.
    if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
      return true;
    if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
        Chain.getOperand(i: 0).getValue(R: 0).hasOneUse() &&
        Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
        Callee.getValue(R: 1).hasOneUse())
      return true;

    // Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
    if (Chain.getOperand(i: 0).getOpcode() == ISD::CopyToReg &&
        Chain.getOperand(i: 0).getValue(R: 0).hasOneUse()) {
      Chain = Chain.getOperand(i: 0);
      continue;
    }

    return false;
  }
}
929
/// Return true if the 64-bit immediate would embed an ENDBR64 byte sequence
/// (F3 [optional legacy prefixes] 0F 1E FA) when emitted verbatim,
/// e.g. 0xF30F1EFA, 0xF3660F1EFA, 0xF3670F1EFA.
static bool isEndbrImm64(uint64_t Imm) {
  // The low three bytes must be the 0F 1E FA opcode tail.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // Legacy prefix bytes that may legally appear between the mandatory 0xF3
  // prefix and the 0F 1E FA opcode bytes.
  const uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                         0x65, 0x66, 0x67, 0xf0, 0xf2};

  // Scan the remaining bytes upward: accept 0xF3 as a match, skip optional
  // prefixes, and reject anything else.
  for (int Shift = 24; Shift < 64; Shift += 8) {
    uint8_t Byte = (Imm >> Shift) & 0xFF;
    if (Byte == 0xF3)
      return true;
    bool IsOptionalPrefix = false;
    for (uint8_t Prefix : OptionalPrefixBytes) {
      if (Byte == Prefix) {
        IsOptionalPrefix = true;
        break;
      }
    }
    if (!IsOptionalPrefix)
      return false;
  }

  return false;
}
950
951static bool needBWI(MVT VT) {
952 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
953}
954
/// Rewrite the DAG before instruction selection proper. Each pass over the
/// node list either canonicalizes a node into a form matched by a single set
/// of isel patterns, hardens CET-sensitive constants, emulates missing-ISA
/// broadcasts, moves a call's callee load so it can be folded, or performs
/// "really late" FP-stack legalization of FP_ROUND/FP_EXTEND.
void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // And we want that attackers won't find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so the immediate never appears verbatim
    // in the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(ResNo: 0);
      int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getFunction().getParent()->getModuleFlag(
                Key: "cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          // Materialize ~Imm as an opaque (non-foldable) constant and invert
          // it back with a NOT, so the ENDBR byte pattern is never emitted.
          SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true);
          Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT);
          // Back the iterator up over N so RAUW can't invalidate it.
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) {
      SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                                    N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }

    // Convert vector increment or decrement to sub/add with an all-ones
    // constant:
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
    //
    // But don't do this if it would inhibit a potentially profitable load
    // folding opportunity for the other operand. That only occurs with the
    // intersection of:
    // (1) The other operand (op0) is load foldable.
    // (2) The op is an add (otherwise, we are *creating* an add and can still
    //     load fold the other op).
    // (3) The target has AVX (otherwise, we have a destructive add and can't
    //     load fold the other op without killing the constant op).
    // (4) The constant 1 vector has multiple uses (so it is profitable to load
    //     into a register anyway).
    auto mayPreventLoadFold = [&]() {
      return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) &&
             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
             !N->getOperand(Num: 1).hasOneUse();
    };
    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
        N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) {
      APInt SplatVal;
      if (!ISD::isBuildVectorOfConstantSDNodes(
              N: peekThroughBitcasts(V: N->getOperand(Num: 0)).getNode()) &&
          X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) &&
          SplatVal.isOne()) {
        SDLoc DL(N);

        // Build the all-ones constant as v(N)i32 so it matches the pcmpeqd
        // idiom, then bitcast to the original vector type.
        MVT VT = N->getSimpleValueType(ResNo: 0);
        unsigned NumElts = VT.getSizeInBits() / 32;
        SDValue AllOnes =
            CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts));
        AllOnes = CurDAG->getBitcast(VT, V: AllOnes);

        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
        SDValue Res =
            CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes);
        --I;
        CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }
    }

    // Opcode-specific canonicalizations. Each case either rewrites N and
    // continues, or breaks out to the call/FP-stack handling below.
    switch (N->getOpcode()) {
    case X86ISD::VBROADCAST: {
      MVT VT = N->getSimpleValueType(ResNo: 0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        // Broadcast at half width, then insert the half-width result into
        // both halves of the full-width vector.
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        SDLoc dl(N);
        SDValue NarrowBCast =
            CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0));
        SDValue Res =
            CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
                            N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
                              N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));

        --I;
        CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case X86ISD::VBROADCAST_LOAD: {
      MVT VT = N->getSimpleValueType(ResNo: 0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        // Same trick as VBROADCAST above, but the narrow broadcast is a
        // memory node: keep its chain/MMO and replace both results.
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        auto *MemNode = cast<MemSDNode>(Val: N);
        SDLoc dl(N);
        SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other);
        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
            Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(),
            MMO: MemNode->getMemOperand());
        SDValue Res =
            CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT),
                            N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast,
                              N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl));

        --I;
        SDValue To[] = {Res, NarrowBCast.getValue(R: 1)};
        CurDAG->ReplaceAllUsesWith(From: N, To);
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case ISD::LOAD: {
      // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
      // load, then just extract the lower subvector and avoid the second load.
      auto *Ld = cast<LoadSDNode>(Val: N);
      MVT VT = N->getSimpleValueType(ResNo: 0);
      if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() ||
          !(VT.is128BitVector() || VT.is256BitVector()))
        break;

      // Find the widest load from the same pointer/chain that covers this one.
      MVT MaxVT = VT;
      SDNode *MaxLd = nullptr;
      SDValue Ptr = Ld->getBasePtr();
      SDValue Chain = Ld->getChain();
      for (SDNode *User : Ptr->users()) {
        auto *UserLd = dyn_cast<LoadSDNode>(Val: User);
        MVT UserVT = User->getSimpleValueType(ResNo: 0);
        if (User != N && UserLd && ISD::isNormalLoad(N: User) &&
            UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
            !User->hasAnyUseOfValue(Value: 1) &&
            (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
            UserVT.getSizeInBits() > VT.getSizeInBits() &&
            (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
          MaxLd = User;
          MaxVT = UserVT;
        }
      }
      if (MaxLd) {
        SDLoc dl(N);
        // Extract the low subvector of the wide load (in the wide load's
        // element type) and bitcast back to this load's type.
        unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
        MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts);
        SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
                                          N1: SDValue(MaxLd, 0),
                                          N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
        SDValue Res = CurDAG->getBitcast(VT, V: Extract);

        --I;
        SDValue To[] = {Res, SDValue(MaxLd, 1)};
        CurDAG->ReplaceAllUsesWith(From: N, To);
        ++I;
        MadeChange = true;
        continue;
      }
      break;
    }
    case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
      EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType();
      if (EleVT == MVT::i1)
        break;

      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
             "We can't replace VSELECT with BLENDV in vXi16!");
      SDValue R;
      // If every condition lane is known all-ones or all-zeros, a ternlog
      // (imm 0xCA selects op1/op2 per condition bit) avoids BLENDV's fixed
      // register constraint.
      if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) ==
                                     EleVT.getSizeInBits()) {
        R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                            N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2),
                            N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8));
      } else {
        R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                            N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1),
                            N3: N->getOperand(Num: 2));
      }
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FP_ROUND:
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(ResNo: 0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
      }
      SDValue Res;
      // Strict ops carry a chain operand/result that must be preserved.
      if (N->isStrictFPOpcode())
        Res =
            CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
                            Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
      else
        Res =
            CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                            Operand: N->getOperand(Num: 0));
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(ResNo: 0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                                    N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(ResNo: 0).isVector())
        break;

      unsigned NewOpc;
      if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) {
        assert(N->getOpcode() == ISD::ANY_EXTEND &&
               "Unexpected opcode for mask vector!");
        NewOpc = ISD::SIGN_EXTEND;
      } else {
        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                     ? ISD::ZERO_EXTEND
                     : ISD::ZERO_EXTEND_VECTOR_INREG;
      }

      SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
                                    Operand: N->getOperand(Num: 0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FCEIL:
    case ISD::STRICT_FCEIL:
    case ISD::FFLOOR:
    case ISD::STRICT_FFLOOR:
    case ISD::FTRUNC:
    case ISD::STRICT_FTRUNC:
    case ISD::FROUNDEVEN:
    case ISD::STRICT_FROUNDEVEN:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::FRINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns. The immediate is the VRNDSCALE control word
      // selecting the rounding mode (and exception suppression).
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::STRICT_FROUNDEVEN:
      case ISD::FROUNDEVEN: Imm = 0x8; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      bool IsStrict = N->isStrictFPOpcode();
      SDValue Res;
      if (IsStrict)
        Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl,
                              ResultTys: {N->getValueType(ResNo: 0), MVT::Other},
                              Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
                               CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)});
      else
        Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0),
                              N1: N->getOperand(Num: 0),
                              N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32));
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(ResNo: 0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64   ? MVT::v2f64
                  : VT == MVT::f32 ? MVT::v4f32
                                   : MVT::v8f16;

      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
                                    Operand: N->getOperand(Num: 0));
      SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT,
                                    Operand: N->getOperand(Num: 1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        // With SSE2, do the logic in the integer domain via bitcasts.
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0);
        Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1);
        Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res);
      } else {
        Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1);
      }
      Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res,
                            N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res);
      ++I;
      MadeChange = true;
      continue;
    }
    }

    if (OptLevel != CodeGenOptLevel::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(Num: 0);
      SDValue Load  = N->getOperand(Num: 1);
      if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq))
        continue;
      if (N->getOpcode() == X86ISD::TC_RETURN && !checkTCRetEnoughRegs(N))
        continue;
      moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain);
      ++NumLoadMoved;
      MadeChange = true;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(ResNo: 0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(Num: 1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
      int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(
          Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT);
      SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store,
                                          Ptr: MemTmp, PtrInfo: MPI, MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result);
      break;
    }

    //The sequence of events for lowering STRICT_FP versions of these nodes requires
    //dealing with the chain differently, as there is already a preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(ResNo: 0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(Num: 2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT);
      int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      //Since the operation is StrictFP, use the preexisting chain.
      SDValue Store, Result;
      if (!SrcIsSSE) {
        // FP-stack source: store via X86ISD::FST, propagating NoFPExcept.
        SDVTList VTs = CurDAG->getVTList(VT: MVT::Other);
        SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp};
        Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT,
                                            PtrInfo: MPI, /*Align*/ Alignment: std::nullopt,
                                            Flags: MachineMemOperand::MOStore);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Store->getFlags();
          Flags.setNoFPExcept(true);
          Store->setFlags(Flags);
        }
      } else {
        assert(SrcVT == MemVT && "Unexpected VT!");
        Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp,
                                 PtrInfo: MPI);
      }

      if (!DstIsSSE) {
        // FP-stack destination: reload via X86ISD::FLD, propagating NoFPExcept.
        SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other);
        SDValue Ops[] = {Store, MemTmp};
        Result = CurDAG->getMemIntrinsicNode(
            Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI,
            /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Result->getFlags();
          Flags.setNoFPExcept(true);
          Result->setFlags(Flags);
        }
      } else {
        assert(DstVT == MemVT && "Unexpected VT!");
        Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI);
      }

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode());
      break;
    }
    }


    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    MadeChange = true;
  }

  // Remove any dead nodes that may have been left behind.
  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}
1566
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
// If this extend's input is an EXTRACT_SUBREG of the low 8 bits of a
// matching *_NOREX extend, the outer extend is redundant and can reuse (or
// re-extend) the inner one. Returns true and rewires uses on success.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(Num: 0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(i: 1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  // Note: MOVSX64rr8 also matches against MOVSX32rr8_NOREX here.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(i: 0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N),
                                                   VT: MVT::i64, Op1: N00);
    ReplaceUses(F: N, T: Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(F: N, T: N00.getNode());
  }

  return true;
}
1602
1603void X86DAGToDAGISel::PostprocessISelDAG() {
1604 // Skip peepholes at -O0.
1605 if (TM.getOptLevel() == CodeGenOptLevel::None)
1606 return;
1607
1608 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1609
1610 bool MadeChange = false;
1611 while (Position != CurDAG->allnodes_begin()) {
1612 SDNode *N = &*--Position;
1613 // Skip dead nodes and any non-machine opcodes.
1614 if (N->use_empty() || !N->isMachineOpcode())
1615 continue;
1616
1617 if (tryOptimizeRem8Extend(N)) {
1618 MadeChange = true;
1619 continue;
1620 }
1621
1622 unsigned Opc = N->getMachineOpcode();
1623 switch (Opc) {
1624 default:
1625 continue;
1626 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
1627 case X86::TEST8rr:
1628 case X86::TEST16rr:
1629 case X86::TEST32rr:
1630 case X86::TEST64rr:
1631 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1632 case X86::CTEST8rr:
1633 case X86::CTEST16rr:
1634 case X86::CTEST32rr:
1635 case X86::CTEST64rr: {
1636 auto &Op0 = N->getOperand(Num: 0);
1637 if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) ||
1638 !Op0.isMachineOpcode())
1639 continue;
1640 SDValue And = N->getOperand(Num: 0);
1641#define CASE_ND(OP) \
1642 case X86::OP: \
1643 case X86::OP##_ND:
1644 switch (And.getMachineOpcode()) {
1645 default:
1646 continue;
1647 CASE_ND(AND8rr)
1648 CASE_ND(AND16rr)
1649 CASE_ND(AND32rr)
1650 CASE_ND(AND64rr) {
1651 if (And->hasAnyUseOfValue(Value: 1))
1652 continue;
1653 SmallVector<SDValue> Ops(N->op_values());
1654 Ops[0] = And.getOperand(i: 0);
1655 Ops[1] = And.getOperand(i: 1);
1656 MachineSDNode *Test =
1657 CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops);
1658 ReplaceUses(F: N, T: Test);
1659 MadeChange = true;
1660 continue;
1661 }
1662 CASE_ND(AND8rm)
1663 CASE_ND(AND16rm)
1664 CASE_ND(AND32rm)
1665 CASE_ND(AND64rm) {
1666 if (And->hasAnyUseOfValue(Value: 1))
1667 continue;
1668 unsigned NewOpc;
1669 bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc);
1670#define FROM_TO(A, B) \
1671 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1672 break;
1673 switch (And.getMachineOpcode()) {
1674 FROM_TO(AND8rm, TEST8mr);
1675 FROM_TO(AND16rm, TEST16mr);
1676 FROM_TO(AND32rm, TEST32mr);
1677 FROM_TO(AND64rm, TEST64mr);
1678 }
1679#undef FROM_TO
1680#undef CASE_ND
1681 // Need to swap the memory and register operand.
1682 SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2),
1683 And.getOperand(i: 3), And.getOperand(i: 4),
1684 And.getOperand(i: 5), And.getOperand(i: 0)};
1685 // CC, Cflags.
1686 if (IsCTESTCC) {
1687 Ops.push_back(Elt: N->getOperand(Num: 2));
1688 Ops.push_back(Elt: N->getOperand(Num: 3));
1689 }
1690 // Chain of memory load
1691 Ops.push_back(Elt: And.getOperand(i: 6));
1692 // Glue
1693 if (IsCTESTCC)
1694 Ops.push_back(Elt: N->getOperand(Num: 4));
1695
1696 MachineSDNode *Test = CurDAG->getMachineNode(
1697 Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops);
1698 CurDAG->setNodeMemRefs(
1699 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
1700 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1701 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1702 MadeChange = true;
1703 continue;
1704 }
1705 }
1706 }
1707 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1708 // used. We're doing this late so we can prefer to fold the AND into masked
1709 // comparisons. Doing that can be better for the live range of the mask
1710 // register.
1711 case X86::KORTESTBkk:
1712 case X86::KORTESTWkk:
1713 case X86::KORTESTDkk:
1714 case X86::KORTESTQkk: {
1715 SDValue Op0 = N->getOperand(Num: 0);
1716 if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) ||
1717 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0)))
1718 continue;
1719#define CASE(A) \
1720 case X86::A: \
1721 break;
1722 switch (Op0.getMachineOpcode()) {
1723 default:
1724 continue;
1725 CASE(KANDBkk)
1726 CASE(KANDWkk)
1727 CASE(KANDDkk)
1728 CASE(KANDQkk)
1729 }
1730 unsigned NewOpc;
1731#define FROM_TO(A, B) \
1732 case X86::A: \
1733 NewOpc = X86::B; \
1734 break;
1735 switch (Opc) {
1736 FROM_TO(KORTESTBkk, KTESTBkk)
1737 FROM_TO(KORTESTWkk, KTESTWkk)
1738 FROM_TO(KORTESTDkk, KTESTDkk)
1739 FROM_TO(KORTESTQkk, KTESTQkk)
1740 }
1741 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1742 // KAND instructions and KTEST use the same ISA feature.
1743 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1744 continue;
1745#undef FROM_TO
1746 MachineSDNode *KTest = CurDAG->getMachineNode(
1747 Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1));
1748 ReplaceUses(F: N, T: KTest);
1749 MadeChange = true;
1750 continue;
1751 }
1752 // Attempt to remove vectors moves that were inserted to zero upper bits.
1753 case TargetOpcode::SUBREG_TO_REG: {
1754 unsigned SubRegIdx = N->getConstantOperandVal(Num: 1);
1755 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1756 continue;
1757
1758 SDValue Move = N->getOperand(Num: 0);
1759 if (!Move.isMachineOpcode())
1760 continue;
1761
1762 // Make sure its one of the move opcodes we recognize.
1763 switch (Move.getMachineOpcode()) {
1764 default:
1765 continue;
1766 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1767 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1768 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1769 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1770 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1771 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1772 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1773 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1774 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1775 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1776 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1777 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1778 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1779 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1780 }
1781#undef CASE
1782
1783 SDValue In = Move.getOperand(i: 0);
1784 if (!In.isMachineOpcode() ||
1785 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1786 continue;
1787
1788 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1789 // the SHA instructions which use a legacy encoding.
1790 uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags;
1791 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1792 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1793 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1794 continue;
1795
1796 // Producing instruction is another vector instruction. We can drop the
1797 // move.
1798 CurDAG->UpdateNodeOperands(N, Op1: In, Op2: N->getOperand(Num: 1));
1799 MadeChange = true;
1800 }
1801 }
1802 }
1803
1804 if (MadeChange)
1805 CurDAG->RemoveDeadNodes();
1806}
1807
1808
1809/// Emit any code that needs to be executed only in the main function.
1810void X86DAGToDAGISel::emitSpecialCodeForMain() {
1811 if (Subtarget->isTargetCygMing()) {
1812 TargetLowering::ArgListTy Args;
1813 auto &DL = CurDAG->getDataLayout();
1814
1815 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1816 CLI.setChain(CurDAG->getRoot())
1817 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1818 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1819 ArgsList: std::move(Args));
1820 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1821 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1822 CurDAG->setRoot(Result.second);
1823 }
1824}
1825
1826void X86DAGToDAGISel::emitFunctionEntryCode() {
1827 // If this is main, emit special code for main.
1828 const Function &F = MF->getFunction();
1829 if (F.hasExternalLinkage() && F.getName() == "main")
1830 emitSpecialCodeForMain();
1831}
1832
/// Return true if \p Val is safe to use as an explicit displacement when the
/// address also has a frame index or register base.
///
/// A frame index or register base can itself contribute a displacement that
/// is added to the explicit one, and the sum must still fit the 32-bit
/// displacement field. Restricting the explicit displacement to a signed
/// 31-bit value (i.e. [-2^30, 2^30 - 1]) — only slightly stronger than the
/// fundamental 32-bit assumption — keeps the combined value safe.
static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
  constexpr int64_t DispLo = -(int64_t(1) << 30);
  constexpr int64_t DispHi = (int64_t(1) << 30) - 1;
  return DispLo <= Val && Val <= DispHi;
}
1842
/// Try to add the integer displacement \p Offset to the addressing mode
/// \p AM. Returns true if the fold is not possible; in that case AM.Disp is
/// left untouched (it is only written on the success path). Returns false
/// and commits the combined displacement on success.
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  // Tentative combined displacement; validated below before being committed.
  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
                                           hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndexOrRegBase(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only the low 2GB of the address space
    // is directly addressable; we need indirect addressing for the high 2GB
    // of address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
    if (Subtarget->isTarget64BitILP32() &&
        !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  } else if (Subtarget->is16Bit()) {
    // In 16-bit mode, displacements are limited to [-65535,65535] for FK_Data_2
    // fixups of unknown signedness. See X86AsmBackend::applyFixup.
    if (Val < -(int64_t)UINT16_MAX || Val > (int64_t)UINT16_MAX)
      return true;
  } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
    // For 32-bit X86, make sure the displacement still isn't close to the
    // expressible limit.
    return true;
  // All checks passed: commit the folded displacement.
  AM.Disp = Val;
  return false;
}
1898
/// Try to fold a load from a constant-zero address in the FS/GS address
/// spaces into \p AM's segment register (the gs:0 / fs:0 TLS base-pointer
/// pattern). Returns false on success, true if no fold was made. When
/// \p AllowSegmentRegForX32 is false the fold is rejected in 64-bit ILP32
/// (x32) mode; see the comment below for why.
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(Num: 1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added to the base address, which gives
  // unwanted results when the register holds a negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
       Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
      return false;
    // Address space X86AS::SS is not handled here, because it is not used to
    // address TLS areas.
    }
  }

  return true;
}
1932
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(i: 0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use such a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  // Extract the symbol and (where the node kind carries one) its offset.
  // External symbols and MCSymbols never contribute an extra offset here.
  int64_t Offset = 0;
  SDValue N0 = N.getOperand(i: 0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(GV: AM.GV)) {
    AM = Backup;
    return true;
  }

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
2010
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
/// Returns false on success; \p AM then describes the matched address.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, Depth: 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
      AM.Base_Reg = SDValue();
      // If the segment-register fold failed, restore the saved base register.
      if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    // However, when GV is a local function symbol and in the same section as
    // the current instruction, and AM.Disp is negative and near INT32_MIN,
    // referencing GV+Disp generates a relocation referencing the section symbol
    // with an even smaller offset, which might underflow. We should bail out if
    // the negative offset is too close to INT32_MIN. Actually, we are more
    // conservative here, using a smaller magic number also used by
    // isOffsetSuitableForCodeModel.
    if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024)
      return true;

    AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64);
  }

  return false;
}
2063
/// Try to fold an ISD::ADD node \p N into the addressing mode \p AM.
/// First attempts to match both operands (in both orders), then falls back
/// to using the two operands directly as base and index registers. Returns
/// false on success. \p N is refreshed from the handle before returning in
/// case it was CSE'd to a different node during matching.
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
    return false;
  // Partial match failed: roll back to the saved state before retrying.
  AM = Backup;

  // Try again after commutating the operands.
  if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                               Depth: Depth + 1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(i: 0);
    AM.IndexReg = N.getOperand(i: 1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}
2098
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  // Only reposition when N has no ID yet (-1) or currently sits after Pos in
  // the topological order; otherwise it is already in an acceptable position.
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
    DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N: N.getNode());
  }
}
2117
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse())
    return true;

  // The shift must be srl by (8 - C1) with the mask exactly 0xff << C1 and
  // C1 in [1,3], so the remaining shl by C1 can be absorbed into the scale.
  int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8);
  SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
  SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
  SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
  SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
  SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8);
  SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: Eight);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: Srl);
  insertDAGNode(DAG, Pos: N, N: And);
  insertDAGNode(DAG, Pos: N, N: Ext);
  insertDAGNode(DAG, Pos: N, N: ShlCount);
  insertDAGNode(DAG, Pos: N, N: Shl);
  DAG.ReplaceAllUsesWith(From: N, To: Shl);
  DAG.RemoveDeadNode(N: N.getNode());
  // The trailing shl is represented by the addressing-mode scale.
  AM.IndexReg = Ext;
  AM.Scale = (1 << ScaleLog);
  return false;
}
2165
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(i: 0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(x: Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  SDValue X = Shift.getOperand(i: 0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    // Re-introduce the any_extend we looked through, now applied to X.
    SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
  SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewShift);
  DAG.ReplaceAllUsesWith(From: N, To: NewShift);
  DAG.RemoveDeadNode(N: N.getNode());

  // The shl is absorbed into the addressing-mode scale.
  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
2233
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $11, %ecx
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $9, %ecx
//   andl $124, %rcx
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial. Returns false if the simplification is performed.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;
  // Leading zero count of the 64-bit mask.
  unsigned MaskLZ = 64 - (MaskIdx + MaskLen);

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(i: 0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
      APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
  if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  MVT XVT = X.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  // The final shl is represented by the addressing-mode scale.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2349
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
// \p N is the AND node being matched; its value type is used for the rebuilt
// expression.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  // The trailing shl is represented by the addressing-mode scale.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2413
// Attempt to peek further into a scaled index register, collecting additional
// extensions / offsets / etc. Returns \p N if we can't peek any further.
2416SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2417 X86ISelAddressMode &AM,
2418 unsigned Depth) {
2419 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2420 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2421 "Illegal index scale");
2422
2423 // Limit recursion.
2424 if (Depth >= SelectionDAG::MaxRecursionDepth)
2425 return N;
2426
2427 EVT VT = N.getValueType();
2428 unsigned Opc = N.getOpcode();
2429
2430 // index: add(x,c) -> index: x, disp + c
2431 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2432 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2433 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2434 if (!foldOffsetIntoAddress(Offset, AM))
2435 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2436 }
2437
2438 // index: add(x,x) -> index: x, scale * 2
2439 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2440 if (AM.Scale <= 4) {
2441 AM.Scale *= 2;
2442 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2443 }
2444 }
2445
2446 // index: shl(x,i) -> index: x, scale * (1 << i)
2447 if (Opc == X86ISD::VSHLI) {
2448 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2449 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2450 if ((AM.Scale * ScaleAmt) <= 8) {
2451 AM.Scale *= ScaleAmt;
2452 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2453 }
2454 }
2455
2456 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2457 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2458 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2459 SDValue Src = N.getOperand(i: 0);
2460 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2461 Src.hasOneUse()) {
2462 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2463 SDValue AddSrc = Src.getOperand(i: 0);
2464 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2465 int64_t Offset = AddVal->getSExtValue();
2466 if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) {
2467 SDLoc DL(N);
2468 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2469 SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT);
2470 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2471 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2472 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2473 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2474 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2475 CurDAG->RemoveDeadNode(N: N.getNode());
2476 return ExtSrc;
2477 }
2478 }
2479 }
2480 }
2481
2482 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2483 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2484 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2485 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2486 SDValue Src = N.getOperand(i: 0);
2487 unsigned SrcOpc = Src.getOpcode();
2488 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2489 CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) &&
2490 Src.hasOneUse()) {
2491 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2492 SDValue AddSrc = Src.getOperand(i: 0);
2493 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2494 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2495 SDLoc DL(N);
2496 SDValue Res;
2497 // If we're also scaling, see if we can use that as well.
2498 if (AddSrc.getOpcode() == ISD::SHL &&
2499 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2500 SDValue ShVal = AddSrc.getOperand(i: 0);
2501 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2502 APInt HiBits =
2503 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2504 uint64_t ScaleAmt = 1ULL << ShAmt;
2505 if ((AM.Scale * ScaleAmt) <= 8 &&
2506 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2507 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2508 AM.Scale *= ScaleAmt;
2509 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2510 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2511 N2: AddSrc.getOperand(i: 1));
2512 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2513 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2514 AddSrc = ExtShift;
2515 Res = ExtShVal;
2516 }
2517 }
2518 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2519 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2520 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2521 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2522 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2523 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2524 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2525 CurDAG->RemoveDeadNode(N: N.getNode());
2526 return Res ? Res : ExtSrc;
2527 }
2528 }
2529 }
2530 }
2531
2532 // TODO: Handle extensions, shifted masks etc.
2533 return N;
2534}
2535
/// Recursively decompose the expression rooted at N and fold as much of it as
/// possible into the addressing mode AM (base + scale*index + disp + segment).
/// Returns false if N was absorbed into AM, true if matching failed; on
/// failure callers are responsible for restoring AM from a backup.
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it.  Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements.  It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    // Only a constant displacement can still be merged into a RIP-relative AM.
    if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
      if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    // A frame-recovered label: usable only if no symbol/disp is present yet.
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    // A frame index can serve as the base if the base slot is still free and
    // the displacement stays encodable (64-bit only restriction).
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // Shift amounts 1/2/3 map onto SIB scale factors 2/4/8.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        SDValue ShVal = N.getOperand(i: 0);
        AM.Scale = 1 << Val;
        AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(i: 0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(i: 0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
        !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    [[fallthrough]];
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8], i.e. base = X and index = X with scale 2/4/8.
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(i: 0);
          SDValue Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
            Reg = MulVal.getOperand(i: 0);
            auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Offset: Disp, AM))
              Reg = N.getOperand(i: 0);
          } else {
            Reg = N.getOperand(i: 0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    int Cost = 0;
    SDValue RHS = N.getOperand(i: 1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(i: 0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::OR:
  case ISD::XOR:
    // See if we can treat the OR/XOR node as an ADD node.
    if (!CurDAG->isADDLike(Op: N))
      break;
    [[fallthrough]];
  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
      break;

    if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(i: 0);
      SDValue X = Shift.getOperand(i: 0);

      uint64_t Mask = N.getConstantOperandVal(i: 1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
      return false;

    break;
  }
  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    SDValue Src = N.getOperand(i: 0);

    // See if we can match a zext(addlike(x,c)).
    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
      if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
        if (Index != N) {
          AM.IndexReg = Index;
          return false;
        }

    // Peek through mask: zext(and(shl(x,c1),c2))
    APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
    if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
      if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
        Mask = MaskC->getAPIntValue();
        Src = Src.getOperand(i: 0);
      }

    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
      // Give up if the shift is not a valid scale factor [1,2,3].
      SDValue ShlSrc = Src.getOperand(i: 0);
      SDValue ShlAmt = Src.getOperand(i: 1);
      auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
      if (!ShAmtC)
        break;
      unsigned ShAmtV = ShAmtC->getZExtValue();
      if (ShAmtV > 3)
        break;

      // The narrow shift must only shift out zero bits (it must be 'nuw').
      // That makes it safe to widen to the destination type.
      APInt HighZeros =
          APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
      if (!Src->getFlags().hasNoUnsignedWrap() &&
          !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
        break;

      // zext (shl nuw i8 %x, C1) to i32
      // --> shl (zext i8 %x to i32), (zext C1)
      // zext (and (shl nuw i8 %x, C1), C2) to i32
      // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
      MVT SrcVT = ShlSrc.getSimpleValueType();
      MVT VT = N.getSimpleValueType();
      SDLoc DL(N);

      SDValue Res = ShlSrc;
      if (!Mask.isAllOnes()) {
        Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
        Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
      }
      SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
      SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
      CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
      CurDAG->RemoveDeadNode(N: N.getNode());

      // Convert the shift to scale factor.
      AM.Scale = 1 << ShAmtV;
      // If matchIndexRecursively is not called here,
      // Zext may be replaced by other nodes but later used to call a builder
      // method
      AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
      return false;
    }

    if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                     X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                   X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                  X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
        return false;
    }

    break;
  }
  }

  return matchAddressBase(N, AM);
}
2902
2903/// Helper for MatchAddress. Add the specified node to the
2904/// specified addressing mode without any further recursion.
2905bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2906 // Is the base register already occupied?
2907 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2908 // If so, check to see if the scale index register is set.
2909 if (!AM.IndexReg.getNode()) {
2910 AM.IndexReg = N;
2911 AM.Scale = 1;
2912 return false;
2913 }
2914
2915 // Otherwise, we cannot select it.
2916 return true;
2917 }
2918
2919 // Default, generate it as a register.
2920 AM.BaseType = X86ISelAddressMode::RegBase;
2921 AM.Base_Reg = N;
2922 return false;
2923}
2924
/// Recursive worker for matchVectorAddress: fold the scalar base-pointer
/// expression N of a gather/scatter into AM. Only constants, wrapped symbols
/// and ADDs are handled. Returns false on success, true on failure.
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
                                                    X86ISelAddressMode &AM,
                                                    unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchVectorAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  case ISD::ADD: {
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Try folding both operands; AM is restored from Backup whenever a
    // combination fails so each attempt starts from a clean slate.
    X86ISelAddressMode Backup = AM;
    if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Try again after commuting the operands.
    if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Reload N through the handle in case it was CSE'd during matching.
    N = Handle.getValue();
    break;
  }
  }

  return matchAddressBase(N, AM);
}
2975
/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled. Returns false on success, true on failure.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  return matchVectorAddressRecursively(N, AM, Depth: 0);
}
2982
/// Build the five address operands for a gather/scatter node. IndexOp and
/// ScaleOp come from the intrinsic; BasePtr is matched into base + disp.
/// Returns true on success, false if the base could not be matched.
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
                                       SDValue IndexOp, SDValue ScaleOp,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  X86ISelAddressMode AM;
  AM.Scale = ScaleOp->getAsZExtVal();

  // Attempt to match index patterns, as long as we're not relying on implicit
  // sign-extension, which is performed BEFORE scale.
  if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
    AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
  else
    AM.IndexReg = IndexOp;

  // Map the x86 segment address spaces onto explicit segment registers.
  unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
  if (AddrSpace == X86AS::GS)
    AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
  if (AddrSpace == X86AS::FS)
    AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
  if (AddrSpace == X86AS::SS)
    AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);

  SDLoc DL(BasePtr);
  MVT VT = BasePtr.getSimpleValueType();

  // Try to match into the base and displacement fields.
  // (matchVectorAddress returns true on FAILURE, hence the inverted check.)
  if (matchVectorAddress(N: BasePtr, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3016
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
///
/// HasNDDM gates the extra address restriction applied for NDD memory
/// operands; see selectNDDAddr.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index, SDValue &Disp,
                                 SDValue &Segment, bool HasNDDM) {
  X86ISelAddressMode AM;

  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    unsigned AddrSpace =
        cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
    // Map the x86 segment address spaces onto explicit segment registers.
    if (AddrSpace == X86AS::GS)
      AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16);
    if (AddrSpace == X86AS::FS)
      AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16);
    if (AddrSpace == X86AS::SS)
      AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16);
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  if (matchAddress(N, AM))
    return false;

  // NOTE(review): when HasNDDM is false this rejects every address that is
  // NOT RIP-relative, i.e. only RIP-relative addresses survive. If the intent
  // is instead to forbid RIP-relative folding without the NDDM feature, the
  // polarity of this test is inverted -- confirm against the NDD/APX spec.
  if (!HasNDDM && !AM.isRIPRelative())
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3062
/// Address selection for NDD (new data destination) memory operands: same as
/// selectAddr, but HasNDDM reflects whether the subtarget supports NDDM.
bool X86DAGToDAGISel::selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                    SDValue &Scale, SDValue &Index,
                                    SDValue &Disp, SDValue &Segment) {
  return selectAddr(Parent, N, Base, Scale, Index, Disp, Segment,
                    HasNDDM: Subtarget->hasNDDM());
}
3069
3070bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3071 // Cannot use 32 bit constants to reference objects in kernel/large code
3072 // model.
3073 if (TM.getCodeModel() == CodeModel::Kernel ||
3074 TM.getCodeModel() == CodeModel::Large)
3075 return false;
3076
3077 // In static codegen with small code model, we can get the address of a label
3078 // into a register with 'movl'
3079 if (N->getOpcode() != X86ISD::Wrapper)
3080 return false;
3081
3082 N = N.getOperand(i: 0);
3083
3084 // At least GNU as does not accept 'movl' for TPOFF relocations.
3085 // FIXME: We could use 'movl' when we know we are targeting MC.
3086 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3087 return false;
3088
3089 Imm = N;
3090 // Small/medium code model can reference non-TargetGlobalAddress objects with
3091 // 32 bit constants.
3092 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3093 return TM.getCodeModel() == CodeModel::Small ||
3094 TM.getCodeModel() == CodeModel::Medium;
3095 }
3096
3097 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
3098 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3099 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
3100
3101 return !TM.isLargeGlobalValue(GV);
3102}
3103
/// Variant of selectLEAAddr for 64-bit LEA: any narrower (8/16/32-bit) base
/// or index operand is widened to i64 by inserting it into an IMPLICIT_DEF
/// through the matching subregister index.
bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  // Pick the subregister index matching the matched operand width.
  EVT BaseType = Base.getValueType();
  unsigned SubReg;
  if (BaseType == MVT::i8)
    SubReg = X86::sub_8bit;
  else if (BaseType == MVT::i16)
    SubReg = X86::sub_16bit;
  else
    SubReg = X86::sub_32bit;

  auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
  if (RN && RN->getReg() == 0)
    // No base register: rewrite as the i64 "no register" operand.
    Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
  else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
            BaseType == MVT::i32) &&
           !isa<FrameIndexSDNode>(Val: Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
                                                     VT: MVT::i64), 0);
    Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base);
  }

  [[maybe_unused]] EVT IndexType = Index.getValueType();
  RN = dyn_cast<RegisterSDNode>(Val&: Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64);
  else {
    // A real index must have the same narrow width as the base so that the
    // SubReg index chosen above also applies here.
    assert((IndexType == BaseType) &&
           "Expect to be extending 8/16/32-bit registers for use in LEA");
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL,
                                                     VT: MVT::i64), 0);
    Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index);
  }

  return true;
}
3148
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
/// Uses a small "Complexity" score; only scores > 2 are worth an LEA.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert (T == AM.Segment);
  AM.Segment = Copy;

  // Score each populated address component; frame-index bases are always
  // worth an LEA.
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
      Complexity++;
  }

  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3238
3239/// This is only run on TargetGlobalTLSAddress nodes.
3240bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3241 SDValue &Scale, SDValue &Index,
3242 SDValue &Disp, SDValue &Segment) {
3243 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3244 N.getOpcode() == ISD::TargetExternalSymbol);
3245
3246 X86ISelAddressMode AM;
3247 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3248 AM.GV = GA->getGlobal();
3249 AM.Disp += GA->getOffset();
3250 AM.SymbolFlags = GA->getTargetFlags();
3251 } else {
3252 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3253 AM.ES = SA->getSymbol();
3254 AM.SymbolFlags = SA->getTargetFlags();
3255 }
3256
3257 if (Subtarget->is32Bit()) {
3258 AM.Scale = 1;
3259 AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32);
3260 }
3261
3262 MVT VT = N.getSimpleValueType();
3263 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3264 return true;
3265}
3266
3267bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3268 // Keep track of the original value type and whether this value was
3269 // truncated. If we see a truncation from pointer type to VT that truncates
3270 // bits that are known to be zero, we can use a narrow reference.
3271 EVT VT = N.getValueType();
3272 bool WasTruncated = false;
3273 if (N.getOpcode() == ISD::TRUNCATE) {
3274 WasTruncated = true;
3275 N = N.getOperand(i: 0);
3276 }
3277
3278 if (N.getOpcode() != X86ISD::Wrapper)
3279 return false;
3280
3281 // We can only use non-GlobalValues as immediates if they were not truncated,
3282 // as we do not have any range information. If we have a GlobalValue and the
3283 // address was not truncated, we can select it as an operand directly.
3284 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3285 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3286 Op = N.getOperand(i: 0);
3287 // We can only select the operand directly if we didn't have to look past a
3288 // truncate.
3289 return !WasTruncated;
3290 }
3291
3292 // Check that the global's range fits into VT.
3293 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3294 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3295 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3296 return false;
3297
3298 // Okay, we can use a narrow reference.
3299 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3300 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3301 return true;
3302}
3303
3304bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3305 SDValue &Base, SDValue &Scale,
3306 SDValue &Index, SDValue &Disp,
3307 SDValue &Segment) {
3308 assert(Root && P && "Unknown root/parent nodes");
3309 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3310 !IsProfitableToFold(N, U: P, Root) ||
3311 !IsLegalToFold(N, U: P, Root, OptLevel))
3312 return false;
3313
3314 return selectAddr(Parent: N.getNode(),
3315 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3316}
3317
3318bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3319 SDValue &Base, SDValue &Scale,
3320 SDValue &Index, SDValue &Disp,
3321 SDValue &Segment) {
3322 assert(Root && P && "Unknown root/parent nodes");
3323 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3324 !IsProfitableToFold(N, U: P, Root) ||
3325 !IsLegalToFold(N, U: P, Root, OptLevel))
3326 return false;
3327
3328 return selectAddr(Parent: N.getNode(),
3329 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3330}
3331
3332/// Return an SDNode that returns the value of the global base register.
3333/// Output instructions required to initialize the global base register,
3334/// if necessary.
3335SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3336 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3337 auto &DL = MF->getDataLayout();
3338 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3339}
3340
// Return true if the (possibly truncated) wrapped global address is known to
// fit in a sign-extended `Width`-bit immediate.
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  // Look through a truncate of the wrapped address, if present.
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(Num: 0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    // Absolute symbol: check its declared range against the signed
    // Width-bit interval. Note (-1ull << Width) is the interval's lower
    // bound when reinterpreted as a signed value.
    return CR->getSignedMin().sge(RHS: -1ull << Width) &&
           CR->getSignedMax().slt(RHS: 1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
         !TM.isLargeGlobalValue(GV);
}
3363
3364X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3365 assert(N->isMachineOpcode() && "Unexpected node");
3366 unsigned Opc = N->getMachineOpcode();
3367 const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc);
3368 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3369 if (CondNo < 0)
3370 return X86::COND_INVALID;
3371
3372 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3373}
3374
3375/// Test whether the given X86ISD::CMP node has any users that use a flag
3376/// other than ZF.
3377bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3378 // Examine each user of the node.
3379 for (SDUse &Use : Flags->uses()) {
3380 // Only check things that use the flags.
3381 if (Use.getResNo() != Flags.getResNo())
3382 continue;
3383 SDNode *User = Use.getUser();
3384 // Only examine CopyToReg uses that copy to EFLAGS.
3385 if (User->getOpcode() != ISD::CopyToReg ||
3386 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3387 return false;
3388 // Examine each user of the CopyToReg use.
3389 for (SDUse &FlagUse : User->uses()) {
3390 // Only examine the Flag result.
3391 if (FlagUse.getResNo() != 1)
3392 continue;
3393 // Anything unusual: assume conservatively.
3394 if (!FlagUse.getUser()->isMachineOpcode())
3395 return false;
3396 // Examine the condition code of the user.
3397 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3398
3399 switch (CC) {
3400 // Comparisons which only use the zero flag.
3401 case X86::COND_E: case X86::COND_NE:
3402 continue;
3403 // Anything else: assume conservatively.
3404 default:
3405 return false;
3406 }
3407 }
3408 }
3409 return true;
3410}
3411
3412/// Test whether the given X86ISD::CMP node has any uses which require the SF
3413/// flag to be accurate.
3414bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3415 // Examine each user of the node.
3416 for (SDUse &Use : Flags->uses()) {
3417 // Only check things that use the flags.
3418 if (Use.getResNo() != Flags.getResNo())
3419 continue;
3420 SDNode *User = Use.getUser();
3421 // Only examine CopyToReg uses that copy to EFLAGS.
3422 if (User->getOpcode() != ISD::CopyToReg ||
3423 cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3424 return false;
3425 // Examine each user of the CopyToReg use.
3426 for (SDUse &FlagUse : User->uses()) {
3427 // Only examine the Flag result.
3428 if (FlagUse.getResNo() != 1)
3429 continue;
3430 // Anything unusual: assume conservatively.
3431 if (!FlagUse.getUser()->isMachineOpcode())
3432 return false;
3433 // Examine the condition code of the user.
3434 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3435
3436 switch (CC) {
3437 // Comparisons which don't examine the SF flag.
3438 case X86::COND_A: case X86::COND_AE:
3439 case X86::COND_B: case X86::COND_BE:
3440 case X86::COND_E: case X86::COND_NE:
3441 case X86::COND_O: case X86::COND_NO:
3442 case X86::COND_P: case X86::COND_NP:
3443 continue;
3444 // Anything else: assume conservatively.
3445 default:
3446 return false;
3447 }
3448 }
3449 }
3450 return true;
3451}
3452
3453static bool mayUseCarryFlag(X86::CondCode CC) {
3454 switch (CC) {
3455 // Comparisons which don't examine the CF flag.
3456 case X86::COND_O: case X86::COND_NO:
3457 case X86::COND_E: case X86::COND_NE:
3458 case X86::COND_S: case X86::COND_NS:
3459 case X86::COND_P: case X86::COND_NP:
3460 case X86::COND_L: case X86::COND_GE:
3461 case X86::COND_G: case X86::COND_LE:
3462 return false;
3463 // Anything else: assume conservatively.
3464 default:
3465 return true;
3466 }
3467}
3468
3469/// Test whether the given node which sets flags has any uses which require the
3470/// CF flag to be accurate.
3471 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3472 // Examine each user of the node.
3473 for (SDUse &Use : Flags->uses()) {
3474 // Only check things that use the flags.
3475 if (Use.getResNo() != Flags.getResNo())
3476 continue;
3477
3478 SDNode *User = Use.getUser();
3479 unsigned UserOpc = User->getOpcode();
3480
3481 if (UserOpc == ISD::CopyToReg) {
3482 // Only examine CopyToReg uses that copy to EFLAGS.
3483 if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS)
3484 return false;
3485 // Examine each user of the CopyToReg use.
3486 for (SDUse &FlagUse : User->uses()) {
3487 // Only examine the Flag result.
3488 if (FlagUse.getResNo() != 1)
3489 continue;
3490 // Anything unusual: assume conservatively.
3491 if (!FlagUse.getUser()->isMachineOpcode())
3492 return false;
3493 // Examine the condition code of the user.
3494 X86::CondCode CC = getCondFromNode(N: FlagUse.getUser());
3495
3496 if (mayUseCarryFlag(CC))
3497 return false;
3498 }
3499
3500 // This CopyToReg is ok. Move on to the next user.
3501 continue;
3502 }
3503
3504 // This might be an unselected node. So look for the pre-isel opcodes that
3505 // use flags.
3506 unsigned CCOpNo;
3507 switch (UserOpc) {
3508 default:
3509 // Something unusual. Be conservative.
3510 return false;
3511 case X86ISD::SETCC: CCOpNo = 0; break;
3512 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3513 case X86ISD::CMOV: CCOpNo = 2; break;
3514 case X86ISD::BRCOND: CCOpNo = 2; break;
3515 }
3516
3517 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo);
3518 if (mayUseCarryFlag(CC))
3519 return false;
3520 }
3521 return true;
3522}
3523
// Return true when the tail-call return node N leaves enough volatile GPRs
// free to materialize the callee address (load base + index) alongside the
// argument registers it already carries.
bool X86DAGToDAGISel::checkTCRetEnoughRegs(SDNode *N) const {
  // Check that there is enough volatile registers to load the callee address.

  const X86RegisterInfo *RI = Subtarget->getRegisterInfo();
  unsigned AvailGPRs;
  // The register classes below must stay in sync with what's used for
  // TCRETURNri, TCRETURN_HIPE32ri, TCRETURN_WIN64ri, etc).
  if (Subtarget->is64Bit()) {
    const TargetRegisterClass *TCGPRs =
        Subtarget->isCallingConvWin64(CC: MF->getFunction().getCallingConv())
            ? &X86::GR64_TCW64RegClass
            : &X86::GR64_TCRegClass;
    // Can't use RSP or RIP for the load in general.
    assert(TCGPRs->contains(X86::RSP));
    assert(TCGPRs->contains(X86::RIP));
    AvailGPRs = TCGPRs->getNumRegs() - 2;
  } else {
    const TargetRegisterClass *TCGPRs =
        MF->getFunction().getCallingConv() == CallingConv::HiPE
            ? &X86::GR32RegClass
            : &X86::GR32_TCRegClass;
    // Can't use ESP for the address in general.
    assert(TCGPRs->contains(X86::ESP));
    AvailGPRs = TCGPRs->getNumRegs() - 1;
  }

  // The load's base and index need up to two registers.
  unsigned LoadGPRs = 2;

  assert(N->getOpcode() == X86ISD::TC_RETURN);
  // X86tcret args: (*chain, ptr, imm, regs..., glue)

  if (Subtarget->is32Bit()) {
    // FIXME: This was carried from X86tcret_1reg which was used for 32-bit,
    // but it could apply to 64-bit too.
    const SDValue &BasePtr = cast<LoadSDNode>(Val: N->getOperand(Num: 1))->getBasePtr();
    if (isa<FrameIndexSDNode>(Val: BasePtr)) {
      LoadGPRs -= 2; // Base is fixed index off ESP; no regs needed.
    } else if (BasePtr.getOpcode() == X86ISD::Wrapper &&
               isa<GlobalAddressSDNode>(Val: BasePtr->getOperand(Num: 0))) {
      assert(!getTargetMachine().isPositionIndependent());
      LoadGPRs -= 1; // Base is a global (immediate since this is non-PIC), no
                     // reg needed.
    }
  }

  // Count the GPR argument operands; bail out as soon as the total demand
  // (argument regs plus load regs) exceeds what is available.
  unsigned ArgGPRs = 0;
  for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
    if (const auto *RN = dyn_cast<RegisterSDNode>(Val: N->getOperand(Num: I))) {
      if (!RI->isGeneralPurposeRegister(*MF, RN->getReg()))
        continue;
      if (++ArgGPRs + LoadGPRs > AvailGPRs)
        return false;
    }
  }

  return true;
}
3582
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
///
/// On success, \p LoadNode receives the matched load and \p InputChain the
/// merged TokenFactor chain that the fused node should use. \p LoadOpNo says
/// which operand of \p StoredVal is expected to be the load.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(N: Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Val&: Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  // Cap on predecessor-search steps to bound compile time.
  const unsigned int Max = 1024;

  // Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //    *-lines = Chain operand dependencies.
  //    |-lines = Normal operand dependencies.
  //    Dependencies flow down and right. n-suffix references multiple nodes.
  //
  //        C                        Xn  C
  //        *                         *  *
  //        *                          * *
  //  Xn  A-LD    Yn                    TF         Yn
  //   *    * \      |                   *         |
  //    *   *  \     |                   *         |
  //     *  *   \    |     =>       A--LD_OP_ST
  //      * *    \   |                      \
  //       TF    OP                          \
  //        *   | \                           Zn
  //         *  |  \
  //        A-ST    Zn
  //

  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //                                      #2: Yn -> LD
  //                                      #3: ST -> Zn

  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.

  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.

  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).

  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(R: 1)) {
    FoundLoad = true;
    ChainOps.push_back(Elt: Load.getOperand(i: 0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(R: 1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Elt: Load.getOperand(i: 0));
        continue;
      }
      LoopWorklist.push_back(Elt: Op.getNode());
      ChainOps.push_back(Elt: Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Elt: Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
                                   TopologicalPrune: true))
    return false;

  // Safe: merge the gathered chains into the single input chain for the
  // fused node.
  InputChain =
      CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps);
  return true;
}
3696
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen pattern memory operand pattern is currently not able to match
// the case where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//  [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
// but maybe need something like this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//  [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
//   (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// here.
// Returns true if Node (a store) was replaced by a fused memory operation.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  auto *StoreNode = cast<StoreSDNode>(Val: Node);
  SDValue StoredVal = StoreNode->getOperand(Num: 1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    // (sub 0, x) is a negate; the load is then operand 1, not 0.
    IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  // Match the common load/store address as the five x86 addressing
  // components used by the fused instruction.
  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  // Pick the opcode variant matching the memory operand width.
  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!");
    }
  };

  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
                                      VT2: MVT::Other, Ops);
      break;
    }
   [[fallthrough]];
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
      bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
      bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        unsigned NewOpc =
          ((Opc == X86ISD::ADD) == IsOne)
              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32,
                                        VT2: MVT::Other, Ops);
        break;
      }
    }
    [[fallthrough]];
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    // Map the DAG opcode to the reg-to-mem machine opcode family.
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    // Map the DAG opcode to the imm-to-mem machine opcode family.
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };

    unsigned NewOpc = SelectRegOpcode(Opc);
    // The non-load operand of the binary op.
    SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(x: OperandV) &&
            isInt<32>(x: -OperandV))) &&
          hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) {
        Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      // ADC/SBB also consume EFLAGS: thread the incoming carry through a
      // CopyToReg glued to the fused node.
      SDValue CopyTo =
          CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS,
                               N: StoredVal.getOperand(i: 2), Glue: SDValue());

      const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base,    Scale,   Index,      Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }

  // Transfer memory operands so alias analysis still sees both accesses.
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
  CurDAG->RemoveDeadNode(N: Node);
  return true;
}
3932
3933// See if this is an X & Mask that we can match to BEXTR/BZHI.
3934// Where Mask is one of the following patterns:
3935// a) x & (1 << nbits) - 1
3936// b) x & ~(-1 << nbits)
3937// c) x & (-1 >> (32 - y))
3938// d) x << (32 - y) >> (32 - y)
3939// e) (1 << nbits) - 1
3940bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3941 assert(
3942 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3943 Node->getOpcode() == ISD::SRL) &&
3944 "Should be either an and-mask, or right-shift after clearing high bits.");
3945
3946 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3947 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3948 return false;
3949
3950 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3951
3952 // Only supported for 32 and 64 bits.
3953 if (NVT != MVT::i32 && NVT != MVT::i64)
3954 return false;
3955
3956 SDValue NBits;
3957 bool NegateNBits;
3958
3959 // If we have BMI2's BZHI, we are ok with muti-use patterns.
3960 // Else, if we only have BMI1's BEXTR, we require one-use.
3961 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3962 auto checkUses = [AllowExtraUsesByDefault](
3963 SDValue Op, unsigned NUses,
3964 std::optional<bool> AllowExtraUses) {
3965 return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) ||
3966 Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo());
3967 };
3968 auto checkOneUse = [checkUses](SDValue Op,
3969 std::optional<bool> AllowExtraUses =
3970 std::nullopt) {
3971 return checkUses(Op, 1, AllowExtraUses);
3972 };
3973 auto checkTwoUse = [checkUses](SDValue Op,
3974 std::optional<bool> AllowExtraUses =
3975 std::nullopt) {
3976 return checkUses(Op, 2, AllowExtraUses);
3977 };
3978
3979 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3980 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3981 assert(V.getSimpleValueType() == MVT::i32 &&
3982 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3983 "Expected i64 -> i32 truncation");
3984 V = V.getOperand(i: 0);
3985 }
3986 return V;
3987 };
3988
3989 // a) x & ((1 << nbits) + (-1))
3990 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3991 &NegateNBits](SDValue Mask) -> bool {
3992 // Match `add`. Must only have one use!
3993 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3994 return false;
3995 // We should be adding all-ones constant (i.e. subtracting one.)
3996 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3997 return false;
3998 // Match `1 << nbits`. Might be truncated. Must only have one use!
3999 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
4000 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4001 return false;
4002 if (!isOneConstant(V: M0->getOperand(Num: 0)))
4003 return false;
4004 NBits = M0->getOperand(Num: 1);
4005 NegateNBits = false;
4006 return true;
4007 };
4008
4009 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
4010 V = peekThroughOneUseTruncation(V);
4011 return CurDAG->MaskedValueIsAllOnes(
4012 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
4013 loBitsSet: NVT.getSizeInBits()));
4014 };
4015
4016 // b) x & ~(-1 << nbits)
4017 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
4018 &NBits, &NegateNBits](SDValue Mask) -> bool {
4019 // Match `~()`. Must only have one use!
4020 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
4021 return false;
4022 // The -1 only has to be all-ones for the final Node's NVT.
4023 if (!isAllOnes(Mask->getOperand(Num: 1)))
4024 return false;
4025 // Match `-1 << nbits`. Might be truncated. Must only have one use!
4026 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
4027 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4028 return false;
4029 // The -1 only has to be all-ones for the final Node's NVT.
4030 if (!isAllOnes(M0->getOperand(Num: 0)))
4031 return false;
4032 NBits = M0->getOperand(Num: 1);
4033 NegateNBits = false;
4034 return true;
4035 };
4036
4037 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
4038 // or leave the shift amount as-is, but then we'll have to negate it.
4039 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
4040 unsigned Bitwidth) {
4041 NBits = ShiftAmt;
4042 NegateNBits = true;
4043 // Skip over a truncate of the shift amount, if any.
4044 if (NBits.getOpcode() == ISD::TRUNCATE)
4045 NBits = NBits.getOperand(i: 0);
4046 // Try to match the shift amount as (bitwidth - y). It should go away, too.
4047 // If it doesn't match, that's fine, we'll just negate it ourselves.
4048 if (NBits.getOpcode() != ISD::SUB)
4049 return;
4050 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
4051 if (!V0 || V0->getZExtValue() != Bitwidth)
4052 return;
4053 NBits = NBits.getOperand(i: 1);
4054 NegateNBits = false;
4055 };
4056
4057 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
4058 // or
4059 // c) x & (-1 >> (32 - y))
4060 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
4061 canonicalizeShiftAmt](SDValue Mask) -> bool {
4062 // The mask itself may be truncated.
4063 Mask = peekThroughOneUseTruncation(Mask);
4064 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
4065 // Match `l>>`. Must only have one use!
4066 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
4067 return false;
4068 // We should be shifting truly all-ones constant.
4069 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
4070 return false;
4071 SDValue M1 = Mask.getOperand(i: 1);
4072 // The shift amount should not be used externally.
4073 if (!checkOneUse(M1))
4074 return false;
4075 canonicalizeShiftAmt(M1, Bitwidth);
4076 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
4077 // is no extra use of the mask. Clearly, there was one since we are here.
4078 // But at the same time, if we need to negate the shift amount,
4079 // then we don't want the mask to stick around, else it's unprofitable.
4080 return !NegateNBits;
4081 };
4082
4083 SDValue X;
4084
4085 // d) x << z >> z but then we'll have to subtract z from bitwidth
4086 // or
4087 // d) x << (32 - y) >> (32 - y)
4088 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
4089 AllowExtraUsesByDefault, &NegateNBits,
4090 &X](SDNode *Node) -> bool {
4091 if (Node->getOpcode() != ISD::SRL)
4092 return false;
4093 SDValue N0 = Node->getOperand(Num: 0);
4094 if (N0->getOpcode() != ISD::SHL)
4095 return false;
4096 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
4097 SDValue N1 = Node->getOperand(Num: 1);
4098 SDValue N01 = N0->getOperand(Num: 1);
4099 // Both of the shifts must be by the exact same value.
4100 if (N1 != N01)
4101 return false;
4102 canonicalizeShiftAmt(N1, Bitwidth);
4103 // There should not be any external uses of the inner shift / shift amount.
4104 // Note that while we are generally okay with external uses given BMI2,
4105 // iff we need to negate the shift amount, we are not okay with extra uses.
4106 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4107 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4108 return false;
4109 X = N0->getOperand(Num: 0);
4110 return true;
4111 };
4112
4113 auto matchLowBitMask = [matchPatternA, matchPatternB,
4114 matchPatternC](SDValue Mask) -> bool {
4115 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4116 };
4117
4118 if (Node->getOpcode() == ISD::AND) {
4119 X = Node->getOperand(Num: 0);
4120 SDValue Mask = Node->getOperand(Num: 1);
4121
4122 if (matchLowBitMask(Mask)) {
4123 // Great.
4124 } else {
4125 std::swap(a&: X, b&: Mask);
4126 if (!matchLowBitMask(Mask))
4127 return false;
4128 }
4129 } else if (matchLowBitMask(SDValue(Node, 0))) {
4130 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
4131 } else if (!matchPatternD(Node))
4132 return false;
4133
4134 // If we need to negate the shift amount, require BMI2 BZHI support.
4135 // It's just too unprofitable for BMI1 BEXTR.
4136 if (NegateNBits && !Subtarget->hasBMI2())
4137 return false;
4138
4139 SDLoc DL(Node);
4140
4141 if (NBits.getSimpleValueType() != MVT::i8) {
4142 // Truncate the shift amount.
4143 NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits);
4144 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4145 }
4146
4147 // Turn (i32)(x & imm8) into (i32)x & imm32.
4148 ConstantSDNode *Imm = nullptr;
4149 if (NBits->getOpcode() == ISD::AND)
4150 if ((Imm = dyn_cast<ConstantSDNode>(Val: NBits->getOperand(Num: 1))))
4151 NBits = NBits->getOperand(Num: 0);
4152
4153 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4154 // All the other bits are undefined, we do not care about them.
4155 SDValue ImplDef = SDValue(
4156 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0);
4157 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
4158
4159 SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32);
4160 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
4161 NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL,
4162 VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal),
4163 0);
4164 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4165
4166 if (Imm) {
4167 NBits =
4168 CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: NBits,
4169 N2: CurDAG->getConstant(Val: Imm->getZExtValue(), DL, VT: MVT::i32));
4170 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4171 }
4172
4173 // We might have matched the amount of high bits to be cleared,
4174 // but we want the amount of low bits to be kept, so negate it then.
4175 if (NegateNBits) {
4176 SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32);
4177 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
4178
4179 NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits);
4180 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4181 }
4182
4183 if (Subtarget->hasBMI2()) {
4184 // Great, just emit the BZHI..
4185 if (NVT != MVT::i32) {
4186 // But have to place the bit count into the wide-enough register first.
4187 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
4188 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
4189 }
4190
4191 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4192 ReplaceNode(F: Node, T: Extract.getNode());
4193 SelectCode(N: Extract.getNode());
4194 return true;
4195 }
4196
  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with one-use trunc in between),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
4201 {
4202 SDValue RealX = peekThroughOneUseTruncation(X);
4203 // FIXME: only if the shift is one-use?
4204 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4205 X = RealX;
4206 }
4207
4208 MVT XVT = X.getSimpleValueType();
4209
4210 // Else, emitting BEXTR requires one more step.
4211 // The 'control' of BEXTR has the pattern of:
4212 // [15...8 bit][ 7...0 bit] location
4213 // [ bit count][ shift] name
4214 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4215
4216 // Shift NBits left by 8 bits, thus producing 'control'.
4217 // This makes the low 8 bits to be zero.
4218 SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8);
4219 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
4220 SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8);
4221 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4222
4223 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4224 // FIXME: only if the shift is one-use?
4225 if (X.getOpcode() == ISD::SRL) {
4226 SDValue ShiftAmt = X.getOperand(i: 1);
4227 X = X.getOperand(i: 0);
4228
4229 assert(ShiftAmt.getValueType() == MVT::i8 &&
4230 "Expected shift amount to be i8");
4231
4232 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4233 // We could zext to i16 in some form, but we intentionally don't do that.
4234 SDValue OrigShiftAmt = ShiftAmt;
4235 ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt);
4236 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4237
4238 // And now 'or' these low 8 bits of shift amount into the 'control'.
4239 Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt);
4240 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4241 }
4242
4243 // But have to place the 'control' into the wide-enough register first.
4244 if (XVT != MVT::i32) {
4245 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4246 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4247 }
4248
4249 // And finally, form the BEXTR itself.
4250 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4251
4252 // The 'X' was originally truncated. Do that now.
4253 if (XVT != NVT) {
4254 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4255 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4256 }
4257
4258 ReplaceNode(F: Node, T: Extract.getNode());
4259 SelectCode(N: Extract.getNode());
4260
4261 return true;
4262}
4263
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
//
// On success, returns the new machine node implementing the bit extract (the
// caller is responsible for wiring up its results); returns nullptr if the
// pattern does not match or is judged unprofitable for this subtarget.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(ResNo: 0);
  SDLoc dl(Node);

  // Node is expected to be: and (srl/sra X, ShiftCst), MaskCst.
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask (a contiguous run of low set bits).
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Value: Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = llvm::popcount(Value: Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

// With APX extended GPRs available, use the EVEX-encoded variant of the opcode.
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    // BZHI takes its control in a register; materialize it with a MOV.
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
    if (Subtarget->hasTBM()) {
      // TBM's BEXTRI takes the control as an immediate directly.
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(Num: 0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  // If the shifted value is itself a foldable load, use the memory form.
  if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
    SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
    NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
  }

  return NewNode;
}
4389
// Emit a PCMPISTR(I/M) instruction.
//
// \p ROpc / \p MOpc are the register and memory forms of the opcode. If
// \p MayFoldLoad is set and operand 1 is a foldable load, the memory form is
// emitted and the load's chain and mem-refs are transferred to the new node.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);
  SDValue Imm = Node->getOperand(Num: 2);
  // Rematerialize the immediate as a target constant so it is encoded into the
  // instruction rather than selected separately.
  auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(i: 0) };
    // Result list: (VT, i32, chain).
    SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    return CNode;
  }

  // Register form: (VT, i32) results, no chain.
  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
  return CNode;
}
4419
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
//
// \p InGlue is consumed as the last operand and updated to this node's glue
// result so the caller can emit a follow-up glued instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InGlue) {
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N2 = Node->getOperand(Num: 2);
  SDValue Imm = Node->getOperand(Num: 4);
  // Rematerialize the immediate as a target constant so it is encoded into the
  // instruction rather than selected separately.
  auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(i: 0), InGlue };
    // Result list: (VT, i32, chain, glue).
    SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    InGlue = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
    return CNode;
  }

  // Register form result list: (VT, i32, glue).
  SDValue Ops[] = { N0, N2, Imm, InGlue };
  SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
  InGlue = SDValue(CNode, 2);
  return CNode;
}
4454
// Try to simplify the amount operand of a scalar shift. x86 shifts only use
// the low bits of the amount (an explicit AND with Size-1 is inserted below
// and later removed by isel patterns), so arithmetic on the amount that is a
// no-op modulo the shift size — adding/subtracting/xoring a multiple of Size,
// or (Size*N-1) patterns that reduce to a NOT/NEG — can be simplified before
// selection. Returns true if the node was replaced/re-selected.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(Num: 1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(Num: 0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
      ShiftAmt->getOpcode() == ISD::XOR) {
    SDValue Add0 = ShiftAmt->getOperand(Num: 0);
    SDValue Add1 = ShiftAmt->getOperand(Num: 1);
    auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
    auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
    // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB/XOR.
    if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
      NewShiftAmt = Add0;

    } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
               ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
                (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
      // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
      // we can replace it with a NOT. In the XOR case it may save some code
      // size, in the SUB case it also may save a move.
      assert(Add0C == nullptr || Add1C == nullptr);

      // We can only do N-X, not X-N
      if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
        return false;

      EVT OpVT = ShiftAmt.getValueType();

      // NOT is expressed as XOR with all-ones.
      SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
      NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
                                    N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
      // If we are shifting by N-X where N == 0 mod Size, then just shift by
      // -X to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
               Add0C->getZExtValue() != 0) {
      EVT SubVT = ShiftAmt.getValueType();
      SDValue X;
      if (Add0C->getZExtValue() % Size == 0)
        X = Add1;
      else if (ShiftAmt.hasOneUse() && Size == 64 &&
               Add0C->getZExtValue() % 32 == 0) {
        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
        // This is mainly beneficial if we already compute (x+n*32).
        if (Add1.getOpcode() == ISD::TRUNCATE) {
          Add1 = Add1.getOperand(i: 0);
          SubVT = Add1.getValueType();
        }
        if (Add0.getValueType() != SubVT) {
          Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
          insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
        }

        X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
        insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
      } else
        return false;
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
      SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt,
                                N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
                                                   Op2: NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(F: N, T: UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
4581
// For operations of the form (x << C1) op C2 (op = and/or/xor), check if we
// can use a smaller encoding for C2 by transforming the expression into
// (x op (C2 >> C1)) << C1. Returns true if the node was rewritten and
// re-selected.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(x: Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
        (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Doing this check late in order to delay the (potentially expensive)
  // MaskedValueIsZero call as long as possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
                                            loBitsSet: ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    // If the upper bits are already known zero, the AND is a MOVZX; keep it.
    if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(i: 0);
  if (FoundAnyExtend) {
    // Re-apply the any_extend we looked through above.
    SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
    insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
    X = NewX;
  }

  // Build (x op ShiftedVal) << ShAmt and re-select.
  SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
  SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
                                   N2: Shift.getOperand(i: 1));
  ReplaceNode(F: N, T: NewSHL.getNode());
  SelectCode(N: NewSHL.getNode());
  return true;
}
4694
// Emit a single VPTERNLOG for the logic network rooted at Root, computing the
// truth table Imm over operands A, B and C. ParentA/B/C are the nodes through
// which each operand is reached (needed as fold roots for load folding). One
// operand may be folded as a regular load or a 32/64-bit broadcast load; since
// only the last (C) operand position can take memory, if A or B is folded it
// is swapped into the C slot and the immediate's truth-table bits are permuted
// to match. Always returns true; Root is replaced by the new machine node.
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm) {
  assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
         C.isOperandOf(ParentC) && "Incorrect parent node");

  // Match either a plain foldable load or a VBROADCAST_LOAD (possibly behind
  // a one-use bitcast) of a 32/64-bit element.
  auto tryFoldLoadOrBCast =
      [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
             SDValue &Index, SDValue &Disp, SDValue &Segment) {
        if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
          return true;

        // Not a load, check for broadcast which may be behind a bitcast.
        if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
          P = L.getNode();
          L = L.getOperand(i: 0);
        }

        if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
          return false;

        // Only 32 and 64 bit broadcasts are supported.
        auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
        unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
        if (Size != 32 && Size != 64)
          return false;

        return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
      };

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    FoldedLoad = true;
  } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: A, b&: C);
    // The imm8 is indexed by (A<<2 | B<<1 | C); exchanging A and C swaps the
    // table entries where A and C differ.
    // Swap bits 1/4 and 3/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0xa5;
    if (OldImm & 0x02) Imm |= 0x10;
    if (OldImm & 0x10) Imm |= 0x02;
    if (OldImm & 0x08) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x08;
  } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: B, b&: C);
    // Likewise, exchanging B and C swaps the table entries where B and C
    // differ.
    // Swap bits 1/2 and 5/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0x99;
    if (OldImm & 0x02) Imm |= 0x04;
    if (OldImm & 0x04) Imm |= 0x02;
    if (OldImm & 0x20) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x20;
  }

  SDLoc DL(Root);

  SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8);

  MVT NVT = Root->getSimpleValueType(ResNo: 0);

  MachineSDNode *MNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);

    // Pick the opcode by memory-operand kind (broadcast vs. full load),
    // element size and vector width.
    unsigned Opc;
    if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
      unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
      assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");

      bool UseD = EltSize == 32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
      else
        llvm_unreachable("Unexpected vector size!");
    } else {
      bool UseD = NVT.getVectorElementType() == MVT::i32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
      else
        llvm_unreachable("Unexpected vector size!");
    }

    SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);

    // Update the chain.
    ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
  } else {
    // Register-only form.
    bool UseD = NVT.getVectorElementType() == MVT::i32;
    unsigned Opc;
    if (NVT.is128BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
    else if (NVT.is256BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
    else if (NVT.is512BitVector())
      Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
    else
      llvm_unreachable("Unexpected vector size!");

    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
4817
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle more complex patterns that use an operand more than once?
//
// Computes the ternlog truth-table immediate by "evaluating" the matched
// logic expression on magic constants whose bit patterns are the truth
// tables of the three inputs, then hands off to matchVPTERNLOG. Returns
// true if a VPTERNLOG was emitted.
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  // Returns Op if it is a one-use AND/OR/XOR/ANDNP (looking through a
  // one-use bitcast), otherwise a null SDValue.
  auto getFoldableLogicOp = [](SDValue Op) {
    // Peek through single use bitcast.
    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
      Op = Op.getOperand(i: 0);

    if (!Op.hasOneUse())
      return SDValue();

    unsigned Opc = Op.getOpcode();
    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
        Opc == X86ISD::ANDNP)
      return Op;

    return SDValue();
  };

  SDValue N0, N1, A, FoldableOp;

  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
        ISD::isBuildVectorAllOnes(N: Op->getOperand(Num: 1).getNode())) {
      SDValue InnerOp = getFoldableLogicOp(Op->getOperand(Num: 0));

      if (!InnerOp)
        return SDValue();

      // One operand of the inner op must itself be a foldable logic op so we
      // have two fused logic stages.
      N0 = InnerOp.getOperand(i: 0);
      N1 = InnerOp.getOperand(i: 1);
      if ((FoldableOp = getFoldableLogicOp(N1))) {
        A = N0;
        return InnerOp;
      }
      if ((FoldableOp = getFoldableLogicOp(N0))) {
        A = N1;
        return InnerOp;
      }
    }
    return SDValue();
  };

  bool PeeledOuterNot = false;
  SDNode *OriN = N;
  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
    PeeledOuterNot = true;
    N = InnerOp.getNode();
  } else {
    N0 = N->getOperand(Num: 0);
    N1 = N->getOperand(Num: 1);

    if ((FoldableOp = getFoldableLogicOp(N1)))
      A = N0;
    else if ((FoldableOp = getFoldableLogicOp(N0)))
      A = N1;
    else
      return false;
  }

  SDValue B = FoldableOp.getOperand(i: 0);
  SDValue C = FoldableOp.getOperand(i: 1);
  SDNode *ParentA = N;
  SDNode *ParentB = FoldableOp.getNode();
  SDNode *ParentC = FoldableOp.getNode();

  // We can build the appropriate control immediate by performing the logic
  // operation we're matching using these constants for A, B, and C.
  // 0xf0/0xcc/0xaa are the 8-entry truth tables of the inputs A, B and C
  // themselves, indexed by (A<<2 | B<<1 | C).
  uint8_t TernlogMagicA = 0xf0;
  uint8_t TernlogMagicB = 0xcc;
  uint8_t TernlogMagicC = 0xaa;

  // Some of the inputs may be inverted, peek through them and invert the
  // magic values accordingly.
  // TODO: There may be a bitcast before the xor that we should peek through.
  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
        ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
      Magic = ~Magic;
      Parent = Op.getNode();
      Op = Op.getOperand(i: 0);
    }
  };

  PeekThroughNot(A, ParentA, TernlogMagicA);
  PeekThroughNot(B, ParentB, TernlogMagicB);
  PeekThroughNot(C, ParentC, TernlogMagicC);

  // Apply the inner op to B and C's truth tables...
  uint8_t Imm;
  switch (FoldableOp.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
  case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
  case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  }

  // ...then combine with A's truth table via the outer op.
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::ANDNP:
    // ANDNP inverts its first operand; which side A is on matters.
    if (A == N0)
      Imm &= ~TernlogMagicA;
    else
      Imm = ~(Imm) & TernlogMagicA;
    break;
  case ISD::AND: Imm &= TernlogMagicA; break;
  case ISD::OR: Imm |= TernlogMagicA; break;
  case ISD::XOR: Imm ^= TernlogMagicA; break;
  }

  // Account for the outer NOT we peeled off, if any.
  if (PeeledOuterNot)
    Imm = ~Imm;

  return matchVPTERNLOG(Root: OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
}
4945
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(ResNo: 0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink any
  // further.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(width: 32);
  }

  SDValue And0 = And->getOperand(Num: 0);
  APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
  // Candidate mask: original mask with all its leading zeros set to one.
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(width: 64);
    HighZeros = HighZeros.zext(width: 64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  // TODO: Handle constant folding?
  KnownBits Known0 = CurDAG->computeKnownBits(Op: And0);
  if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(F: And, T: And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
  SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
  ReplaceNode(F: And, T: NewAnd.getNode());
  SelectCode(N: NewAnd.getNode());
  return true;
}
5018
// Return the VPTESTM (IsTestN == false) or VPTESTNM (IsTestN == true)
// machine opcode for the given vector type and operand form: plain register,
// folded full-width load ("rm"), or folded broadcast load ("rmb"), each with
// an optional mask-register variant ("k" suffix) selected by Masked.
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;


#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  // Broadcast forms only exist for 32/64-bit elements, hence the smaller
  // case table.
  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  // Full-width memory operand.
  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  // Register-register form.
  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}
5066
5067static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg,
5068 const MachineRegisterInfo &MRI) {
5069 auto GetPhysReg = [&](SDValue V) -> Register {
5070 if (V.getOpcode() != ISD::CopyFromReg)
5071 return Register();
5072 Register Reg = cast<RegisterSDNode>(Val: V.getOperand(i: 1))->getReg();
5073 if (Reg.isVirtual())
5074 return MRI.getLiveInPhysReg(VReg: Reg);
5075 return Reg;
5076 };
5077
5078 if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
5079 std::swap(a&: N0, b&: N1);
5080}
5081
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation. Matches a setcc of (optionally an AND of) a
// vector against all-zeros, folding a load/broadcast operand when possible
// and widening to 512 bits when VLX is unavailable.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!");

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(i: 0);
  SDValue SetccOp1 = Setcc.getOperand(i: 1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
    std::swap(a&: SetccOp0, b&: SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(i: 0);

    // Look for single use AND: (and x, y) == 0 maps directly onto the
    // two-source VPTESTM test.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(i: 0);
      Src1 = N0Temp.getOperand(i: 1);
    }
  }

  // Without VLX we need to widen the operation.
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
                                SDValue &Base, SDValue &Scale, SDValue &Index,
                                SDValue &Disp, SDValue &Segment) {
    // If we need to widen, we can't fold the load.
    if (!Widen)
      if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
        return true;

    // If we didn't fold a load, try to match broadcast. No widening limitation
    // for this. But only 32 and 64 bit types are supported.
    if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
      return false;

    // Look through single use bitcasts.
    if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
      P = L.getNode();
      L = L.getOperand(i: 0);
    }

    if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
      return false;

    // The broadcast memory width must match the compare element width.
    auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
    if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
      return false;

    return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
  };

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (CanFoldLoads) {
    FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
                                    Tmp3, Tmp4);
    if (!FoldedLoad) {
      // And is commutative.
      FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
                                      Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(a&: Src0, b&: Src1);
    }
  }

  bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
    MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts);
    // The extra elements are undef; only the low subreg holds real data.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl,
                                                     VT: CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);

    // A folded broadcast already supplies a full-width value, so only the
    // register operand needs widening.
    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
      SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
                                              dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
    }
  }

  // (x == 0) means "no bits set", which is the VPTESTNM polarity.
  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               Masked: IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(i: 0) };
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(i: 0) };
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
    else
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
    SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32);
    CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
                                   dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
5249
5250// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5251// into vpternlog.
5252bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5253 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5254
5255 MVT NVT = N->getSimpleValueType(ResNo: 0);
5256
5257 // Make sure we support VPTERNLOG.
5258 if (!NVT.isVector() || !Subtarget->hasAVX512())
5259 return false;
5260
5261 // We need VLX for 128/256-bit.
5262 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5263 return false;
5264
5265 SDValue N0 = N->getOperand(Num: 0);
5266 SDValue N1 = N->getOperand(Num: 1);
5267
5268 // Canonicalize AND to LHS.
5269 if (N1.getOpcode() == ISD::AND)
5270 std::swap(a&: N0, b&: N1);
5271
5272 if (N0.getOpcode() != ISD::AND ||
5273 N1.getOpcode() != X86ISD::ANDNP ||
5274 !N0.hasOneUse() || !N1.hasOneUse())
5275 return false;
5276
5277 // ANDN is not commutable, use it to pick down A and C.
5278 SDValue A = N1.getOperand(i: 0);
5279 SDValue C = N1.getOperand(i: 1);
5280
5281 // AND is commutable, if one operand matches A, the other operand is B.
5282 // Otherwise this isn't a match.
5283 SDValue B;
5284 if (N0.getOperand(i: 0) == A)
5285 B = N0.getOperand(i: 1);
5286 else if (N0.getOperand(i: 1) == A)
5287 B = N0.getOperand(i: 0);
5288 else
5289 return false;
5290
5291 SDLoc dl(N);
5292 SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8);
5293 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5294 ReplaceNode(F: N, T: Ternlog.getNode());
5295
5296 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5297 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5298}
5299
5300void X86DAGToDAGISel::Select(SDNode *Node) {
5301 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5302 unsigned Opcode = Node->getOpcode();
5303 SDLoc dl(Node);
5304
5305 if (Node->isMachineOpcode()) {
5306 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5307 Node->setNodeId(-1);
5308 return; // Already selected.
5309 }
5310
5311 switch (Opcode) {
5312 default: break;
5313 case ISD::INTRINSIC_W_CHAIN: {
5314 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5315 switch (IntNo) {
5316 default: break;
5317 case Intrinsic::x86_encodekey128:
5318 case Intrinsic::x86_encodekey256: {
5319 if (!Subtarget->hasKL())
5320 break;
5321
5322 unsigned Opcode;
5323 switch (IntNo) {
5324 default: llvm_unreachable("Impossible intrinsic");
5325 case Intrinsic::x86_encodekey128:
5326 Opcode = X86::ENCODEKEY128;
5327 break;
5328 case Intrinsic::x86_encodekey256:
5329 Opcode = X86::ENCODEKEY256;
5330 break;
5331 }
5332
5333 SDValue Chain = Node->getOperand(Num: 0);
5334 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3),
5335 Glue: SDValue());
5336 if (Opcode == X86::ENCODEKEY256)
5337 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4),
5338 Glue: Chain.getValue(R: 1));
5339
5340 MachineSDNode *Res = CurDAG->getMachineNode(
5341 Opcode, dl, VTs: Node->getVTList(),
5342 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5343 ReplaceNode(F: Node, T: Res);
5344 return;
5345 }
5346 case Intrinsic::x86_tileloaddrs64_internal:
5347 case Intrinsic::x86_tileloaddrst164_internal:
5348 if (!Subtarget->hasAMXMOVRS())
5349 break;
5350 [[fallthrough]];
5351 case Intrinsic::x86_tileloadd64_internal:
5352 case Intrinsic::x86_tileloaddt164_internal: {
5353 if (!Subtarget->hasAMXTILE())
5354 break;
5355 auto *MFI =
5356 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5357 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5358 unsigned Opc;
5359 switch (IntNo) {
5360 default:
5361 llvm_unreachable("Unexpected intrinsic!");
5362 case Intrinsic::x86_tileloaddrs64_internal:
5363 Opc = X86::PTILELOADDRSV;
5364 break;
5365 case Intrinsic::x86_tileloaddrst164_internal:
5366 Opc = X86::PTILELOADDRST1V;
5367 break;
5368 case Intrinsic::x86_tileloadd64_internal:
5369 Opc = X86::PTILELOADDV;
5370 break;
5371 case Intrinsic::x86_tileloaddt164_internal:
5372 Opc = X86::PTILELOADDT1V;
5373 break;
5374 }
5375 // _tile_loadd_internal(row, col, buf, STRIDE)
5376 SDValue Base = Node->getOperand(Num: 4);
5377 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5378 SDValue Index = Node->getOperand(Num: 5);
5379 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5380 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5381 SDValue Chain = Node->getOperand(Num: 0);
5382 MachineSDNode *CNode;
5383 SDValue Ops[] = {Node->getOperand(Num: 2),
5384 Node->getOperand(Num: 3),
5385 Base,
5386 Scale,
5387 Index,
5388 Disp,
5389 Segment,
5390 Chain};
5391 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops);
5392 ReplaceNode(F: Node, T: CNode);
5393 return;
5394 }
5395 }
5396 break;
5397 }
5398 case ISD::INTRINSIC_VOID: {
5399 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5400 switch (IntNo) {
5401 default: break;
5402 case Intrinsic::x86_sse3_monitor:
5403 case Intrinsic::x86_monitorx:
5404 case Intrinsic::x86_clzero: {
5405 bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64;
5406
5407 unsigned Opc = 0;
5408 switch (IntNo) {
5409 default: llvm_unreachable("Unexpected intrinsic!");
5410 case Intrinsic::x86_sse3_monitor:
5411 if (!Subtarget->hasSSE3())
5412 break;
5413 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5414 break;
5415 case Intrinsic::x86_monitorx:
5416 if (!Subtarget->hasMWAITX())
5417 break;
5418 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5419 break;
5420 case Intrinsic::x86_clzero:
5421 if (!Subtarget->hasCLZERO())
5422 break;
5423 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5424 break;
5425 }
5426
5427 if (Opc) {
5428 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5429 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5430 N: Node->getOperand(Num: 2), Glue: SDValue());
5431 SDValue InGlue = Chain.getValue(R: 1);
5432
5433 if (IntNo == Intrinsic::x86_sse3_monitor ||
5434 IntNo == Intrinsic::x86_monitorx) {
5435 // Copy the other two operands to ECX and EDX.
5436 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3),
5437 Glue: InGlue);
5438 InGlue = Chain.getValue(R: 1);
5439 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4),
5440 Glue: InGlue);
5441 InGlue = Chain.getValue(R: 1);
5442 }
5443
5444 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other,
5445 Ops: { Chain, InGlue});
5446 ReplaceNode(F: Node, T: CNode);
5447 return;
5448 }
5449
5450 break;
5451 }
5452 case Intrinsic::x86_tilestored64_internal: {
5453 auto *MFI =
5454 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5455 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5456 unsigned Opc = X86::PTILESTOREDV;
5457 // _tile_stored_internal(row, col, buf, STRIDE, c)
5458 SDValue Base = Node->getOperand(Num: 4);
5459 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5460 SDValue Index = Node->getOperand(Num: 5);
5461 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5462 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5463 SDValue Chain = Node->getOperand(Num: 0);
5464 MachineSDNode *CNode;
5465 SDValue Ops[] = {Node->getOperand(Num: 2),
5466 Node->getOperand(Num: 3),
5467 Base,
5468 Scale,
5469 Index,
5470 Disp,
5471 Segment,
5472 Node->getOperand(Num: 6),
5473 Chain};
5474 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5475 ReplaceNode(F: Node, T: CNode);
5476 return;
5477 }
5478 case Intrinsic::x86_tileloaddrs64:
5479 case Intrinsic::x86_tileloaddrst164:
5480 if (!Subtarget->hasAMXMOVRS())
5481 break;
5482 [[fallthrough]];
5483 case Intrinsic::x86_tileloadd64:
5484 case Intrinsic::x86_tileloaddt164:
5485 case Intrinsic::x86_tilestored64: {
5486 if (!Subtarget->hasAMXTILE())
5487 break;
5488 auto *MFI =
5489 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5490 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5491 unsigned Opc;
5492 switch (IntNo) {
5493 default: llvm_unreachable("Unexpected intrinsic!");
5494 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5495 case Intrinsic::x86_tileloaddrs64:
5496 Opc = X86::PTILELOADDRS;
5497 break;
5498 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5499 case Intrinsic::x86_tileloaddrst164:
5500 Opc = X86::PTILELOADDRST1;
5501 break;
5502 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5503 }
5504 // FIXME: Match displacement and scale.
5505 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5506 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5507 SDValue Base = Node->getOperand(Num: 3);
5508 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5509 SDValue Index = Node->getOperand(Num: 4);
5510 SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32);
5511 SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16);
5512 SDValue Chain = Node->getOperand(Num: 0);
5513 MachineSDNode *CNode;
5514 if (Opc == X86::PTILESTORED) {
5515 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5516 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5517 } else {
5518 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5519 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops);
5520 }
5521 ReplaceNode(F: Node, T: CNode);
5522 return;
5523 }
5524 }
5525 break;
5526 }
5527 case ISD::BRIND:
5528 case X86ISD::NT_BRIND: {
5529 if (Subtarget->isTarget64BitILP32()) {
5530 // Converts a 32-bit register to a 64-bit, zero-extended version of
5531 // it. This is needed because x86-64 can do many things, but jmp %r32
5532 // ain't one of them.
5533 SDValue Target = Node->getOperand(Num: 1);
5534 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5535 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64);
5536 SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other,
5537 N1: Node->getOperand(Num: 0), N2: ZextTarget);
5538 ReplaceNode(F: Node, T: Brind.getNode());
5539 SelectCode(N: ZextTarget.getNode());
5540 SelectCode(N: Brind.getNode());
5541 return;
5542 }
5543 break;
5544 }
5545 case X86ISD::GlobalBaseReg:
5546 ReplaceNode(F: Node, T: getGlobalBaseReg());
5547 return;
5548
5549 case ISD::BITCAST:
5550 // Just drop all 128/256/512-bit bitcasts.
5551 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5552 NVT == MVT::f128) {
5553 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5554 CurDAG->RemoveDeadNode(N: Node);
5555 return;
5556 }
5557 break;
5558
5559 case ISD::SRL:
5560 if (matchBitExtract(Node))
5561 return;
5562 [[fallthrough]];
5563 case ISD::SRA:
5564 case ISD::SHL:
5565 if (tryShiftAmountMod(N: Node))
5566 return;
5567 break;
5568
5569 case X86ISD::VPTERNLOG: {
5570 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5571 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5572 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5573 return;
5574 break;
5575 }
5576
5577 case X86ISD::ANDNP:
5578 if (tryVPTERNLOG(N: Node))
5579 return;
5580 break;
5581
5582 case ISD::AND:
5583 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5584 // Try to form a masked VPTESTM. Operands can be in either order.
5585 SDValue N0 = Node->getOperand(Num: 0);
5586 SDValue N1 = Node->getOperand(Num: 1);
5587 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5588 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5589 return;
5590 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5591 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5592 return;
5593 }
5594
5595 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5596 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5597 CurDAG->RemoveDeadNode(N: Node);
5598 return;
5599 }
5600 if (matchBitExtract(Node))
5601 return;
5602 if (AndImmShrink && shrinkAndImmediate(And: Node))
5603 return;
5604
5605 [[fallthrough]];
5606 case ISD::OR:
5607 case ISD::XOR:
5608 if (tryShrinkShlLogicImm(N: Node))
5609 return;
5610 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5611 return;
5612 if (tryVPTERNLOG(N: Node))
5613 return;
5614
5615 [[fallthrough]];
5616 case ISD::ADD:
5617 if (Opcode == ISD::ADD && matchBitExtract(Node))
5618 return;
5619 [[fallthrough]];
5620 case ISD::SUB: {
5621 // Try to avoid folding immediates with multiple uses for optsize.
5622 // This code tries to select to register form directly to avoid going
5623 // through the isel table which might fold the immediate. We can't change
5624 // the patterns on the add/sub/and/or/xor with immediate paterns in the
5625 // tablegen files to check immediate use count without making the patterns
5626 // unavailable to the fast-isel table.
5627 if (!CurDAG->shouldOptForSize())
5628 break;
5629
5630 // Only handle i8/i16/i32/i64.
5631 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5632 break;
5633
5634 SDValue N0 = Node->getOperand(Num: 0);
5635 SDValue N1 = Node->getOperand(Num: 1);
5636
5637 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5638 if (!Cst)
5639 break;
5640
5641 int64_t Val = Cst->getSExtValue();
5642
5643 // Make sure its an immediate that is considered foldable.
5644 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5645 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5646 break;
5647
5648 // If this can match to INC/DEC, let it go.
5649 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5650 break;
5651
5652 // Check if we should avoid folding this immediate.
5653 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5654 break;
5655
5656 // We should not fold the immediate. So we need a register form instead.
5657 unsigned ROpc, MOpc;
5658 switch (NVT.SimpleTy) {
5659 default: llvm_unreachable("Unexpected VT!");
5660 case MVT::i8:
5661 switch (Opcode) {
5662 default: llvm_unreachable("Unexpected opcode!");
5663 case ISD::ADD:
5664 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5665 MOpc = GET_NDM_IF_ENABLED(X86::ADD8rm);
5666 break;
5667 case ISD::SUB:
5668 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5669 MOpc = GET_NDM_IF_ENABLED(X86::SUB8rm);
5670 break;
5671 case ISD::AND:
5672 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5673 MOpc = GET_NDM_IF_ENABLED(X86::AND8rm);
5674 break;
5675 case ISD::OR:
5676 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5677 MOpc = GET_NDM_IF_ENABLED(X86::OR8rm);
5678 break;
5679 case ISD::XOR:
5680 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5681 MOpc = GET_NDM_IF_ENABLED(X86::XOR8rm);
5682 break;
5683 }
5684 break;
5685 case MVT::i16:
5686 switch (Opcode) {
5687 default: llvm_unreachable("Unexpected opcode!");
5688 case ISD::ADD:
5689 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5690 MOpc = GET_NDM_IF_ENABLED(X86::ADD16rm);
5691 break;
5692 case ISD::SUB:
5693 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5694 MOpc = GET_NDM_IF_ENABLED(X86::SUB16rm);
5695 break;
5696 case ISD::AND:
5697 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5698 MOpc = GET_NDM_IF_ENABLED(X86::AND16rm);
5699 break;
5700 case ISD::OR:
5701 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5702 MOpc = GET_NDM_IF_ENABLED(X86::OR16rm);
5703 break;
5704 case ISD::XOR:
5705 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5706 MOpc = GET_NDM_IF_ENABLED(X86::XOR16rm);
5707 break;
5708 }
5709 break;
5710 case MVT::i32:
5711 switch (Opcode) {
5712 default: llvm_unreachable("Unexpected opcode!");
5713 case ISD::ADD:
5714 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5715 MOpc = GET_NDM_IF_ENABLED(X86::ADD32rm);
5716 break;
5717 case ISD::SUB:
5718 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5719 MOpc = GET_NDM_IF_ENABLED(X86::SUB32rm);
5720 break;
5721 case ISD::AND:
5722 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5723 MOpc = GET_NDM_IF_ENABLED(X86::AND32rm);
5724 break;
5725 case ISD::OR:
5726 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5727 MOpc = GET_NDM_IF_ENABLED(X86::OR32rm);
5728 break;
5729 case ISD::XOR:
5730 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5731 MOpc = GET_NDM_IF_ENABLED(X86::XOR32rm);
5732 break;
5733 }
5734 break;
5735 case MVT::i64:
5736 switch (Opcode) {
5737 default: llvm_unreachable("Unexpected opcode!");
5738 case ISD::ADD:
5739 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5740 MOpc = GET_NDM_IF_ENABLED(X86::ADD64rm);
5741 break;
5742 case ISD::SUB:
5743 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5744 MOpc = GET_NDM_IF_ENABLED(X86::SUB64rm);
5745 break;
5746 case ISD::AND:
5747 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5748 MOpc = GET_NDM_IF_ENABLED(X86::AND64rm);
5749 break;
5750 case ISD::OR:
5751 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5752 MOpc = GET_NDM_IF_ENABLED(X86::OR64rm);
5753 break;
5754 case ISD::XOR:
5755 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5756 MOpc = GET_NDM_IF_ENABLED(X86::XOR64rm);
5757 break;
5758 }
5759 break;
5760 }
5761
5762 // Ok this is a AND/OR/XOR/ADD/SUB with constant.
5763
5764 // If this is a not a subtract, we can still try to fold a load.
5765 if (Opcode != ISD::SUB) {
5766 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5767 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5768 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5769 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5770 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5771 // Update the chain.
5772 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5773 // Record the mem-refs
5774 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5775 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5776 CurDAG->RemoveDeadNode(N: Node);
5777 return;
5778 }
5779 }
5780
5781 CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1);
5782 return;
5783 }
5784
5785 case X86ISD::SMUL:
5786 // i16/i32/i64 are handled with isel patterns.
5787 if (NVT != MVT::i8)
5788 break;
5789 [[fallthrough]];
5790 case X86ISD::UMUL: {
5791 SDValue N0 = Node->getOperand(Num: 0);
5792 SDValue N1 = Node->getOperand(Num: 1);
5793
5794 unsigned LoReg, ROpc, MOpc;
5795 switch (NVT.SimpleTy) {
5796 default: llvm_unreachable("Unsupported VT!");
5797 case MVT::i8:
5798 LoReg = X86::AL;
5799 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5800 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5801 break;
5802 case MVT::i16:
5803 LoReg = X86::AX;
5804 ROpc = X86::MUL16r;
5805 MOpc = X86::MUL16m;
5806 break;
5807 case MVT::i32:
5808 LoReg = X86::EAX;
5809 ROpc = X86::MUL32r;
5810 MOpc = X86::MUL32m;
5811 break;
5812 case MVT::i64:
5813 LoReg = X86::RAX;
5814 ROpc = X86::MUL64r;
5815 MOpc = X86::MUL64m;
5816 break;
5817 }
5818
5819 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5820 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5821 // Multiply is commutative.
5822 if (!FoldedLoad) {
5823 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5824 if (FoldedLoad)
5825 std::swap(a&: N0, b&: N1);
5826 }
5827
5828 // UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
5829 // operand that's already there to avoid an extra register-to-register move.
5830 if (!FoldedLoad)
5831 orderRegForMul(N0, N1, LoReg, MRI: CurDAG->getMachineFunction().getRegInfo());
5832
5833 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5834 N: N0, Glue: SDValue()).getValue(R: 1);
5835
5836 MachineSDNode *CNode;
5837 if (FoldedLoad) {
5838 // i16/i32/i64 use an instruction that produces a low and high result even
5839 // though only the low result is used.
5840 SDVTList VTs;
5841 if (NVT == MVT::i8)
5842 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other);
5843 else
5844 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other);
5845
5846 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5847 InGlue };
5848 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5849
5850 // Update the chain.
5851 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5852 // Record the mem-refs
5853 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5854 } else {
5855 // i16/i32/i64 use an instruction that produces a low and high result even
5856 // though only the low result is used.
5857 SDVTList VTs;
5858 if (NVT == MVT::i8)
5859 VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32);
5860 else
5861 VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32);
5862
5863 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5864 }
5865
5866 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5867 ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5868 CurDAG->RemoveDeadNode(N: Node);
5869 return;
5870 }
5871
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    // Widening multiply producing both halves of the double-width product.
    // Only i32/i64 reach here (see the NVT switch below). Unsigned multiplies
    // on BMI2 targets can use MULX, which takes its implicit source in
    // EDX/RDX, does not clobber EFLAGS, and has a "high-only" register form
    // usable when the low half is dead.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned Opc, MOpc;       // Register and memory-operand instruction forms.
    unsigned LoReg, HiReg;    // Implicit result registers for MUL/IMUL.
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    // If nothing uses the low half, select the high-half-only MULX variant.
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc = UseMULXHi ? X86::MULX32Hrr
            : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
            : IsSigned ? X86::IMUL32r
            : X86::MUL32r;
      MOpc = UseMULXHi ? X86::MULX32Hrm
             : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
             : IsSigned ? X86::IMUL32m
             : X86::MUL32m;
      // MULX reads its implicit source from EDX; MUL/IMUL read EAX.
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi ? X86::MULX64Hrr
            : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
            : IsSigned ? X86::IMUL64r
            : X86::MUL64r;
      MOpc = UseMULXHi ? X86::MULX64Hrm
             : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
             : IsSigned ? X86::IMUL64m
             : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    // Try to fold a load of either operand into the multiply's memory form.
    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
      if (foldedLoad)
        std::swap(a&: N0, b&: N1);
    }

    // UMUL/SMUL_LOHI has an implicit source in LoReg (RDX for MULX, RAX for
    // MUL/IMUL). Prefer the operand that's already there.
    if (!foldedLoad)
      orderRegForMul(N0, N1, LoReg, MRI: CurDAG->getMachineFunction().getRegInfo());

    // Move N0 into the implicit source register and glue it to the multiply.
    SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                          N: N0, Glue: SDValue()).getValue(R: 1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      if (UseMULXHi) {
        // High half + chain; low half is dead.
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        // MULX produces both halves as explicit results (hi first).
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        // MUL/IMUL results live in implicit registers; only chain + glue here.
        // The halves are read back below via CopyFromReg.
        SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InGlue = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(F: N1.getValue(R: 1), T: Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(VT: NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        // Glue-only node; results are read from LoReg/HiReg below.
        SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        InGlue = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                       VT: NVT, Glue: InGlue);
        InGlue = ResLo.getValue(R: 2);
      }
      ReplaceUses(F: SDValue(Node, 0), T: ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
                                       VT: NVT, Glue: InGlue);
        InGlue = ResHi.getValue(R: 2);
      }
      ReplaceUses(F: SDValue(Node, 1), T: ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6000
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    // Combined divide+remainder. x86 DIV/IDIV take the dividend implicitly in
    // AX / DX:AX / EDX:EAX / RDX:RAX and produce quotient in the low register
    // and remainder in the high register (AH for i8).
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned ROpc, MOpc;    // Register and memory-operand divide opcodes.
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    // LoReg: quotient result / low dividend input. HiReg: remainder result.
    // ClrReg: register to zero when zero-extending the dividend.
    // SExtOpcode: instruction that sign-extends Lo into Hi (CWD/CDQ/CQO).
    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // If the dividend's sign bit is known zero, a zero extension suffices
    // even for signed division.
    bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);

    SDValue InGlue;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        // Fold the dividend load directly into the extending move.
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(F: N0.getValue(R: 1), T: Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain  = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0),
                                    Glue: SDValue());
      InGlue = Chain.getValue(R: 1);
    } else {
      // Put the dividend's low half into LoReg, then either sign-extend it
      // into the high register (CWD/CDQ/CQO) or explicitly zero the high
      // register for the unsigned / known-nonnegative case.
      InGlue =
        CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
                             Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InGlue =
          SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32);
        SDValue ClrNode =
            SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0);
        // MOV32r0 always produces i32; narrow/widen the zero to match NVT.
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode,
                          Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl,
                                                    VT: MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode = SDValue(
              CurDAG->getMachineNode(
                  Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, Op1: ClrNode,
                  Op2: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)),
              0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
                                      N: ClrNode, Glue: InGlue).getValue(R: 1);
      }
    }

    if (foldedLoad) {
      // Divisor comes from memory; the divide carries the load's chain.
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      MachineSDNode *CNode =
        CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops);
      InGlue = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      InGlue =
          SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32,
                                             VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue);
      SDValue Result(RNode, 0);
      InGlue = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result);

      ReplaceUses(F: SDValue(Node, 1), T: Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
                                              Reg: LoReg, VT: NVT, Glue: InGlue);
      InGlue = Result.getValue(R: 2);
      ReplaceUses(F: SDValue(Node, 0), T: Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
                                              Reg: HiReg, VT: NVT, Glue: InGlue);
      InGlue = Result.getValue(R: 2);
      ReplaceUses(F: SDValue(Node, 1), T: Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6177
  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    // x87-only floating-point compare: emit (U)COM + FNSTSW + SAHF to get the
    // result into EFLAGS. Strict variants carry a chain as operand 0.
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;

    // STRICT_FCMPS is a signaling compare (raises on quiet NaNs): use COM,
    // otherwise the non-signaling UCOM.
    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    // Strict compares thread the incoming chain through the compare node.
    SDValue Chain =
        IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
      Glue = Chain.getValue(R: 1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
      SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
      CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0);

    // For strict compares, the chain result must be rewired as well.
    if (IsStrictCmp)
      ReplaceUses(F: SDValue(Node, 1), T: Chain);

    ReplaceUses(F: SDValue(Node, 0), T: SAHF);
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6244
  case X86ISD::CMP: {
    // Compare against zero: try to turn (cmp X, 0) into a cheaper TEST, a
    // BEXTR+TEST, or a shift that sets the needed flags directly.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    // Optimizations for TEST compares.
    if (!isNullConstant(V: N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR);
        ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(N: Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(i: 0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) &&
          onlyUsesZeroFlag(Flags: SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE; // Sentinel: no match yet.
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
        unsigned TrailingZeros = llvm::countr_zero(Val: Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          // Emit shift + (optionally subreg-narrowed) self-TEST.
          SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32,
                                     Op1: N0.getOperand(i: 0), Op2: ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift);
          ReplaceNode(F: Node, T: Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(x: Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(x: Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(Flags: SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwize, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
      SDValue Reg = N0.getOperand(i: 0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
          // Don't narrow a volatile/atomic load's access width: only fold if
          // the memory TEST reads exactly as many bits as the original load.
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(i: 0) };
        NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: NewNode,
                               NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);

        NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(F: Node, T: NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    // SSE4.2 implicit-length string compare. The node has three results:
    // index (0), mask (1), and flags (2). PCMPISTRI produces the index and
    // PCMPISTRM the mask, so we may need one or two instructions.
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node);
      ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
    }
    // Emit the index form when the index is used, or as the sole instruction
    // when neither index nor mask is needed (flags still are).
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node);
      ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    // SSE4.2 explicit-length string compare. Like PCMPISTR above, but the
    // string lengths are passed implicitly in EAX and EDX, so both must be
    // copied in and glued to the compare instruction(s).
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX,
                                          N: Node->getOperand(Num: 1),
                                          Glue: SDValue()).getValue(R: 1);
    InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX,
                                  N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue);
      ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0));
    }
    // Emit the index form when the index is used, or as the sole instruction
    // when neither index nor mask is needed (flags still are).
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue);
      ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6521
  case ISD::SETCC: {
    // Vector setcc may be selectable as a VPTESTM-style mask test (see
    // tryVPTESTM); otherwise fall through to table-generated selection.
    if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue()))
      return;

    break;
  }
6528
  case ISD::STORE:
    // Try to fold a matching load-op-store sequence into a single read-
    // modify-write memory instruction (see foldLoadStoreIntoMemOperand);
    // otherwise fall through to the default store selection.
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
6533
  case X86ISD::SETCC_CARRY: {
    // Materialize 0/-1 from the carry flag (classically "sbb reg, reg").
    MVT VT = Node->getSimpleValueType(ResNo: 0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS,
                               N: Node->getOperand(Num: 1), Glue: SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(N: Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
    }

    ReplaceUses(F: SDValue(Node, 0), T: Result);
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
  case X86ISD::SBB: {
    // Special-case (sbb 0, 0): both value operands are zero, so the result
    // depends only on the incoming carry; lower via getSBBZero.
    if (isNullConstant(V: Node->getOperand(Num: 0)) &&
        isNullConstant(V: Node->getOperand(Num: 1))) {
      SDValue Result = getSBBZero(N: Node);

      // Replace the flag use.
      ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(ResNo: 0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result);
        }
        ReplaceUses(F: SDValue(Node, 0), T: Result);
      }

      CurDAG->RemoveDeadNode(N: Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    // Masked gather: pick the concrete (V)GATHER opcode from the index VT,
    // element count, element size, and whether the mask is an AVX-512 k-mask
    // (vXi1) or an AVX2-style vector mask.
    auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(ResNo: 0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on type what
    // a type constraint would say just like table based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0; // 0 means no opcode matched; bail out below.
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    // Decompose the vector address into the standard 5-operand memory form.
    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other);

    // AVX-512 and AVX2 forms place the mask operand differently.
    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()});
    // Result 1 of the new node is the mask output, which the ISD node lacks;
    // the ISD chain (result 1) maps to the machine node's chain (result 2).
    ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
    ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
6689 case X86ISD::MSCATTER: {
6690 auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node);
6691 SDValue Value = Sc->getValue();
6692 SDValue IndexOp = Sc->getIndex();
6693 MVT IndexVT = IndexOp.getSimpleValueType();
6694 MVT ValueVT = Value.getSimpleValueType();
6695
    // This is just to prevent crashes if the nodes are malformed somehow.
    // Otherwise we only do loose type checking here, based on what a type
    // constraint would allow, just like table-based isel.
6699 if (!ValueVT.isVector())
6700 break;
6701
6702 unsigned NumElts = ValueVT.getVectorNumElements();
6703 MVT ValueSVT = ValueVT.getVectorElementType();
6704
6705 bool IsFP = ValueSVT.isFloatingPoint();
6706 unsigned EltSize = ValueSVT.getSizeInBits();
6707
6708 unsigned Opc;
6709 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6710 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6711 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6712 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6713 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6714 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6715 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6716 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6717 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6718 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6719 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6720 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6721 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6722 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6723 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6724 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6725 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6726 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6727 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6728 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6729 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6730 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6731 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6732 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6733 else
6734 break;
6735
6736 SDValue Base, Scale, Index, Disp, Segment;
6737 if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(),
6738 Base, Scale, Index, Disp, Segment))
6739 break;
6740
6741 SDValue Mask = Sc->getMask();
6742 SDValue Chain = Sc->getChain();
6743 // Scatter instructions have a mask output not in the ISD node.
6744 SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other);
6745 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6746
6747 MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops);
6748 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()});
6749 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1));
6750 CurDAG->RemoveDeadNode(N: Node);
6751 return;
6752 }
6753 case ISD::PREALLOCATED_SETUP: {
6754 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6755 auto CallId = MFI->getPreallocatedIdForCallSite(
6756 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6757 SDValue Chain = Node->getOperand(Num: 0);
6758 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6759 MachineSDNode *New = CurDAG->getMachineNode(
6760 Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain);
6761 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain
6762 CurDAG->RemoveDeadNode(N: Node);
6763 return;
6764 }
6765 case ISD::PREALLOCATED_ARG: {
6766 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6767 auto CallId = MFI->getPreallocatedIdForCallSite(
6768 CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue());
6769 SDValue Chain = Node->getOperand(Num: 0);
6770 SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32);
6771 SDValue ArgIndex = Node->getOperand(Num: 2);
6772 SDValue Ops[3];
6773 Ops[0] = CallIdValue;
6774 Ops[1] = ArgIndex;
6775 Ops[2] = Chain;
6776 MachineSDNode *New = CurDAG->getMachineNode(
6777 Opcode: TargetOpcode::PREALLOCATED_ARG, dl,
6778 VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()),
6779 VT2: MVT::Other),
6780 Ops);
6781 ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer
6782 ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain
6783 CurDAG->RemoveDeadNode(N: Node);
6784 return;
6785 }
6786 case X86ISD::AESENCWIDE128KL:
6787 case X86ISD::AESDECWIDE128KL:
6788 case X86ISD::AESENCWIDE256KL:
6789 case X86ISD::AESDECWIDE256KL: {
6790 if (!Subtarget->hasWIDEKL())
6791 break;
6792
6793 unsigned Opcode;
6794 switch (Node->getOpcode()) {
6795 default:
6796 llvm_unreachable("Unexpected opcode!");
6797 case X86ISD::AESENCWIDE128KL:
6798 Opcode = X86::AESENCWIDE128KL;
6799 break;
6800 case X86ISD::AESDECWIDE128KL:
6801 Opcode = X86::AESDECWIDE128KL;
6802 break;
6803 case X86ISD::AESENCWIDE256KL:
6804 Opcode = X86::AESENCWIDE256KL;
6805 break;
6806 case X86ISD::AESDECWIDE256KL:
6807 Opcode = X86::AESDECWIDE256KL;
6808 break;
6809 }
6810
6811 SDValue Chain = Node->getOperand(Num: 0);
6812 SDValue Addr = Node->getOperand(Num: 1);
6813
6814 SDValue Base, Scale, Index, Disp, Segment;
6815 if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment))
6816 break;
6817
6818 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2),
6819 Glue: SDValue());
6820 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3),
6821 Glue: Chain.getValue(R: 1));
6822 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4),
6823 Glue: Chain.getValue(R: 1));
6824 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5),
6825 Glue: Chain.getValue(R: 1));
6826 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6),
6827 Glue: Chain.getValue(R: 1));
6828 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7),
6829 Glue: Chain.getValue(R: 1));
6830 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8),
6831 Glue: Chain.getValue(R: 1));
6832 Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9),
6833 Glue: Chain.getValue(R: 1));
6834
6835 MachineSDNode *Res = CurDAG->getMachineNode(
6836 Opcode, dl, VTs: Node->getVTList(),
6837 Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)});
6838 CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand());
6839 ReplaceNode(F: Node, T: Res);
6840 return;
6841 }
6842 case X86ISD::POP_FROM_X87_REG: {
6843 SDValue Chain = Node->getOperand(Num: 0);
6844 Register Reg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1))->getReg();
6845 SDValue Glue;
6846 if (Node->getNumValues() == 3)
6847 Glue = Node->getOperand(Num: 2);
6848 SDValue Copy =
6849 CurDAG->getCopyFromReg(Chain, dl, Reg, VT: Node->getValueType(ResNo: 0), Glue);
6850 ReplaceNode(F: Node, T: Copy.getNode());
6851 return;
6852 }
6853 }
6854
6855 SelectCode(N: Node);
6856}
6857
6858bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6859 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6860 std::vector<SDValue> &OutOps) {
6861 SDValue Op0, Op1, Op2, Op3, Op4;
6862 switch (ConstraintID) {
6863 default:
6864 llvm_unreachable("Unexpected asm memory constraint");
6865 case InlineAsm::ConstraintCode::o: // offsetable ??
6866 case InlineAsm::ConstraintCode::v: // not offsetable ??
6867 case InlineAsm::ConstraintCode::m: // memory
6868 case InlineAsm::ConstraintCode::X:
6869 case InlineAsm::ConstraintCode::p: // address
6870 if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4))
6871 return true;
6872 break;
6873 }
6874
6875 OutOps.push_back(x: Op0);
6876 OutOps.push_back(x: Op1);
6877 OutOps.push_back(x: Op2);
6878 OutOps.push_back(x: Op3);
6879 OutOps.push_back(x: Op4);
6880 return false;
6881}
6882
// New-pass-manager wrapper: forwards to the shared X86DAGToDAGISel
// implementation, selecting at the target machine's configured opt level.
X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
6886
6887/// This pass converts a legalized DAG into a X86-specific DAG,
6888/// ready for instruction scheduling.
6889FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6890 CodeGenOptLevel OptLevel) {
6891 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6892}
6893