1 | //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines a DAG pattern matching instruction selector for X86, |
10 | // converting from a legalized dag to a X86 dag. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "X86ISelDAGToDAG.h" |
15 | #include "X86.h" |
16 | #include "X86MachineFunctionInfo.h" |
17 | #include "X86RegisterInfo.h" |
18 | #include "X86Subtarget.h" |
19 | #include "X86TargetMachine.h" |
20 | #include "llvm/ADT/Statistic.h" |
21 | #include "llvm/CodeGen/MachineModuleInfo.h" |
22 | #include "llvm/CodeGen/SelectionDAGISel.h" |
23 | #include "llvm/Config/llvm-config.h" |
24 | #include "llvm/IR/ConstantRange.h" |
25 | #include "llvm/IR/Function.h" |
26 | #include "llvm/IR/Instructions.h" |
27 | #include "llvm/IR/Intrinsics.h" |
28 | #include "llvm/IR/IntrinsicsX86.h" |
29 | #include "llvm/IR/Module.h" |
30 | #include "llvm/IR/Type.h" |
31 | #include "llvm/Support/Debug.h" |
32 | #include "llvm/Support/ErrorHandling.h" |
33 | #include "llvm/Support/KnownBits.h" |
34 | #include "llvm/Support/MathExtras.h" |
35 | #include <cstdint> |
36 | |
37 | using namespace llvm; |
38 | |
39 | #define DEBUG_TYPE "x86-isel" |
40 | #define PASS_NAME "X86 DAG->DAG Instruction Selection" |
41 | |
42 | STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor" ); |
43 | |
44 | static cl::opt<bool> AndImmShrink("x86-and-imm-shrink" , cl::init(Val: true), |
45 | cl::desc("Enable setting constant bits to reduce size of mask immediates" ), |
46 | cl::Hidden); |
47 | |
48 | static cl::opt<bool> EnablePromoteAnyextLoad( |
49 | "x86-promote-anyext-load" , cl::init(Val: true), |
50 | cl::desc("Enable promoting aligned anyext load to wider load" ), cl::Hidden); |
51 | |
52 | extern cl::opt<bool> IndirectBranchTracking; |
53 | |
54 | //===----------------------------------------------------------------------===// |
55 | // Pattern Matcher Implementation |
56 | //===----------------------------------------------------------------------===// |
57 | |
58 | namespace { |
59 | /// This corresponds to X86AddressMode, but uses SDValue's instead of register |
60 | /// numbers for the leaves of the matched tree. |
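/// The address being matched has the form
///   Segment : [Base_Reg|Base_FrameIndex + Scale * IndexReg + Disp],
/// where Disp may also carry a symbolic piece (GV, CP, ES, MCSym, JT or
/// BlockAddr) qualified by SymbolFlags.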
61 | struct X86ISelAddressMode { |
62 | enum { |
63 | RegBase, |
64 | FrameIndexBase |
65 | } BaseType = RegBase; |
66 | |
67 | // This is really a union, discriminated by BaseType! |
68 | SDValue Base_Reg; |
69 | int Base_FrameIndex = 0; |
70 | |
71 | unsigned Scale = 1; |
72 | SDValue IndexReg; |
73 | int32_t Disp = 0; |
74 | SDValue Segment; |
75 | const GlobalValue *GV = nullptr; |
76 | const Constant *CP = nullptr; |
77 | const BlockAddress *BlockAddr = nullptr; |
78 | const char *ES = nullptr; |
79 | MCSymbol *MCSym = nullptr; |
80 | int JT = -1; |
81 | Align Alignment; // CP alignment. |
82 | unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* |
83 | bool NegateIndex = false; |
84 | |
85 | X86ISelAddressMode() = default; |
86 | |
87 | bool hasSymbolicDisplacement() const { |
88 | return GV != nullptr || CP != nullptr || ES != nullptr || |
89 | MCSym != nullptr || JT != -1 || BlockAddr != nullptr; |
90 | } |
91 | |
92 | bool hasBaseOrIndexReg() const { |
93 | return BaseType == FrameIndexBase || |
94 | IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; |
95 | } |
96 | |
97 | /// Return true if this addressing mode is already RIP-relative. |
98 | bool isRIPRelative() const { |
99 | if (BaseType != RegBase) return false; |
100 | if (RegisterSDNode *RegNode = |
101 | dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode())) |
102 | return RegNode->getReg() == X86::RIP; |
103 | return false; |
104 | } |
105 | |
106 | void setBaseReg(SDValue Reg) { |
107 | BaseType = RegBase; |
108 | Base_Reg = Reg; |
109 | } |
110 | |
111 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
112 | void dump(SelectionDAG *DAG = nullptr) { |
113 | dbgs() << "X86ISelAddressMode " << this << '\n'; |
114 | dbgs() << "Base_Reg " ; |
115 | if (Base_Reg.getNode()) |
116 | Base_Reg.getNode()->dump(DAG); |
117 | else |
118 | dbgs() << "nul\n" ; |
119 | if (BaseType == FrameIndexBase) |
120 | dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; |
121 | dbgs() << " Scale " << Scale << '\n' |
122 | << "IndexReg " ; |
123 | if (NegateIndex) |
124 | dbgs() << "negate " ; |
125 | if (IndexReg.getNode()) |
126 | IndexReg.getNode()->dump(DAG); |
127 | else |
128 | dbgs() << "nul\n" ; |
129 | dbgs() << " Disp " << Disp << '\n' |
130 | << "GV " ; |
131 | if (GV) |
132 | GV->dump(); |
133 | else |
134 | dbgs() << "nul" ; |
135 | dbgs() << " CP " ; |
136 | if (CP) |
137 | CP->dump(); |
138 | else |
139 | dbgs() << "nul" ; |
140 | dbgs() << '\n' |
141 | << "ES " ; |
142 | if (ES) |
143 | dbgs() << ES; |
144 | else |
145 | dbgs() << "nul" ; |
146 | dbgs() << " MCSym " ; |
147 | if (MCSym) |
148 | dbgs() << MCSym; |
149 | else |
150 | dbgs() << "nul" ; |
151 | dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; |
152 | } |
153 | #endif |
154 | }; |
155 | } |
156 | |
157 | namespace { |
158 | //===--------------------------------------------------------------------===// |
159 | /// ISel - X86-specific code to select X86 machine instructions for |
160 | /// SelectionDAG operations. |
161 | /// |
162 | class X86DAGToDAGISel final : public SelectionDAGISel { |
163 | /// Keep a pointer to the X86Subtarget around so that we can |
164 | /// make the right decision when generating code for different targets. |
165 | const X86Subtarget *Subtarget; |
166 | |
167 | /// If true, selector should try to optimize for minimum code size. |
168 | bool OptForMinSize; |
169 | |
170 | /// Disable direct TLS access through segment registers. |
171 | bool IndirectTlsSegRefs; |
172 | |
173 | public: |
174 | X86DAGToDAGISel() = delete; |
175 | |
176 | explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel) |
177 | : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), |
178 | OptForMinSize(false), IndirectTlsSegRefs(false) {} |
179 | |
180 | bool runOnMachineFunction(MachineFunction &MF) override { |
181 | // Reset the subtarget each time through. |
182 | Subtarget = &MF.getSubtarget<X86Subtarget>(); |
183 | IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( |
184 | Kind: "indirect-tls-seg-refs" ); |
185 | |
186 | // OptFor[Min]Size are used in pattern predicates that isel is matching. |
187 | OptForMinSize = MF.getFunction().hasMinSize(); |
188 | assert((!OptForMinSize || MF.getFunction().hasOptSize()) && |
189 | "OptForMinSize implies OptForSize" ); |
190 | return SelectionDAGISel::runOnMachineFunction(mf&: MF); |
191 | } |
192 | |
193 | void emitFunctionEntryCode() override; |
194 | |
195 | bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; |
196 | |
197 | void PreprocessISelDAG() override; |
198 | void PostprocessISelDAG() override; |
199 | |
200 | // Include the pieces autogenerated from the target description. |
201 | #include "X86GenDAGISel.inc" |
202 | |
203 | private: |
204 | void Select(SDNode *N) override; |
205 | |
206 | bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); |
207 | bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
208 | bool AllowSegmentRegForX32 = false); |
209 | bool matchWrapper(SDValue N, X86ISelAddressMode &AM); |
210 | bool matchAddress(SDValue N, X86ISelAddressMode &AM); |
211 | bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); |
212 | bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); |
213 | SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM, |
214 | unsigned Depth); |
215 | bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
216 | unsigned Depth); |
217 | bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
218 | unsigned Depth); |
219 | bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); |
220 | bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
221 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
222 | SDValue &Segment); |
223 | bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, |
224 | SDValue ScaleOp, SDValue &Base, SDValue &Scale, |
225 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
226 | bool selectMOV64Imm32(SDValue N, SDValue &Imm); |
227 | bool selectLEAAddr(SDValue N, SDValue &Base, |
228 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
229 | SDValue &Segment); |
230 | bool selectLEA64_32Addr(SDValue N, SDValue &Base, |
231 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
232 | SDValue &Segment); |
233 | bool selectTLSADDRAddr(SDValue N, SDValue &Base, |
234 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
235 | SDValue &Segment); |
236 | bool selectRelocImm(SDValue N, SDValue &Op); |
237 | |
238 | bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
239 | SDValue &Base, SDValue &Scale, |
240 | SDValue &Index, SDValue &Disp, |
241 | SDValue &Segment); |
242 | |
243 | // Convenience method where P is also root. |
244 | bool tryFoldLoad(SDNode *P, SDValue N, |
245 | SDValue &Base, SDValue &Scale, |
246 | SDValue &Index, SDValue &Disp, |
247 | SDValue &Segment) { |
248 | return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment); |
249 | } |
250 | |
251 | bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
252 | SDValue &Base, SDValue &Scale, |
253 | SDValue &Index, SDValue &Disp, |
254 | SDValue &Segment); |
255 | |
256 | bool isProfitableToFormMaskedOp(SDNode *N) const; |
257 | |
258 | /// Implement addressing mode selection for inline asm expressions. |
259 | bool SelectInlineAsmMemoryOperand(const SDValue &Op, |
260 | InlineAsm::ConstraintCode ConstraintID, |
261 | std::vector<SDValue> &OutOps) override; |
262 | |
263 | void emitSpecialCodeForMain(); |
264 | |
265 | inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, |
266 | MVT VT, SDValue &Base, SDValue &Scale, |
267 | SDValue &Index, SDValue &Disp, |
268 | SDValue &Segment) { |
269 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
270 | Base = CurDAG->getTargetFrameIndex( |
271 | FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout())); |
272 | else if (AM.Base_Reg.getNode()) |
273 | Base = AM.Base_Reg; |
274 | else |
275 | Base = CurDAG->getRegister(Reg: 0, VT); |
276 | |
277 | Scale = getI8Imm(Imm: AM.Scale, DL); |
278 | |
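// Pick the APX new-data-destination (NDD) form of an opcode when the
// subtarget supports it.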
279 | #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC) |
280 | // Negate the index if needed. |
281 | if (AM.NegateIndex) { |
282 | unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r) |
283 | : GET_ND_IF_ENABLED(X86::NEG32r); |
284 | SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32, |
285 | Ops: AM.IndexReg), 0); |
286 | AM.IndexReg = Neg; |
287 | } |
288 | |
289 | if (AM.IndexReg.getNode()) |
290 | Index = AM.IndexReg; |
291 | else |
292 | Index = CurDAG->getRegister(Reg: 0, VT); |
293 | |
294 | // These are 32-bit even in 64-bit mode since RIP-relative offset |
295 | // is 32-bit. |
296 | if (AM.GV) |
297 | Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(), |
298 | VT: MVT::i32, offset: AM.Disp, |
299 | TargetFlags: AM.SymbolFlags); |
300 | else if (AM.CP) |
301 | Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment, |
302 | Offset: AM.Disp, TargetFlags: AM.SymbolFlags); |
303 | else if (AM.ES) { |
304 | assert(!AM.Disp && "Non-zero displacement is ignored with ES." ); |
305 | Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
306 | } else if (AM.MCSym) { |
307 | assert(!AM.Disp && "Non-zero displacement is ignored with MCSym." ); |
308 | assert(AM.SymbolFlags == 0 && "oo" ); |
309 | Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32); |
310 | } else if (AM.JT != -1) { |
311 | assert(!AM.Disp && "Non-zero displacement is ignored with JT." ); |
312 | Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
313 | } else if (AM.BlockAddr) |
314 | Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp, |
315 | TargetFlags: AM.SymbolFlags); |
316 | else |
317 | Disp = CurDAG->getTargetConstant(Val: AM.Disp, DL, VT: MVT::i32); |
318 | |
319 | if (AM.Segment.getNode()) |
320 | Segment = AM.Segment; |
321 | else |
322 | Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
323 | } |
324 | |
325 | // Utility function to determine whether we should avoid selecting |
326 | // immediate forms of instructions for better code size or not. |
327 | // At a high level, we'd like to avoid such instructions when |
328 | // we have similar constants used within the same basic block |
329 | // that can be kept in a register. |
330 | // |
331 | bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { |
332 | uint32_t UseCount = 0; |
333 | |
334 | // Do not want to hoist if we're not optimizing for size. |
335 | // TODO: We'd like to remove this restriction. |
336 | // See the comment in X86InstrInfo.td for more info. |
337 | if (!CurDAG->shouldOptForSize()) |
338 | return false; |
339 | |
340 | // Walk all the users of the immediate. |
341 | for (const SDNode *User : N->uses()) { |
342 | if (UseCount >= 2) |
343 | break; |
344 | |
345 | // This user is already selected. Count it as a legitimate use and |
346 | // move on. |
347 | if (User->isMachineOpcode()) { |
348 | UseCount++; |
349 | continue; |
350 | } |
351 | |
352 | // We want to count stores of immediates as real uses. |
353 | if (User->getOpcode() == ISD::STORE && |
354 | User->getOperand(Num: 1).getNode() == N) { |
355 | UseCount++; |
356 | continue; |
357 | } |
358 | |
359 | // We don't currently match users that have > 2 operands (except |
360 | // for stores, which are handled above) |
// Those instructions won't match in ISel for now and would be counted
// incorrectly.
363 | // This may change in the future as we add additional instruction |
364 | // types. |
365 | if (User->getNumOperands() != 2) |
366 | continue; |
367 | |
368 | // If this is a sign-extended 8-bit integer immediate used in an ALU |
369 | // instruction, there is probably an opcode encoding to save space. |
370 | auto *C = dyn_cast<ConstantSDNode>(Val: N); |
371 | if (C && isInt<8>(x: C->getSExtValue())) |
372 | continue; |
373 | |
374 | // Immediates that are used for offsets as part of stack |
375 | // manipulation should be left alone. These are typically |
376 | // used to indicate SP offsets for argument passing and |
377 | // will get pulled into stores/pushes (implicitly). |
378 | if (User->getOpcode() == X86ISD::ADD || |
379 | User->getOpcode() == ISD::ADD || |
380 | User->getOpcode() == X86ISD::SUB || |
381 | User->getOpcode() == ISD::SUB) { |
382 | |
383 | // Find the other operand of the add/sub. |
384 | SDValue OtherOp = User->getOperand(Num: 0); |
385 | if (OtherOp.getNode() == N) |
386 | OtherOp = User->getOperand(Num: 1); |
387 | |
388 | // Don't count if the other operand is SP. |
389 | RegisterSDNode *RegNode; |
390 | if (OtherOp->getOpcode() == ISD::CopyFromReg && |
391 | (RegNode = dyn_cast_or_null<RegisterSDNode>( |
392 | Val: OtherOp->getOperand(Num: 1).getNode()))) |
393 | if ((RegNode->getReg() == X86::ESP) || |
394 | (RegNode->getReg() == X86::RSP)) |
395 | continue; |
396 | } |
397 | |
398 | // ... otherwise, count this and move on. |
399 | UseCount++; |
400 | } |
401 | |
402 | // If we have more than 1 use, then recommend for hoisting. |
403 | return (UseCount > 1); |
404 | } |
405 | |
406 | /// Return a target constant with the specified value of type i8. |
407 | inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { |
408 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
409 | } |
410 | |
411 | /// Return a target constant with the specified value, of type i32. |
412 | inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { |
413 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32); |
414 | } |
415 | |
416 | /// Return a target constant with the specified value, of type i64. |
417 | inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { |
418 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64); |
419 | } |
420 | |
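/// Convert the subvector start index of an EXTRACT_SUBVECTOR node into the
/// 128/256-bit lane immediate expected by the VEXTRACT* instructions.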
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                    const SDLoc &DL) {
423 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
424 | uint64_t Index = N->getConstantOperandVal(Num: 1); |
425 | MVT VecVT = N->getOperand(Num: 0).getSimpleValueType(); |
426 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
427 | } |
428 | |
429 | SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, |
430 | const SDLoc &DL) { |
431 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
432 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
433 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
434 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
435 | } |
436 | |
437 | SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth, |
438 | const SDLoc &DL) { |
439 | assert(VecWidth == 128 && "Unexpected vector width" ); |
440 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
441 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
442 | uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth; |
443 | assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index" ); |
444 | // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub) |
445 | // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub) |
446 | return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL); |
447 | } |
448 | |
449 | SDValue getSBBZero(SDNode *N) { |
450 | SDLoc dl(N); |
451 | MVT VT = N->getSimpleValueType(ResNo: 0); |
452 | |
453 | // Create zero. |
454 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
455 | SDValue Zero = SDValue( |
456 | CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: std::nullopt), 0); |
457 | if (VT == MVT::i64) { |
458 | Zero = SDValue( |
459 | CurDAG->getMachineNode( |
460 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
461 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero, |
462 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)), |
463 | 0); |
464 | } |
465 | |
466 | // Copy flags to the EFLAGS register and glue it to next node. |
467 | unsigned Opcode = N->getOpcode(); |
468 | assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && |
469 | "Unexpected opcode for SBB materialization" ); |
470 | unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; |
471 | SDValue EFLAGS = |
472 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
473 | N: N->getOperand(Num: FlagOpIndex), Glue: SDValue()); |
474 | |
475 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
476 | // 32-bit version. |
477 | unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; |
478 | MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
479 | VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32); |
480 | return SDValue( |
481 | CurDAG->getMachineNode(Opcode: Opc, dl, VTs, |
482 | Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}), |
483 | 0); |
484 | } |
485 | |
486 | // Helper to detect unneeded and instructions on shift amounts. Called |
487 | // from PatFrags in tablegen. |
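// For example, with a 32-bit shift Width is 5, so (and X, 31) is redundant:
// the mask keeps every bit the shift amount actually uses. Bits of X that
// are already known to be zero count as covered by the mask.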
488 | bool isUnneededShiftMask(SDNode *N, unsigned Width) const { |
489 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode" ); |
490 | const APInt &Val = N->getConstantOperandAPInt(Num: 1); |
491 | |
492 | if (Val.countr_one() >= Width) |
493 | return true; |
494 | |
495 | APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero; |
496 | return Mask.countr_one() >= Width; |
497 | } |
498 | |
499 | /// Return an SDNode that returns the value of the global base register. |
500 | /// Output instructions required to initialize the global base register, |
501 | /// if necessary. |
502 | SDNode *getGlobalBaseReg(); |
503 | |
504 | /// Return a reference to the TargetMachine, casted to the target-specific |
505 | /// type. |
506 | const X86TargetMachine &getTargetMachine() const { |
507 | return static_cast<const X86TargetMachine &>(TM); |
508 | } |
509 | |
510 | /// Return a reference to the TargetInstrInfo, casted to the target-specific |
511 | /// type. |
512 | const X86InstrInfo *getInstrInfo() const { |
513 | return Subtarget->getInstrInfo(); |
514 | } |
515 | |
516 | /// Return a condition code of the given SDNode |
517 | X86::CondCode getCondFromNode(SDNode *N) const; |
518 | |
519 | /// Address-mode matching performs shift-of-and to and-of-shift |
520 | /// reassociation in order to expose more scaled addressing |
521 | /// opportunities. |
522 | bool ComplexPatternFuncMutatesDAG() const override { |
523 | return true; |
524 | } |
525 | |
526 | bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; |
527 | |
528 | // Indicates we should prefer to use a non-temporal load for this load. |
529 | bool useNonTemporalLoad(LoadSDNode *N) const { |
530 | if (!N->isNonTemporal()) |
531 | return false; |
532 | |
533 | unsigned StoreSize = N->getMemoryVT().getStoreSize(); |
534 | |
535 | if (N->getAlign().value() < StoreSize) |
536 | return false; |
537 | |
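// Non-temporal loads only exist as (V)MOVNTDQA: 16 bytes needs SSE4.1,
// 32 bytes needs AVX2, 64 bytes needs AVX-512. There is no non-temporal
// load for scalar sizes.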
538 | switch (StoreSize) { |
539 | default: llvm_unreachable("Unsupported store size" ); |
540 | case 4: |
541 | case 8: |
542 | return false; |
543 | case 16: |
544 | return Subtarget->hasSSE41(); |
545 | case 32: |
546 | return Subtarget->hasAVX2(); |
547 | case 64: |
548 | return Subtarget->hasAVX512(); |
549 | } |
550 | } |
551 | |
552 | bool foldLoadStoreIntoMemOperand(SDNode *Node); |
553 | MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); |
554 | bool matchBitExtract(SDNode *Node); |
555 | bool shrinkAndImmediate(SDNode *N); |
556 | bool isMaskZeroExtended(SDNode *N) const; |
557 | bool tryShiftAmountMod(SDNode *N); |
558 | bool tryShrinkShlLogicImm(SDNode *N); |
559 | bool tryVPTERNLOG(SDNode *N); |
560 | bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB, |
561 | SDNode *ParentC, SDValue A, SDValue B, SDValue C, |
562 | uint8_t Imm); |
563 | bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); |
564 | bool tryMatchBitSelect(SDNode *N); |
565 | |
566 | MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
567 | const SDLoc &dl, MVT VT, SDNode *Node); |
568 | MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
569 | const SDLoc &dl, MVT VT, SDNode *Node, |
570 | SDValue &InGlue); |
571 | |
572 | bool tryOptimizeRem8Extend(SDNode *N); |
573 | |
574 | bool onlyUsesZeroFlag(SDValue Flags) const; |
575 | bool hasNoSignFlagUses(SDValue Flags) const; |
576 | bool hasNoCarryFlagUses(SDValue Flags) const; |
577 | }; |
578 | |
579 | class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy { |
580 | public: |
581 | static char ID; |
582 | explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm, |
583 | CodeGenOptLevel OptLevel) |
584 | : SelectionDAGISelLegacy( |
585 | ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {} |
586 | }; |
587 | } |
588 | |
589 | char X86DAGToDAGISelLegacy::ID = 0; |
590 | |
591 | INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) |
592 | |
593 | // Returns true if this masked compare can be implemented legally with this |
594 | // type. |
595 | static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { |
596 | unsigned Opcode = N->getOpcode(); |
597 | if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || |
598 | Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || |
599 | Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { |
600 | // We can get 256-bit 8 element types here without VLX being enabled. When |
601 | // this happens we will use 512-bit operations and the mask will not be |
602 | // zero extended. |
603 | EVT OpVT = N->getOperand(Num: 0).getValueType(); |
604 | // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the |
605 | // second operand. |
606 | if (Opcode == X86ISD::STRICT_CMPM) |
607 | OpVT = N->getOperand(Num: 1).getValueType(); |
608 | if (OpVT.is256BitVector() || OpVT.is128BitVector()) |
609 | return Subtarget->hasVLX(); |
610 | |
611 | return true; |
612 | } |
613 | // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. |
614 | if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || |
615 | Opcode == X86ISD::FSETCCM_SAE) |
616 | return true; |
617 | |
618 | return false; |
619 | } |
620 | |
621 | // Returns true if we can assume the writer of the mask has zero extended it |
622 | // for us. |
623 | bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { |
624 | // If this is an AND, check if we have a compare on either side. As long as |
625 | // one side guarantees the mask is zero extended, the AND will preserve those |
626 | // zeros. |
627 | if (N->getOpcode() == ISD::AND) |
628 | return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) || |
629 | isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget); |
630 | |
631 | return isLegalMaskCompare(N, Subtarget); |
632 | } |
633 | |
634 | bool |
635 | X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { |
636 | if (OptLevel == CodeGenOptLevel::None) |
637 | return false; |
638 | |
639 | if (!N.hasOneUse()) |
640 | return false; |
641 | |
642 | if (N.getOpcode() != ISD::LOAD) |
643 | return true; |
644 | |
645 | // Don't fold non-temporal loads if we have an instruction for them. |
646 | if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N))) |
647 | return false; |
648 | |
649 | // If N is a load, do additional profitability checks. |
650 | if (U == Root) { |
651 | switch (U->getOpcode()) { |
652 | default: break; |
653 | case X86ISD::ADD: |
654 | case X86ISD::ADC: |
655 | case X86ISD::SUB: |
656 | case X86ISD::SBB: |
657 | case X86ISD::AND: |
658 | case X86ISD::XOR: |
659 | case X86ISD::OR: |
660 | case ISD::ADD: |
661 | case ISD::UADDO_CARRY: |
662 | case ISD::AND: |
663 | case ISD::OR: |
664 | case ISD::XOR: { |
665 | SDValue Op1 = U->getOperand(Num: 1); |
666 | |
// If the other operand is an 8-bit immediate we should fold the immediate
668 | // instead. This reduces code size. |
669 | // e.g. |
670 | // movl 4(%esp), %eax |
671 | // addl $4, %eax |
672 | // vs. |
673 | // movl $4, %eax |
674 | // addl 4(%esp), %eax |
675 | // The former is 2 bytes shorter. In case where the increment is 1, then |
676 | // the saving can be 4 bytes (by using incl %eax). |
677 | if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) { |
678 | if (Imm->getAPIntValue().isSignedIntN(N: 8)) |
679 | return false; |
680 | |
681 | // If this is a 64-bit AND with an immediate that fits in 32-bits, |
682 | // prefer using the smaller and over folding the load. This is needed to |
683 | // make sure immediates created by shrinkAndImmediate are always folded. |
684 | // Ideally we would narrow the load during DAG combine and get the |
685 | // best of both worlds. |
686 | if (U->getOpcode() == ISD::AND && |
687 | Imm->getAPIntValue().getBitWidth() == 64 && |
688 | Imm->getAPIntValue().isIntN(N: 32)) |
689 | return false; |
690 | |
691 | // If this really a zext_inreg that can be represented with a movzx |
692 | // instruction, prefer that. |
693 | // TODO: We could shrink the load and fold if it is non-volatile. |
694 | if (U->getOpcode() == ISD::AND && |
695 | (Imm->getAPIntValue() == UINT8_MAX || |
696 | Imm->getAPIntValue() == UINT16_MAX || |
697 | Imm->getAPIntValue() == UINT32_MAX)) |
698 | return false; |
699 | |
// ADD/SUB can negate the immediate and use the opposite operation to fit
// 128 into a sign-extended 8-bit immediate.
702 | if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && |
703 | (-Imm->getAPIntValue()).isSignedIntN(N: 8)) |
704 | return false; |
705 | |
706 | if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && |
707 | (-Imm->getAPIntValue()).isSignedIntN(N: 8) && |
708 | hasNoCarryFlagUses(Flags: SDValue(U, 1))) |
709 | return false; |
710 | } |
711 | |
712 | // If the other operand is a TLS address, we should fold it instead. |
713 | // This produces |
714 | // movl %gs:0, %eax |
715 | // leal i@NTPOFF(%eax), %eax |
716 | // instead of |
717 | // movl $i@NTPOFF, %eax |
718 | // addl %gs:0, %eax |
719 | // if the block also has an access to a second TLS address this will save |
720 | // a load. |
721 | // FIXME: This is probably also true for non-TLS addresses. |
722 | if (Op1.getOpcode() == X86ISD::Wrapper) { |
723 | SDValue Val = Op1.getOperand(i: 0); |
724 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
725 | return false; |
726 | } |
727 | |
728 | // Don't fold load if this matches the BTS/BTR/BTC patterns. |
729 | // BTS: (or X, (shl 1, n)) |
730 | // BTR: (and X, (rotl -2, n)) |
731 | // BTC: (xor X, (shl 1, n)) |
732 | if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { |
733 | if (U->getOperand(Num: 0).getOpcode() == ISD::SHL && |
734 | isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0))) |
735 | return false; |
736 | |
737 | if (U->getOperand(Num: 1).getOpcode() == ISD::SHL && |
738 | isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0))) |
739 | return false; |
740 | } |
741 | if (U->getOpcode() == ISD::AND) { |
742 | SDValue U0 = U->getOperand(Num: 0); |
743 | SDValue U1 = U->getOperand(Num: 1); |
744 | if (U0.getOpcode() == ISD::ROTL) { |
745 | auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0)); |
746 | if (C && C->getSExtValue() == -2) |
747 | return false; |
748 | } |
749 | |
750 | if (U1.getOpcode() == ISD::ROTL) { |
751 | auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0)); |
752 | if (C && C->getSExtValue() == -2) |
753 | return false; |
754 | } |
755 | } |
756 | |
757 | break; |
758 | } |
759 | case ISD::SHL: |
760 | case ISD::SRA: |
761 | case ISD::SRL: |
762 | // Don't fold a load into a shift by immediate. The BMI2 instructions |
763 | // support folding a load, but not an immediate. The legacy instructions |
764 | // support folding an immediate, but can't fold a load. Folding an |
765 | // immediate is preferable to folding a load. |
766 | if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1))) |
767 | return false; |
768 | |
769 | break; |
770 | } |
771 | } |
772 | |
// Prevent folding a load if this can be implemented with an insert_subreg or
774 | // a move that implicitly zeroes. |
775 | if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && |
776 | isNullConstant(V: Root->getOperand(Num: 2)) && |
777 | (Root->getOperand(Num: 0).isUndef() || |
778 | ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode()))) |
779 | return false; |
780 | |
781 | return true; |
782 | } |
783 | |
784 | // Indicates it is profitable to form an AVX512 masked operation. Returning |
785 | // false will favor a masked register-register masked move or vblendm and the |
786 | // operation will be selected separately. |
787 | bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { |
788 | assert( |
789 | (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && |
790 | "Unexpected opcode!" ); |
791 | |
792 | // If the operation has additional users, the operation will be duplicated. |
793 | // Check the use count to prevent that. |
794 | // FIXME: Are there cheap opcodes we might want to duplicate? |
795 | return N->getOperand(Num: 1).hasOneUse(); |
796 | } |
797 | |
798 | /// Replace the original chain operand of the call with |
799 | /// load's chain operand and move load below the call's chain operand. |
800 | static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, |
801 | SDValue Call, SDValue OrigChain) { |
802 | SmallVector<SDValue, 8> Ops; |
803 | SDValue Chain = OrigChain.getOperand(i: 0); |
804 | if (Chain.getNode() == Load.getNode()) |
805 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
806 | else { |
807 | assert(Chain.getOpcode() == ISD::TokenFactor && |
808 | "Unexpected chain operand" ); |
809 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) |
810 | if (Chain.getOperand(i).getNode() == Load.getNode()) |
811 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
812 | else |
813 | Ops.push_back(Elt: Chain.getOperand(i)); |
814 | SDValue NewChain = |
815 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops); |
816 | Ops.clear(); |
817 | Ops.push_back(Elt: NewChain); |
818 | } |
819 | Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end()); |
820 | CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops); |
821 | CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0), |
822 | Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2)); |
823 | |
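// Finally, make the call's chain input the load's output chain so the load
// ends up immediately before the call.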
824 | Ops.clear(); |
825 | Ops.push_back(Elt: SDValue(Load.getNode(), 1)); |
826 | Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end()); |
827 | CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops); |
828 | } |
829 | |
830 | /// Return true if call address is a load and it can be |
831 | /// moved below CALLSEQ_START and the chains leading up to the call. |
832 | /// Return the CALLSEQ_START by reference as a second output. |
833 | /// In the case of a tail call, there isn't a callseq node between the call |
834 | /// chain and the load. |
835 | static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { |
836 | // The transformation is somewhat dangerous if the call's chain was glued to |
837 | // the call. After MoveBelowOrigChain the load is moved between the call and |
838 | // the chain, this can create a cycle if the load is not folded. So it is |
839 | // *really* important that we are sure the load will be folded. |
840 | if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) |
841 | return false; |
842 | auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode()); |
843 | if (!LD || |
844 | !LD->isSimple() || |
845 | LD->getAddressingMode() != ISD::UNINDEXED || |
846 | LD->getExtensionType() != ISD::NON_EXTLOAD) |
847 | return false; |
848 | |
849 | // Now let's find the callseq_start. |
850 | while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { |
851 | if (!Chain.hasOneUse()) |
852 | return false; |
853 | Chain = Chain.getOperand(i: 0); |
854 | } |
855 | |
856 | if (!Chain.getNumOperands()) |
857 | return false; |
858 | // Since we are not checking for AA here, conservatively abort if the chain |
859 | // writes to memory. It's not safe to move the callee (a load) across a store. |
860 | if (isa<MemSDNode>(Val: Chain.getNode()) && |
861 | cast<MemSDNode>(Val: Chain.getNode())->writeMem()) |
862 | return false; |
863 | if (Chain.getOperand(i: 0).getNode() == Callee.getNode()) |
864 | return true; |
865 | if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor && |
866 | Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) && |
867 | Callee.getValue(R: 1).hasOneUse()) |
868 | return true; |
869 | return false; |
870 | } |
871 | |
872 | static bool isEndbrImm64(uint64_t Imm) { |
873 | // There may be some other prefix bytes between 0xF3 and 0x0F1EFA. |
// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
875 | if ((Imm & 0x00FFFFFF) != 0x0F1EFA) |
876 | return false; |
877 | |
878 | uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, |
879 | 0x65, 0x66, 0x67, 0xf0, 0xf2}; |
int i = 24; // The low 24 bits (0x0F1EFA) have already been matched.
881 | while (i < 64) { |
882 | uint8_t Byte = (Imm >> i) & 0xFF; |
883 | if (Byte == 0xF3) |
884 | return true; |
885 | if (!llvm::is_contained(Range&: OptionalPrefixBytes, Element: Byte)) |
886 | return false; |
887 | i += 8; |
888 | } |
889 | |
890 | return false; |
891 | } |
892 | |
893 | static bool needBWI(MVT VT) { |
894 | return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8); |
895 | } |
896 | |
897 | void X86DAGToDAGISel::PreprocessISelDAG() { |
898 | bool MadeChange = false; |
899 | for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), |
900 | E = CurDAG->allnodes_end(); I != E; ) { |
901 | SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. |
902 | |
903 | // This is for CET enhancement. |
904 | // |
905 | // ENDBR32 and ENDBR64 have specific opcodes: |
906 | // ENDBR32: F3 0F 1E FB |
907 | // ENDBR64: F3 0F 1E FA |
// We do not want attackers to find unintended ENDBR32/64 opcode matches in
// the binary.
// Here's an example:
// If the compiler had to generate asm for the following code:
// a = 0xF30F1EFA
// it could, for example, generate:
// mov 0xF30F1EFA, dword ptr[a]
// In such a case, the binary would include a gadget that starts with a fake
// ENDBR64 opcode. Therefore, we split such generation into multiple
// operations so that the pattern does not show up in the binary.
918 | if (N->getOpcode() == ISD::Constant) { |
919 | MVT VT = N->getSimpleValueType(ResNo: 0); |
920 | int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue(); |
921 | int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; |
922 | if (Imm == EndbrImm || isEndbrImm64(Imm)) { |
923 | // Check that the cf-protection-branch is enabled. |
924 | Metadata *CFProtectionBranch = |
925 | MF->getFunction().getParent()->getModuleFlag( |
926 | Key: "cf-protection-branch" ); |
927 | if (CFProtectionBranch || IndirectBranchTracking) { |
928 | SDLoc dl(N); |
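// Materialize the complemented value as an opaque constant and re-invert it
// at run time, so the raw ENDBR byte pattern never appears as an immediate
// in the emitted code.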
929 | SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true); |
930 | Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT); |
931 | --I; |
932 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement); |
933 | ++I; |
934 | MadeChange = true; |
935 | continue; |
936 | } |
937 | } |
938 | } |
939 | |
940 | // If this is a target specific AND node with no flag usages, turn it back |
941 | // into ISD::AND to enable test instruction matching. |
942 | if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) { |
943 | SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
944 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
945 | --I; |
946 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
947 | ++I; |
948 | MadeChange = true; |
949 | continue; |
950 | } |
951 | |
952 | // Convert vector increment or decrement to sub/add with an all-ones |
953 | // constant: |
954 | // add X, <1, 1...> --> sub X, <-1, -1...> |
955 | // sub X, <1, 1...> --> add X, <-1, -1...> |
956 | // The all-ones vector constant can be materialized using a pcmpeq |
957 | // instruction that is commonly recognized as an idiom (has no register |
958 | // dependency), so that's better/smaller than loading a splat 1 constant. |
959 | // |
960 | // But don't do this if it would inhibit a potentially profitable load |
961 | // folding opportunity for the other operand. That only occurs with the |
962 | // intersection of: |
963 | // (1) The other operand (op0) is load foldable. |
964 | // (2) The op is an add (otherwise, we are *creating* an add and can still |
965 | // load fold the other op). |
966 | // (3) The target has AVX (otherwise, we have a destructive add and can't |
967 | // load fold the other op without killing the constant op). |
968 | // (4) The constant 1 vector has multiple uses (so it is profitable to load |
969 | // into a register anyway). |
970 | auto mayPreventLoadFold = [&]() { |
971 | return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) && |
972 | N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && |
973 | !N->getOperand(Num: 1).hasOneUse(); |
974 | }; |
975 | if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && |
976 | N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) { |
977 | APInt SplatVal; |
978 | if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) && |
979 | SplatVal.isOne()) { |
980 | SDLoc DL(N); |
981 | |
982 | MVT VT = N->getSimpleValueType(ResNo: 0); |
983 | unsigned NumElts = VT.getSizeInBits() / 32; |
984 | SDValue AllOnes = |
985 | CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts)); |
986 | AllOnes = CurDAG->getBitcast(VT, V: AllOnes); |
987 | |
988 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
989 | SDValue Res = |
990 | CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes); |
991 | --I; |
992 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
993 | ++I; |
994 | MadeChange = true; |
995 | continue; |
996 | } |
997 | } |
998 | |
999 | switch (N->getOpcode()) { |
1000 | case X86ISD::VBROADCAST: { |
1001 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1002 | // Emulate v32i16/v64i8 broadcast without BWI. |
1003 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1004 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1005 | SDLoc dl(N); |
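// Broadcast at half the vector width, then insert the narrow result into
// both halves of the wide vector.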
1006 | SDValue NarrowBCast = |
1007 | CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0)); |
1008 | SDValue Res = |
1009 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1010 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1011 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1012 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1013 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1014 | |
1015 | --I; |
1016 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1017 | ++I; |
1018 | MadeChange = true; |
1019 | continue; |
1020 | } |
1021 | |
1022 | break; |
1023 | } |
1024 | case X86ISD::VBROADCAST_LOAD: { |
1025 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1026 | // Emulate v32i16/v64i8 broadcast without BWI. |
1027 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1028 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1029 | auto *MemNode = cast<MemSDNode>(Val: N); |
1030 | SDLoc dl(N); |
1031 | SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other); |
1032 | SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; |
1033 | SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( |
1034 | Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(), |
1035 | MMO: MemNode->getMemOperand()); |
1036 | SDValue Res = |
1037 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1038 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1039 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1040 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1041 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1042 | |
1043 | --I; |
1044 | SDValue To[] = {Res, NarrowBCast.getValue(R: 1)}; |
1045 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1046 | ++I; |
1047 | MadeChange = true; |
1048 | continue; |
1049 | } |
1050 | |
1051 | break; |
1052 | } |
1053 | case ISD::LOAD: { |
1054 | // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM |
1055 | // load, then just extract the lower subvector and avoid the second load. |
1056 | auto *Ld = cast<LoadSDNode>(Val: N); |
1057 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1058 | if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() || |
1059 | !(VT.is128BitVector() || VT.is256BitVector())) |
1060 | break; |
1061 | |
1062 | MVT MaxVT = VT; |
1063 | SDNode *MaxLd = nullptr; |
1064 | SDValue Ptr = Ld->getBasePtr(); |
1065 | SDValue Chain = Ld->getChain(); |
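// Find the widest other simple load of the same pointer and chain; its low
// bits cover this load, so we can extract a subvector instead of loading
// again.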
1066 | for (SDNode *User : Ptr->uses()) { |
1067 | auto *UserLd = dyn_cast<LoadSDNode>(Val: User); |
1068 | MVT UserVT = User->getSimpleValueType(ResNo: 0); |
1069 | if (User != N && UserLd && ISD::isNormalLoad(N: User) && |
1070 | UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain && |
1071 | !User->hasAnyUseOfValue(Value: 1) && |
1072 | (UserVT.is256BitVector() || UserVT.is512BitVector()) && |
1073 | UserVT.getSizeInBits() > VT.getSizeInBits() && |
1074 | (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) { |
1075 | MaxLd = User; |
1076 | MaxVT = UserVT; |
1077 | } |
1078 | } |
1079 | if (MaxLd) { |
1080 | SDLoc dl(N); |
1081 | unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits(); |
1082 | MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts); |
SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
                                  N1: SDValue(MaxLd, 0),
                                  N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1086 | SDValue Res = CurDAG->getBitcast(VT, V: Extract); |
1087 | |
1088 | --I; |
1089 | SDValue To[] = {Res, SDValue(MaxLd, 1)}; |
1090 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1091 | ++I; |
1092 | MadeChange = true; |
1093 | continue; |
1094 | } |
1095 | break; |
1096 | } |
1097 | case ISD::VSELECT: { |
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1099 | EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType(); |
1100 | if (EleVT == MVT::i1) |
1101 | break; |
1102 | |
1103 | assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!" ); |
1104 | assert(N->getValueType(0).getVectorElementType() != MVT::i16 && |
1105 | "We can't replace VSELECT with BLENDV in vXi16!" ); |
1106 | SDValue R; |
1107 | if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) == |
1108 | EleVT.getSizeInBits()) { |
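// The condition is a sign-splatted all-ones/all-zeros mask, so the select is
// a bitwise select: VPTERNLOG imm 0xCA computes A ? B : C per bit.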
1109 | R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1110 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2), |
1111 | N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8)); |
1112 | } else { |
1113 | R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1114 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), |
1115 | N3: N->getOperand(Num: 2)); |
1116 | } |
1117 | --I; |
1118 | CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode()); |
1119 | ++I; |
1120 | MadeChange = true; |
1121 | continue; |
1122 | } |
1123 | case ISD::FP_ROUND: |
1124 | case ISD::STRICT_FP_ROUND: |
1125 | case ISD::FP_TO_SINT: |
1126 | case ISD::FP_TO_UINT: |
1127 | case ISD::STRICT_FP_TO_SINT: |
1128 | case ISD::STRICT_FP_TO_UINT: { |
1129 | // Replace vector fp_to_s/uint with their X86 specific equivalent so we |
1130 | // don't need 2 sets of patterns. |
1131 | if (!N->getSimpleValueType(ResNo: 0).isVector()) |
1132 | break; |
1133 | |
1134 | unsigned NewOpc; |
1135 | switch (N->getOpcode()) { |
1136 | default: llvm_unreachable("Unexpected opcode!" ); |
1137 | case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; |
1138 | case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; |
1139 | case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; |
1140 | case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; |
1141 | case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; |
1142 | case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; |
1143 | } |
1144 | SDValue Res; |
1145 | if (N->isStrictFPOpcode()) |
1146 | Res = |
1147 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1148 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)}); |
1149 | else |
1150 | Res = |
1151 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1152 | Operand: N->getOperand(Num: 0)); |
1153 | --I; |
1154 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1155 | ++I; |
1156 | MadeChange = true; |
1157 | continue; |
1158 | } |
1159 | case ISD::SHL: |
1160 | case ISD::SRA: |
1161 | case ISD::SRL: { |
1162 | // Replace vector shifts with their X86 specific equivalent so we don't |
1163 | // need 2 sets of patterns. |
1164 | if (!N->getValueType(ResNo: 0).isVector()) |
1165 | break; |
1166 | |
1167 | unsigned NewOpc; |
1168 | switch (N->getOpcode()) { |
1169 | default: llvm_unreachable("Unexpected opcode!" ); |
1170 | case ISD::SHL: NewOpc = X86ISD::VSHLV; break; |
1171 | case ISD::SRA: NewOpc = X86ISD::VSRAV; break; |
1172 | case ISD::SRL: NewOpc = X86ISD::VSRLV; break; |
1173 | } |
1174 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1175 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
1176 | --I; |
1177 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1178 | ++I; |
1179 | MadeChange = true; |
1180 | continue; |
1181 | } |
1182 | case ISD::ANY_EXTEND: |
1183 | case ISD::ANY_EXTEND_VECTOR_INREG: { |
1184 | // Replace vector any extend with the zero extend equivalents so we don't |
1185 | // need 2 sets of patterns. Ignore vXi1 extensions. |
1186 | if (!N->getValueType(ResNo: 0).isVector()) |
1187 | break; |
1188 | |
1189 | unsigned NewOpc; |
1190 | if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) { |
1191 | assert(N->getOpcode() == ISD::ANY_EXTEND && |
1192 | "Unexpected opcode for mask vector!" ); |
1193 | NewOpc = ISD::SIGN_EXTEND; |
1194 | } else { |
1195 | NewOpc = N->getOpcode() == ISD::ANY_EXTEND |
1196 | ? ISD::ZERO_EXTEND |
1197 | : ISD::ZERO_EXTEND_VECTOR_INREG; |
1198 | } |
1199 | |
1200 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1201 | Operand: N->getOperand(Num: 0)); |
1202 | --I; |
1203 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1204 | ++I; |
1205 | MadeChange = true; |
1206 | continue; |
1207 | } |
1208 | case ISD::FCEIL: |
1209 | case ISD::STRICT_FCEIL: |
1210 | case ISD::FFLOOR: |
1211 | case ISD::STRICT_FFLOOR: |
1212 | case ISD::FTRUNC: |
1213 | case ISD::STRICT_FTRUNC: |
1214 | case ISD::FROUNDEVEN: |
1215 | case ISD::STRICT_FROUNDEVEN: |
1216 | case ISD::FNEARBYINT: |
1217 | case ISD::STRICT_FNEARBYINT: |
1218 | case ISD::FRINT: |
1219 | case ISD::STRICT_FRINT: { |
1220 | // Replace fp rounding with their X86 specific equivalent so we don't |
1221 | // need 2 sets of patterns. |
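// VRNDSCALE immediate (low 4 bits): bits 1:0 give the rounding mode
// (0 = nearest even, 1 = floor, 2 = ceil, 3 = trunc), bit 2 selects
// MXCSR.RC instead, and bit 3 suppresses the precision (inexact) exception.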
1222 | unsigned Imm; |
1223 | switch (N->getOpcode()) { |
1224 | default: llvm_unreachable("Unexpected opcode!" ); |
1225 | case ISD::STRICT_FCEIL: |
1226 | case ISD::FCEIL: Imm = 0xA; break; |
1227 | case ISD::STRICT_FFLOOR: |
1228 | case ISD::FFLOOR: Imm = 0x9; break; |
1229 | case ISD::STRICT_FTRUNC: |
1230 | case ISD::FTRUNC: Imm = 0xB; break; |
1231 | case ISD::STRICT_FROUNDEVEN: |
1232 | case ISD::FROUNDEVEN: Imm = 0x8; break; |
1233 | case ISD::STRICT_FNEARBYINT: |
1234 | case ISD::FNEARBYINT: Imm = 0xC; break; |
1235 | case ISD::STRICT_FRINT: |
1236 | case ISD::FRINT: Imm = 0x4; break; |
1237 | } |
1238 | SDLoc dl(N); |
1239 | bool IsStrict = N->isStrictFPOpcode(); |
1240 | SDValue Res; |
1241 | if (IsStrict) |
1242 | Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl, |
1243 | ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1244 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), |
1245 | CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)}); |
1246 | else |
1247 | Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0), |
1248 | N1: N->getOperand(Num: 0), |
1249 | N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
1250 | --I; |
1251 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1252 | ++I; |
1253 | MadeChange = true; |
1254 | continue; |
1255 | } |
1256 | case X86ISD::FANDN: |
1257 | case X86ISD::FAND: |
1258 | case X86ISD::FOR: |
1259 | case X86ISD::FXOR: { |
1260 | // Widen scalar fp logic ops to vector to reduce isel patterns. |
1261 | // FIXME: Can we do this during lowering/combine. |
1262 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1263 | if (VT.isVector() || VT == MVT::f128) |
1264 | break; |
1265 | |
1266 | MVT VecVT = VT == MVT::f64 ? MVT::v2f64 |
1267 | : VT == MVT::f32 ? MVT::v4f32 |
1268 | : MVT::v8f16; |
1269 | |
1270 | SDLoc dl(N); |
1271 | SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1272 | Operand: N->getOperand(Num: 0)); |
1273 | SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1274 | Operand: N->getOperand(Num: 1)); |
1275 | |
1276 | SDValue Res; |
1277 | if (Subtarget->hasSSE2()) { |
1278 | EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); |
1279 | Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0); |
1280 | Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1); |
1281 | unsigned Opc; |
1282 | switch (N->getOpcode()) { |
1283 | default: llvm_unreachable("Unexpected opcode!" ); |
1284 | case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; |
1285 | case X86ISD::FAND: Opc = ISD::AND; break; |
1286 | case X86ISD::FOR: Opc = ISD::OR; break; |
1287 | case X86ISD::FXOR: Opc = ISD::XOR; break; |
1288 | } |
1289 | Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1); |
1290 | Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res); |
1291 | } else { |
1292 | Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1); |
1293 | } |
1294 | Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res, |
1295 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1296 | --I; |
1297 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1298 | ++I; |
1299 | MadeChange = true; |
1300 | continue; |
1301 | } |
1302 | } |
1303 | |
1304 | if (OptLevel != CodeGenOptLevel::None && |
1305 | // Only do this when the target can fold the load into the call or |
1306 | // jmp. |
1307 | !Subtarget->useIndirectThunkCalls() && |
1308 | ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || |
1309 | (N->getOpcode() == X86ISD::TC_RETURN && |
1310 | (Subtarget->is64Bit() || |
1311 | !getTargetMachine().isPositionIndependent())))) { |
1312 | /// Also try moving call address load from outside callseq_start to just |
1313 | /// before the call to allow it to be folded. |
1314 | /// |
1315 | /// [Load chain] |
1316 | /// ^ |
1317 | /// | |
1318 | /// [Load] |
1319 | /// ^ ^ |
1320 | /// | | |
1321 | /// / \-- |
1322 | /// / | |
1323 | ///[CALLSEQ_START] | |
1324 | /// ^ | |
1325 | /// | | |
1326 | /// [LOAD/C2Reg] | |
1327 | /// | | |
1328 | /// \ / |
1329 | /// \ / |
1330 | /// [CALL] |
1331 | bool HasCallSeq = N->getOpcode() == X86ISD::CALL; |
1332 | SDValue Chain = N->getOperand(Num: 0); |
1333 | SDValue Load = N->getOperand(Num: 1); |
1334 | if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq)) |
1335 | continue; |
1336 | moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain); |
1337 | ++NumLoadMoved; |
1338 | MadeChange = true; |
1339 | continue; |
1340 | } |
1341 | |
// Lower fpround and fpextend nodes that target the FP stack to a store and
// load through the stack. This is a gross hack. We would like to simply mark
1344 | // these as being illegal, but when we do that, legalize produces these when |
1345 | // it expands calls, then expands these in the same legalize pass. We would |
1346 | // like dag combine to be able to hack on these between the call expansion |
1347 | // and the node legalization. As such this pass basically does "really |
1348 | // late" legalization of these inline with the X86 isel pass. |
1349 | // FIXME: This should only happen when not compiled with -O0. |
1350 | switch (N->getOpcode()) { |
1351 | default: continue; |
1352 | case ISD::FP_ROUND: |
1353 | case ISD::FP_EXTEND: |
1354 | { |
1355 | MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType(); |
1356 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1357 | |
1358 | // If any of the sources are vectors, no fp stack involved. |
1359 | if (SrcVT.isVector() || DstVT.isVector()) |
1360 | continue; |
1361 | |
1362 | // If the source and destination are SSE registers, then this is a legal |
1363 | // conversion that should not be lowered. |
1364 | const X86TargetLowering *X86Lowering = |
1365 | static_cast<const X86TargetLowering *>(TLI); |
1366 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1367 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1368 | if (SrcIsSSE && DstIsSSE) |
1369 | continue; |
1370 | |
1371 | if (!SrcIsSSE && !DstIsSSE) { |
1372 | // If this is an FPStack extension, it is a noop. |
1373 | if (N->getOpcode() == ISD::FP_EXTEND) |
1374 | continue; |
1375 | // If this is a value-preserving FPStack truncation, it is a noop. |
1376 | if (N->getConstantOperandVal(Num: 1)) |
1377 | continue; |
1378 | } |
1379 | |
1380 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1381 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1382 | // operations. Based on this, decide what we want to do. |
1383 | MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; |
1384 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1385 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1386 | MachinePointerInfo MPI = |
1387 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1388 | SDLoc dl(N); |
1389 | |
1390 | // FIXME: optimize the case where the src/dest is a load or store? |
1391 | |
1392 | SDValue Store = CurDAG->getTruncStore( |
1393 | Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT); |
1394 | SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store, |
1395 | Ptr: MemTmp, PtrInfo: MPI, MemVT); |
1396 | |
1397 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
// extload we created. This will cause general havoc on the dag because
1399 | // anything below the conversion could be folded into other existing nodes. |
1400 | // To avoid invalidating 'I', back it up to the convert node. |
1401 | --I; |
1402 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result); |
1403 | break; |
1404 | } |
1405 | |
// The sequence of events for lowering STRICT_FP versions of these nodes
// requires dealing with the chain differently, as there is already a
// preexisting chain.
1408 | case ISD::STRICT_FP_ROUND: |
1409 | case ISD::STRICT_FP_EXTEND: |
1410 | { |
1411 | MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType(); |
1412 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1413 | |
1414 | // If any of the sources are vectors, no fp stack involved. |
1415 | if (SrcVT.isVector() || DstVT.isVector()) |
1416 | continue; |
1417 | |
1418 | // If the source and destination are SSE registers, then this is a legal |
1419 | // conversion that should not be lowered. |
1420 | const X86TargetLowering *X86Lowering = |
1421 | static_cast<const X86TargetLowering *>(TLI); |
1422 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1423 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1424 | if (SrcIsSSE && DstIsSSE) |
1425 | continue; |
1426 | |
1427 | if (!SrcIsSSE && !DstIsSSE) { |
1428 | // If this is an FPStack extension, it is a noop. |
1429 | if (N->getOpcode() == ISD::STRICT_FP_EXTEND) |
1430 | continue; |
1431 | // If this is a value-preserving FPStack truncation, it is a noop. |
1432 | if (N->getConstantOperandVal(Num: 2)) |
1433 | continue; |
1434 | } |
1435 | |
1436 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1437 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1438 | // operations. Based on this, decide what we want to do. |
1439 | MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; |
1440 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1441 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1442 | MachinePointerInfo MPI = |
1443 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1444 | SDLoc dl(N); |
1445 | |
1446 | // FIXME: optimize the case where the src/dest is a load or store? |
1447 | |
// Since the operation is StrictFP, use the preexisting chain.
1449 | SDValue Store, Result; |
1450 | if (!SrcIsSSE) { |
1451 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Other); |
1452 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp}; |
1453 | Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT, |
1454 | PtrInfo: MPI, /*Align*/ Alignment: std::nullopt, |
1455 | Flags: MachineMemOperand::MOStore); |
1456 | if (N->getFlags().hasNoFPExcept()) { |
1457 | SDNodeFlags Flags = Store->getFlags(); |
1458 | Flags.setNoFPExcept(true); |
1459 | Store->setFlags(Flags); |
1460 | } |
1461 | } else { |
1462 | assert(SrcVT == MemVT && "Unexpected VT!" ); |
1463 | Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp, |
1464 | PtrInfo: MPI); |
1465 | } |
1466 | |
1467 | if (!DstIsSSE) { |
1468 | SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other); |
1469 | SDValue Ops[] = {Store, MemTmp}; |
1470 | Result = CurDAG->getMemIntrinsicNode( |
1471 | Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI, |
1472 | /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad); |
1473 | if (N->getFlags().hasNoFPExcept()) { |
1474 | SDNodeFlags Flags = Result->getFlags(); |
1475 | Flags.setNoFPExcept(true); |
1476 | Result->setFlags(Flags); |
1477 | } |
1478 | } else { |
1479 | assert(DstVT == MemVT && "Unexpected VT!" ); |
1480 | Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI); |
1481 | } |
1482 | |
1483 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
// extload we created. This will cause general havoc on the dag because
1485 | // anything below the conversion could be folded into other existing nodes. |
1486 | // To avoid invalidating 'I', back it up to the convert node. |
1487 | --I; |
1488 | CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode()); |
1489 | break; |
1490 | } |
1491 | } |
1492 | |
1493 | |
1494 | // Now that we did that, the node is dead. Increment the iterator to the |
1495 | // next node to process, then delete N. |
1496 | ++I; |
1497 | MadeChange = true; |
1498 | } |
1499 | |
1500 | // Remove any dead nodes that may have been left behind. |
1501 | if (MadeChange) |
1502 | CurDAG->RemoveDeadNodes(); |
1503 | } |
1504 | |
1505 | // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. |
1506 | bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { |
1507 | unsigned Opc = N->getMachineOpcode(); |
1508 | if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && |
1509 | Opc != X86::MOVSX64rr8) |
1510 | return false; |
1511 | |
1512 | SDValue N0 = N->getOperand(Num: 0); |
1513 | |
// We need to be extracting the low byte (sub_8bit) of an extend.
1515 | if (!N0.isMachineOpcode() || |
1516 | N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || |
1517 | N0.getConstantOperandVal(i: 1) != X86::sub_8bit) |
1518 | return false; |
1519 | |
1520 | // We're looking for either a movsx or movzx to match the original opcode. |
1521 | unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX |
1522 | : X86::MOVSX32rr8_NOREX; |
1523 | SDValue N00 = N0.getOperand(i: 0); |
1524 | if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) |
1525 | return false; |
1526 | |
1527 | if (Opc == X86::MOVSX64rr8) { |
// If we had a sign extend from 8 to 64 bits, we still need to go from 32
1529 | // to 64. |
1530 | MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N), |
1531 | VT: MVT::i64, Op1: N00); |
1532 | ReplaceUses(F: N, T: Extend); |
1533 | } else { |
1534 | // Ok we can drop this extend and just use the original extend. |
1535 | ReplaceUses(F: N, T: N00.getNode()); |
1536 | } |
1537 | |
1538 | return true; |
1539 | } |
1540 | |
1541 | void X86DAGToDAGISel::PostprocessISelDAG() { |
1542 | // Skip peepholes at -O0. |
1543 | if (TM.getOptLevel() == CodeGenOptLevel::None) |
1544 | return; |
1545 | |
1546 | SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); |
1547 | |
1548 | bool MadeChange = false; |
1549 | while (Position != CurDAG->allnodes_begin()) { |
1550 | SDNode *N = &*--Position; |
1551 | // Skip dead nodes and any non-machine opcodes. |
1552 | if (N->use_empty() || !N->isMachineOpcode()) |
1553 | continue; |
1554 | |
1555 | if (tryOptimizeRem8Extend(N)) { |
1556 | MadeChange = true; |
1557 | continue; |
1558 | } |
1559 | |
1560 | unsigned Opc = N->getMachineOpcode(); |
1561 | switch (Opc) { |
1562 | default: |
1563 | continue; |
// ANDrr/rm + TESTrr -> TESTrr/TESTmr
1565 | case X86::TEST8rr: |
1566 | case X86::TEST16rr: |
1567 | case X86::TEST32rr: |
1568 | case X86::TEST64rr: |
1569 | // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr |
1570 | case X86::CTEST8rr: |
1571 | case X86::CTEST16rr: |
1572 | case X86::CTEST32rr: |
1573 | case X86::CTEST64rr: { |
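// Only fold when the (C)TEST compares the AND result with itself and that
// value has exactly two uses (this node's two operands), so the AND becomes
// dead once the test is rewritten.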
1574 | auto &Op0 = N->getOperand(Num: 0); |
1575 | if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) || |
1576 | !Op0.isMachineOpcode()) |
1577 | continue; |
1578 | SDValue And = N->getOperand(Num: 0); |
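// CASE_ND expands to case labels for both the legacy form of an opcode and
// its APX NDD (new data destination) variant.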
1579 | #define CASE_ND(OP) \ |
1580 | case X86::OP: \ |
1581 | case X86::OP##_ND: |
1582 | switch (And.getMachineOpcode()) { |
1583 | default: |
1584 | continue; |
1585 | CASE_ND(AND8rr) |
1586 | CASE_ND(AND16rr) |
1587 | CASE_ND(AND32rr) |
1588 | CASE_ND(AND64rr) { |
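// The EFLAGS result of the AND must be unused; otherwise the AND cannot be
// removed and there is nothing to gain.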
1589 | if (And->hasAnyUseOfValue(Value: 1)) |
1590 | continue; |
1591 | SmallVector<SDValue> Ops(N->op_values()); |
1592 | Ops[0] = And.getOperand(i: 0); |
1593 | Ops[1] = And.getOperand(i: 1); |
1594 | MachineSDNode *Test = |
1595 | CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops); |
1596 | ReplaceUses(F: N, T: Test); |
1597 | MadeChange = true; |
1598 | continue; |
1599 | } |
1600 | CASE_ND(AND8rm) |
1601 | CASE_ND(AND16rm) |
1602 | CASE_ND(AND32rm) |
1603 | CASE_ND(AND64rm) { |
1604 | if (And->hasAnyUseOfValue(Value: 1)) |
1605 | continue; |
1606 | unsigned NewOpc; |
1607 | bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc); |
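// Map each AND*rm form to the corresponding TEST*mr, or to CTEST*mr when the
// user is a conditional test.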
1608 | #define FROM_TO(A, B) \ |
1609 | CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \ |
1610 | break; |
1611 | switch (And.getMachineOpcode()) { |
1612 | FROM_TO(AND8rm, TEST8mr); |
1613 | FROM_TO(AND16rm, TEST16mr); |
1614 | FROM_TO(AND32rm, TEST32mr); |
1615 | FROM_TO(AND64rm, TEST64mr); |
1616 | } |
1617 | #undef FROM_TO |
1618 | #undef CASE_ND |
1619 | // Need to swap the memory and register operand. |
1620 | SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2), |
1621 | And.getOperand(i: 3), And.getOperand(i: 4), |
1622 | And.getOperand(i: 5), And.getOperand(i: 0)}; |
1623 | // CC, Cflags. |
1624 | if (IsCTESTCC) { |
1625 | Ops.push_back(Elt: N->getOperand(Num: 2)); |
1626 | Ops.push_back(Elt: N->getOperand(Num: 3)); |
1627 | } |
1628 | // Chain of memory load |
1629 | Ops.push_back(Elt: And.getOperand(i: 6)); |
1630 | // Glue |
1631 | if (IsCTESTCC) |
1632 | Ops.push_back(Elt: N->getOperand(Num: 4)); |
1633 | |
1634 | MachineSDNode *Test = CurDAG->getMachineNode( |
1635 | Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops); |
1636 | CurDAG->setNodeMemRefs( |
1637 | N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands()); |
1638 | ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1)); |
1639 | ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0)); |
1640 | MadeChange = true; |
1641 | continue; |
1642 | } |
1643 | } |
1644 | } |
1645 | // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is |
1646 | // used. We're doing this late so we can prefer to fold the AND into masked |
1647 | // comparisons. Doing that can be better for the live range of the mask |
1648 | // register. |
1649 | case X86::KORTESTBrr: |
1650 | case X86::KORTESTWrr: |
1651 | case X86::KORTESTDrr: |
1652 | case X86::KORTESTQrr: { |
1653 | SDValue Op0 = N->getOperand(Num: 0); |
1654 | if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) || |
1655 | !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0))) |
1656 | continue; |
1657 | #define CASE(A) \ |
1658 | case X86::A: \ |
1659 | break; |
1660 | switch (Op0.getMachineOpcode()) { |
1661 | default: |
1662 | continue; |
1663 | CASE(KANDBrr) |
1664 | CASE(KANDWrr) |
1665 | CASE(KANDDrr) |
1666 | CASE(KANDQrr) |
1667 | } |
1668 | unsigned NewOpc; |
1669 | #define FROM_TO(A, B) \ |
1670 | case X86::A: \ |
1671 | NewOpc = X86::B; \ |
1672 | break; |
1673 | switch (Opc) { |
1674 | FROM_TO(KORTESTBrr, KTESTBrr) |
1675 | FROM_TO(KORTESTWrr, KTESTWrr) |
1676 | FROM_TO(KORTESTDrr, KTESTDrr) |
1677 | FROM_TO(KORTESTQrr, KTESTQrr) |
1678 | } |
1679 | // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other |
1680 | // KAND instructions and KTEST use the same ISA feature. |
1681 | if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI()) |
1682 | continue; |
1683 | #undef FROM_TO |
1684 | MachineSDNode *KTest = CurDAG->getMachineNode( |
1685 | Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1)); |
1686 | ReplaceUses(F: N, T: KTest); |
1687 | MadeChange = true; |
1688 | continue; |
1689 | } |
// Attempt to remove vector moves that were inserted to zero upper bits.
1691 | case TargetOpcode::SUBREG_TO_REG: { |
1692 | unsigned SubRegIdx = N->getConstantOperandVal(Num: 2); |
1693 | if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) |
1694 | continue; |
1695 | |
1696 | SDValue Move = N->getOperand(Num: 1); |
1697 | if (!Move.isMachineOpcode()) |
1698 | continue; |
1699 | |
// Make sure it's one of the move opcodes we recognize.
1701 | switch (Move.getMachineOpcode()) { |
1702 | default: |
1703 | continue; |
1704 | CASE(VMOVAPDrr) CASE(VMOVUPDrr) |
1705 | CASE(VMOVAPSrr) CASE(VMOVUPSrr) |
1706 | CASE(VMOVDQArr) CASE(VMOVDQUrr) |
1707 | CASE(VMOVAPDYrr) CASE(VMOVUPDYrr) |
1708 | CASE(VMOVAPSYrr) CASE(VMOVUPSYrr) |
1709 | CASE(VMOVDQAYrr) CASE(VMOVDQUYrr) |
1710 | CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr) |
1711 | CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr) |
1712 | CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr) |
1713 | CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr) |
1714 | CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr) |
1715 | CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr) |
1716 | CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr) |
1717 | CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr) |
1718 | } |
1719 | #undef CASE |
1720 | |
1721 | SDValue In = Move.getOperand(i: 0); |
1722 | if (!In.isMachineOpcode() || |
1723 | In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) |
1724 | continue; |
1725 | |
1726 | // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers |
1727 | // the SHA instructions which use a legacy encoding. |
1728 | uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags; |
1729 | if ((TSFlags & X86II::EncodingMask) != X86II::VEX && |
1730 | (TSFlags & X86II::EncodingMask) != X86II::EVEX && |
1731 | (TSFlags & X86II::EncodingMask) != X86II::XOP) |
1732 | continue; |
1733 | |
// The producing instruction is another vector instruction, so we can drop
// the move.
1736 | CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2)); |
1737 | MadeChange = true; |
1738 | } |
1739 | } |
1740 | } |
1741 | |
1742 | if (MadeChange) |
1743 | CurDAG->RemoveDeadNodes(); |
1744 | } |
1745 | |
1746 | |
1747 | /// Emit any code that needs to be executed only in the main function. |
1748 | void X86DAGToDAGISel::emitSpecialCodeForMain() { |
1749 | if (Subtarget->isTargetCygMing()) { |
1750 | TargetLowering::ArgListTy Args; |
1751 | auto &DL = CurDAG->getDataLayout(); |
1752 | |
1753 | TargetLowering::CallLoweringInfo CLI(*CurDAG); |
1754 | CLI.setChain(CurDAG->getRoot()) |
1755 | .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()), |
1756 | Target: CurDAG->getExternalSymbol(Sym: "__main" , VT: TLI->getPointerTy(DL)), |
1757 | ArgsList: std::move(Args)); |
1758 | const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); |
1759 | std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); |
1760 | CurDAG->setRoot(Result.second); |
1761 | } |
1762 | } |
1763 | |
1764 | void X86DAGToDAGISel::emitFunctionEntryCode() { |
1765 | // If this is main, emit special code for main. |
1766 | const Function &F = MF->getFunction(); |
1767 | if (F.hasExternalLinkage() && F.getName() == "main" ) |
1768 | emitSpecialCodeForMain(); |
1769 | } |
1770 | |
1771 | static bool isDispSafeForFrameIndex(int64_t Val) { |
1772 | // On 64-bit platforms, we can run into an issue where a frame index |
1773 | // includes a displacement that, when added to the explicit displacement, |
1774 | // will overflow the displacement field. Assuming that the frame index |
1775 | // displacement fits into a 31-bit integer (which is only slightly more |
1776 | // aggressive than the current fundamental assumption that it fits into |
1777 | // a 32-bit integer), a 31-bit disp should always be safe. |
1778 | return isInt<31>(x: Val); |
1779 | } |
1780 | |
1781 | bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, |
1782 | X86ISelAddressMode &AM) { |
1783 | // We may have already matched a displacement and the caller just added the |
1784 | // symbolic displacement. So we still need to do the checks even if Offset |
1785 | // is zero. |
1786 | |
1787 | int64_t Val = AM.Disp + Offset; |
1788 | |
1789 | // Cannot combine ExternalSymbol displacements with integer offsets. |
1790 | if (Val != 0 && (AM.ES || AM.MCSym)) |
1791 | return true; |
1792 | |
1793 | CodeModel::Model M = TM.getCodeModel(); |
1794 | if (Subtarget->is64Bit()) { |
1795 | if (Val != 0 && |
1796 | !X86::isOffsetSuitableForCodeModel(Offset: Val, M, |
1797 | hasSymbolicDisplacement: AM.hasSymbolicDisplacement())) |
1798 | return true; |
1799 | // In addition to the checks required for a register base, check that |
1800 | // we do not try to use an unsafe Disp with a frame index. |
1801 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && |
1802 | !isDispSafeForFrameIndex(Val)) |
1803 | return true; |
1804 | // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to |
1805 | // 64 bits. Instructions with 32-bit register addresses perform this zero |
1806 | // extension for us and we can safely ignore the high bits of Offset. |
1807 | // Instructions with only a 32-bit immediate address do not, though: they |
// sign extend instead. This means that only the low 2GB of the address space
// is directly addressable; we need indirect addressing for the high 2GB of
// address space.
1811 | // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the |
1812 | // implicit zero extension of instructions would cover up any problem. |
1813 | // However, we have asserts elsewhere that get triggered if we do, so keep |
1814 | // the checks for now. |
1815 | // TODO: We would actually be able to accept these, as well as the same |
1816 | // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand |
1817 | // to get an address size override to be emitted. However, this |
1818 | // pseudo-register is not part of any register class and therefore causes |
1819 | // MIR verification to fail. |
1820 | if (Subtarget->isTarget64BitILP32() && !isUInt<31>(x: Val) && |
1821 | !AM.hasBaseOrIndexReg()) |
1822 | return true; |
1823 | } |
1824 | AM.Disp = Val; |
1825 | return false; |
1826 | } |
1827 | |
1828 | bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
1829 | bool AllowSegmentRegForX32) { |
1830 | SDValue Address = N->getOperand(Num: 1); |
1831 | |
1832 | // load gs:0 -> GS segment register. |
1833 | // load fs:0 -> FS segment register. |
1834 | // |
1835 | // This optimization is generally valid because the GNU TLS model defines that |
1836 | // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode |
1837 | // with 32-bit registers, as we get in ILP32 mode, those registers are first |
// zero-extended to 64 bits and then added to the base address, which gives
1839 | // unwanted results when the register holds a negative value. |
1840 | // For more information see http://people.redhat.com/drepper/tls.pdf |
1841 | if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr && |
1842 | !IndirectTlsSegRefs && |
1843 | (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || |
1844 | Subtarget->isTargetFuchsia())) { |
1845 | if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) |
1846 | return true; |
1847 | switch (N->getPointerInfo().getAddrSpace()) { |
1848 | case X86AS::GS: |
1849 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
1850 | return false; |
1851 | case X86AS::FS: |
1852 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
1853 | return false; |
1854 | // Address space X86AS::SS is not handled here, because it is not used to |
1855 | // address TLS areas. |
1856 | } |
1857 | } |
1858 | |
1859 | return true; |
1860 | } |
1861 | |
1862 | /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing |
1863 | /// mode. These wrap things that will resolve down into a symbol reference. |
1864 | /// If no match is possible, this returns true, otherwise it returns false. |
1865 | bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { |
1866 | // If the addressing mode already has a symbol as the displacement, we can |
1867 | // never match another symbol. |
1868 | if (AM.hasSymbolicDisplacement()) |
1869 | return true; |
1870 | |
1871 | bool IsRIPRelTLS = false; |
1872 | bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; |
1873 | if (IsRIPRel) { |
1874 | SDValue Val = N.getOperand(i: 0); |
1875 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
1876 | IsRIPRelTLS = true; |
1877 | } |
1878 | |
1879 | // We can't use an addressing mode in the 64-bit large code model. |
1880 | // Global TLS addressing is an exception. In the medium code model, |
// we can use such a mode when RIP wrappers are present.
1882 | // That signifies access to globals that are known to be "near", |
1883 | // such as the GOT itself. |
1884 | CodeModel::Model M = TM.getCodeModel(); |
1885 | if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS) |
1886 | return true; |
1887 | |
1888 | // Base and index reg must be 0 in order to use %rip as base. |
1889 | if (IsRIPRel && AM.hasBaseOrIndexReg()) |
1890 | return true; |
1891 | |
1892 | // Make a local copy in case we can't do this fold. |
1893 | X86ISelAddressMode Backup = AM; |
1894 | |
1895 | int64_t Offset = 0; |
1896 | SDValue N0 = N.getOperand(i: 0); |
1897 | if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) { |
1898 | AM.GV = G->getGlobal(); |
1899 | AM.SymbolFlags = G->getTargetFlags(); |
1900 | Offset = G->getOffset(); |
1901 | } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) { |
1902 | AM.CP = CP->getConstVal(); |
1903 | AM.Alignment = CP->getAlign(); |
1904 | AM.SymbolFlags = CP->getTargetFlags(); |
1905 | Offset = CP->getOffset(); |
1906 | } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) { |
1907 | AM.ES = S->getSymbol(); |
1908 | AM.SymbolFlags = S->getTargetFlags(); |
1909 | } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) { |
1910 | AM.MCSym = S->getMCSymbol(); |
1911 | } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) { |
1912 | AM.JT = J->getIndex(); |
1913 | AM.SymbolFlags = J->getTargetFlags(); |
1914 | } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) { |
1915 | AM.BlockAddr = BA->getBlockAddress(); |
1916 | AM.SymbolFlags = BA->getTargetFlags(); |
1917 | Offset = BA->getOffset(); |
1918 | } else |
1919 | llvm_unreachable("Unhandled symbol reference node." ); |
1920 | |
1921 | // Can't use an addressing mode with large globals. |
1922 | if (Subtarget->is64Bit() && !IsRIPRel && AM.GV && |
1923 | TM.isLargeGlobalValue(GV: AM.GV)) { |
1924 | AM = Backup; |
1925 | return true; |
1926 | } |
1927 | |
1928 | if (foldOffsetIntoAddress(Offset, AM)) { |
1929 | AM = Backup; |
1930 | return true; |
1931 | } |
1932 | |
1933 | if (IsRIPRel) |
1934 | AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64)); |
1935 | |
1936 | // Commit the changes now that we know this fold is safe. |
1937 | return false; |
1938 | } |
1939 | |
1940 | /// Add the specified node to the specified addressing mode, returning true if |
1941 | /// it cannot be done. This just pattern matches for the addressing mode. |
1942 | bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { |
1943 | if (matchAddressRecursively(N, AM, Depth: 0)) |
1944 | return true; |
1945 | |
1946 | // Post-processing: Make a second attempt to fold a load, if we now know |
1947 | // that there will not be any other register. This is only performed for |
1948 | // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded |
1949 | // any foldable load the first time. |
1950 | if (Subtarget->isTarget64BitILP32() && |
1951 | AM.BaseType == X86ISelAddressMode::RegBase && |
1952 | AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { |
1953 | SDValue Save_Base_Reg = AM.Base_Reg; |
1954 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) { |
1955 | AM.Base_Reg = SDValue(); |
1956 | if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true)) |
1957 | AM.Base_Reg = Save_Base_Reg; |
1958 | } |
1959 | } |
1960 | |
1961 | // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has |
1962 | // a smaller encoding and avoids a scaled-index. |
1963 | if (AM.Scale == 2 && |
1964 | AM.BaseType == X86ISelAddressMode::RegBase && |
1965 | AM.Base_Reg.getNode() == nullptr) { |
1966 | AM.Base_Reg = AM.IndexReg; |
1967 | AM.Scale = 1; |
1968 | } |
1969 | |
1970 | // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, |
1971 | // because it has a smaller encoding. |
1972 | if (TM.getCodeModel() != CodeModel::Large && |
1973 | (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() && |
1974 | AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && |
1975 | AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && |
1976 | AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) { |
1977 | AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64); |
1978 | } |
1979 | |
1980 | return false; |
1981 | } |
1982 | |
1983 | bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, |
1984 | unsigned Depth) { |
1985 | // Add an artificial use to this node so that we can keep track of |
1986 | // it if it gets CSE'd with a different node. |
1987 | HandleSDNode Handle(N); |
1988 | |
1989 | X86ISelAddressMode Backup = AM; |
1990 | if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) && |
1991 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1)) |
1992 | return false; |
1993 | AM = Backup; |
1994 | |
1995 | // Try again after commutating the operands. |
1996 | if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
1997 | Depth: Depth + 1) && |
1998 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1)) |
1999 | return false; |
2000 | AM = Backup; |
2001 | |
2002 | // If we couldn't fold both operands into the address at the same time, |
2003 | // see if we can just put each operand into a register and fold at least |
2004 | // the add. |
2005 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2006 | !AM.Base_Reg.getNode() && |
2007 | !AM.IndexReg.getNode()) { |
2008 | N = Handle.getValue(); |
2009 | AM.Base_Reg = N.getOperand(i: 0); |
2010 | AM.IndexReg = N.getOperand(i: 1); |
2011 | AM.Scale = 1; |
2012 | return false; |
2013 | } |
2014 | N = Handle.getValue(); |
2015 | return true; |
2016 | } |
2017 | |
2018 | // Insert a node into the DAG at least before the Pos node's position. This |
2019 | // will reposition the node as needed, and will assign it a node ID that is <= |
2020 | // the Pos node's ID. Note that this does *not* preserve the uniqueness of node |
2021 | // IDs! The selection DAG must no longer depend on their uniqueness when this |
2022 | // is used. |
2023 | static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { |
2024 | if (N->getNodeId() == -1 || |
2025 | (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) > |
2026 | SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) { |
2027 | DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode()); |
2028 | // Mark Node as invalid for pruning as after this it may be a successor to a |
// selected node but otherwise be in the same position as Pos.
2030 | // Conservatively mark it with the same -abs(Id) to assure node id |
2031 | // invariant is preserved. |
2032 | N->setNodeId(Pos->getNodeId()); |
2033 | SelectionDAGISel::InvalidateNodeId(N: N.getNode()); |
2034 | } |
2035 | } |
2036 | |
2037 | // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if |
// safe. This allows us to convert the shift and AND into an h-register
2039 | // extract and a scaled index. Returns false if the simplification is |
2040 | // performed. |
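// For example, with C1 == 2: (x >> 6) & 0x3fc becomes ((x >> 8) & 0xff) << 2,
// i.e. a high-byte extract used as an index with scale 4.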
2041 | static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, |
2042 | uint64_t Mask, |
2043 | SDValue Shift, SDValue X, |
2044 | X86ISelAddressMode &AM) { |
2045 | if (Shift.getOpcode() != ISD::SRL || |
2046 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2047 | !Shift.hasOneUse()) |
2048 | return true; |
2049 | |
2050 | int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1); |
2051 | if (ScaleLog <= 0 || ScaleLog >= 4 || |
2052 | Mask != (0xffu << ScaleLog)) |
2053 | return true; |
2054 | |
2055 | MVT XVT = X.getSimpleValueType(); |
2056 | MVT VT = N.getSimpleValueType(); |
2057 | SDLoc DL(N); |
2058 | SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8); |
2059 | SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT); |
2060 | SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight); |
2061 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask); |
2062 | SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT); |
2063 | SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8); |
2064 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount); |
2065 | |
2066 | // Insert the new nodes into the topological ordering. We must do this in |
2067 | // a valid topological ordering as nothing is going to go back and re-sort |
2068 | // these nodes. We continually insert before 'N' in sequence as this is |
2069 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2070 | // hierarchy left to express. |
2071 | insertDAGNode(DAG, Pos: N, N: Eight); |
2072 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2073 | insertDAGNode(DAG, Pos: N, N: Srl); |
2074 | insertDAGNode(DAG, Pos: N, N: And); |
2075 | insertDAGNode(DAG, Pos: N, N: Ext); |
2076 | insertDAGNode(DAG, Pos: N, N: ShlCount); |
2077 | insertDAGNode(DAG, Pos: N, N: Shl); |
2078 | DAG.ReplaceAllUsesWith(From: N, To: Shl); |
2079 | DAG.RemoveDeadNode(N: N.getNode()); |
2080 | AM.IndexReg = Ext; |
2081 | AM.Scale = (1 << ScaleLog); |
2082 | return false; |
2083 | } |
2084 | |
2085 | // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this |
2086 | // allows us to fold the shift into this addressing mode. Returns false if the |
2087 | // transform succeeded. |
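// For example, (x << 2) & 0x3fc becomes (x & 0xff) << 2, letting the shift
// become an address scale of 4.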
2088 | static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, |
2089 | X86ISelAddressMode &AM) { |
2090 | SDValue Shift = N.getOperand(i: 0); |
2091 | |
2092 | // Use a signed mask so that shifting right will insert sign bits. These |
2093 | // bits will be removed when we shift the result left so it doesn't matter |
2094 | // what we use. This might allow a smaller immediate encoding. |
2095 | int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue(); |
2096 | |
2097 | // If we have an any_extend feeding the AND, look through it to see if there |
2098 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
2099 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
2100 | bool FoundAnyExtend = false; |
2101 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
2102 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
2103 | isUInt<32>(x: Mask)) { |
2104 | FoundAnyExtend = true; |
2105 | Shift = Shift.getOperand(i: 0); |
2106 | } |
2107 | |
2108 | if (Shift.getOpcode() != ISD::SHL || |
2109 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2110 | return true; |
2111 | |
2112 | SDValue X = Shift.getOperand(i: 0); |
2113 | |
2114 | // Not likely to be profitable if either the AND or SHIFT node has more |
2115 | // than one use (unless all uses are for address computation). Besides, |
2116 | // isel mechanism requires their node ids to be reused. |
2117 | if (!N.hasOneUse() || !Shift.hasOneUse()) |
2118 | return true; |
2119 | |
2120 | // Verify that the shift amount is something we can fold. |
2121 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2122 | if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) |
2123 | return true; |
2124 | |
2125 | MVT VT = N.getSimpleValueType(); |
2126 | SDLoc DL(N); |
2127 | if (FoundAnyExtend) { |
2128 | SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X); |
2129 | insertDAGNode(DAG, Pos: N, N: NewX); |
2130 | X = NewX; |
2131 | } |
2132 | |
2133 | SDValue NewMask = DAG.getConstant(Val: Mask >> ShiftAmt, DL, VT); |
2134 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask); |
2135 | SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1)); |
2136 | |
2137 | // Insert the new nodes into the topological ordering. We must do this in |
2138 | // a valid topological ordering as nothing is going to go back and re-sort |
2139 | // these nodes. We continually insert before 'N' in sequence as this is |
2140 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2141 | // hierarchy left to express. |
2142 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2143 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2144 | insertDAGNode(DAG, Pos: N, N: NewShift); |
2145 | DAG.ReplaceAllUsesWith(From: N, To: NewShift); |
2146 | DAG.RemoveDeadNode(N: N.getNode()); |
2147 | |
2148 | AM.Scale = 1 << ShiftAmt; |
2149 | AM.IndexReg = NewAnd; |
2150 | return false; |
2151 | } |
2152 | |
2153 | // Implement some heroics to detect shifts of masked values where the mask can |
2154 | // be replaced by extending the shift and undoing that in the addressing mode |
2155 | // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and |
2156 | // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in |
2157 | // the addressing mode. This results in code such as: |
2158 | // |
2159 | // int f(short *y, int *lookup_table) { |
2160 | // ... |
2161 | // return *y + lookup_table[*y >> 11]; |
2162 | // } |
2163 | // |
2164 | // Turning into: |
2165 | // movzwl (%rdi), %eax |
2166 | // movl %eax, %ecx |
2167 | // shrl $11, %ecx |
2168 | // addl (%rsi,%rcx,4), %eax |
2169 | // |
2170 | // Instead of: |
2171 | // movzwl (%rdi), %eax |
2172 | // movl %eax, %ecx |
2173 | // shrl $9, %ecx |
// andl $124, %ecx
2175 | // addl (%rsi,%rcx), %eax |
2176 | // |
2177 | // Note that this function assumes the mask is provided as a mask *after* the |
2178 | // value is shifted. The input chain may or may not match that, but computing |
2179 | // such a mask is trivial. |
2180 | static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, |
2181 | uint64_t Mask, |
2182 | SDValue Shift, SDValue X, |
2183 | X86ISelAddressMode &AM) { |
2184 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || |
2185 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2186 | return true; |
2187 | |
2188 | // We need to ensure that mask is a continuous run of bits. |
2189 | unsigned MaskIdx, MaskLen; |
2190 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2191 | return true; |
2192 | unsigned MaskLZ = 64 - (MaskIdx + MaskLen); |
2193 | |
2194 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2195 | |
2196 | // The amount of shift we're trying to fit into the addressing mode is taken |
2197 | // from the shifted mask index (number of trailing zeros of the mask). |
2198 | unsigned AMShiftAmt = MaskIdx; |
2199 | |
2200 | // There is nothing we can do here unless the mask is removing some bits. |
2201 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2202 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2203 | |
2204 | // Scale the leading zero count down based on the actual size of the value. |
2205 | // Also scale it down based on the size of the shift. |
2206 | unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; |
2207 | if (MaskLZ < ScaleDown) |
2208 | return true; |
2209 | MaskLZ -= ScaleDown; |
2210 | |
2211 | // The final check is to ensure that any masked out high bits of X are |
2212 | // already known to be zero. Otherwise, the mask has a semantic impact |
2213 | // other than masking out a couple of low bits. Unfortunately, because of |
2214 | // the mask, zero extensions will be removed from operands in some cases. |
2215 | // This code works extra hard to look through extensions because we can |
2216 | // replace them with zero extensions cheaply if necessary. |
2217 | bool ReplacingAnyExtend = false; |
2218 | if (X.getOpcode() == ISD::ANY_EXTEND) { |
2219 | unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - |
2220 | X.getOperand(i: 0).getSimpleValueType().getSizeInBits(); |
2221 | // Assume that we'll replace the any-extend with a zero-extend, and |
2222 | // narrow the search to the extended value. |
2223 | X = X.getOperand(i: 0); |
2224 | MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; |
2225 | ReplacingAnyExtend = true; |
2226 | } |
2227 | APInt MaskedHighBits = |
2228 | APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ); |
2229 | if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits)) |
2230 | return true; |
2231 | |
2232 | // We've identified a pattern that can be transformed into a single shift |
2233 | // and an addressing mode. Make it so. |
2234 | MVT VT = N.getSimpleValueType(); |
2235 | if (ReplacingAnyExtend) { |
2236 | assert(X.getValueType() != VT); |
2237 | // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. |
2238 | SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X); |
2239 | insertDAGNode(DAG, Pos: N, N: NewX); |
2240 | X = NewX; |
2241 | } |
2242 | |
2243 | MVT XVT = X.getSimpleValueType(); |
2244 | SDLoc DL(N); |
2245 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2246 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2247 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT); |
2248 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2249 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2250 | |
2251 | // Insert the new nodes into the topological ordering. We must do this in |
2252 | // a valid topological ordering as nothing is going to go back and re-sort |
2253 | // these nodes. We continually insert before 'N' in sequence as this is |
2254 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2255 | // hierarchy left to express. |
2256 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2257 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2258 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2259 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2260 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2261 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2262 | DAG.RemoveDeadNode(N: N.getNode()); |
2263 | |
2264 | AM.Scale = 1 << AMShiftAmt; |
2265 | AM.IndexReg = NewExt; |
2266 | return false; |
2267 | } |
2268 | |
2269 | // Transform "(X >> SHIFT) & (MASK << C1)" to |
2270 | // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be |
2271 | // matched to a BEXTR later. Returns false if the simplification is performed. |
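// For example, (x >> 4) & (0x1f << 2) becomes ((x >> 6) & 0x1f) << 2; the
// shift-and-mask maps to BEXTR and the final shl to an address scale of 4.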
2272 | static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, |
2273 | uint64_t Mask, |
2274 | SDValue Shift, SDValue X, |
2275 | X86ISelAddressMode &AM, |
2276 | const X86Subtarget &Subtarget) { |
2277 | if (Shift.getOpcode() != ISD::SRL || |
2278 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2279 | !Shift.hasOneUse() || !N.hasOneUse()) |
2280 | return true; |
2281 | |
2282 | // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. |
2283 | if (!Subtarget.hasTBM() && |
2284 | !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) |
2285 | return true; |
2286 | |
2287 | // We need to ensure that mask is a continuous run of bits. |
2288 | unsigned MaskIdx, MaskLen; |
2289 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2290 | return true; |
2291 | |
2292 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2293 | |
2294 | // The amount of shift we're trying to fit into the addressing mode is taken |
2295 | // from the shifted mask index (number of trailing zeros of the mask). |
2296 | unsigned AMShiftAmt = MaskIdx; |
2297 | |
2298 | // There is nothing we can do here unless the mask is removing some bits. |
2299 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2300 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2301 | |
2302 | MVT XVT = X.getSimpleValueType(); |
2303 | MVT VT = N.getSimpleValueType(); |
2304 | SDLoc DL(N); |
2305 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2306 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2307 | SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT); |
2308 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask); |
2309 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT); |
2310 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2311 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2312 | |
2313 | // Insert the new nodes into the topological ordering. We must do this in |
2314 | // a valid topological ordering as nothing is going to go back and re-sort |
2315 | // these nodes. We continually insert before 'N' in sequence as this is |
2316 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2317 | // hierarchy left to express. |
2318 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2319 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2320 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2321 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2322 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2323 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2324 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2325 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2326 | DAG.RemoveDeadNode(N: N.getNode()); |
2327 | |
2328 | AM.Scale = 1 << AMShiftAmt; |
2329 | AM.IndexReg = NewExt; |
2330 | return false; |
2331 | } |
2332 | |
2333 | // Attempt to peek further into a scaled index register, collecting additional |
// extensions / offsets / etc. Returns \p N if we can't peek any further.
2335 | SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N, |
2336 | X86ISelAddressMode &AM, |
2337 | unsigned Depth) { |
2338 | assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched" ); |
2339 | assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) && |
2340 | "Illegal index scale" ); |
2341 | |
2342 | // Limit recursion. |
2343 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2344 | return N; |
2345 | |
2346 | EVT VT = N.getValueType(); |
2347 | unsigned Opc = N.getOpcode(); |
2348 | |
2349 | // index: add(x,c) -> index: x, disp + c |
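// For example, with AM.Scale == 4, add(x, 8) folds 4 * 8 == 32 into the
// displacement and continues matching x as the index.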
2350 | if (CurDAG->isBaseWithConstantOffset(Op: N)) { |
2351 | auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1)); |
2352 | uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale; |
2353 | if (!foldOffsetIntoAddress(Offset, AM)) |
2354 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2355 | } |
2356 | |
2357 | // index: add(x,x) -> index: x, scale * 2 |
2358 | if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) { |
2359 | if (AM.Scale <= 4) { |
2360 | AM.Scale *= 2; |
2361 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2362 | } |
2363 | } |
2364 | |
2365 | // index: shl(x,i) -> index: x, scale * (1 << i) |
2366 | if (Opc == X86ISD::VSHLI) { |
2367 | uint64_t ShiftAmt = N.getConstantOperandVal(i: 1); |
2368 | uint64_t ScaleAmt = 1ULL << ShiftAmt; |
2369 | if ((AM.Scale * ScaleAmt) <= 8) { |
2370 | AM.Scale *= ScaleAmt; |
2371 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2372 | } |
2373 | } |
2374 | |
2375 | // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c) |
2376 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
2377 | if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2378 | SDValue Src = N.getOperand(i: 0); |
2379 | if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() && |
2380 | Src.hasOneUse()) { |
2381 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2382 | SDValue AddSrc = Src.getOperand(i: 0); |
2383 | auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1)); |
2384 | uint64_t Offset = (uint64_t)AddVal->getSExtValue(); |
2385 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
2386 | SDLoc DL(N); |
2387 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2388 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
2389 | SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal); |
2390 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2391 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2392 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2393 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2394 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2395 | return ExtSrc; |
2396 | } |
2397 | } |
2398 | } |
2399 | } |
2400 | |
2401 | // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c) |
2402 | // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c) |
2403 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
2404 | if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2405 | SDValue Src = N.getOperand(i: 0); |
2406 | unsigned SrcOpc = Src.getOpcode(); |
2407 | if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) || |
2408 | CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) && |
2409 | Src.hasOneUse()) { |
2410 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2411 | SDValue AddSrc = Src.getOperand(i: 0); |
2412 | uint64_t Offset = Src.getConstantOperandVal(i: 1); |
2413 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
2414 | SDLoc DL(N); |
2415 | SDValue Res; |
2416 | // If we're also scaling, see if we can use that as well. |
2417 | if (AddSrc.getOpcode() == ISD::SHL && |
2418 | isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) { |
2419 | SDValue ShVal = AddSrc.getOperand(i: 0); |
2420 | uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1); |
2421 | APInt HiBits = |
2422 | APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt); |
2423 | uint64_t ScaleAmt = 1ULL << ShAmt; |
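// Only fold the inner shl into the scale if the combined scale stays <= 8
// and no set bits are shifted out (nuw, or known-zero high bits), so that
// zext(shl(x, c)) == shl(zext(x), c).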
2424 | if ((AM.Scale * ScaleAmt) <= 8 && |
2425 | (AddSrc->getFlags().hasNoUnsignedWrap() || |
2426 | CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) { |
2427 | AM.Scale *= ScaleAmt; |
2428 | SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal); |
2429 | SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal, |
2430 | N2: AddSrc.getOperand(i: 1)); |
2431 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal); |
2432 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift); |
2433 | AddSrc = ExtShift; |
2434 | Res = ExtShVal; |
2435 | } |
2436 | } |
2437 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2438 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
2439 | SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal); |
2440 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2441 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2442 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2443 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2444 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2445 | return Res ? Res : ExtSrc; |
2446 | } |
2447 | } |
2448 | } |
2449 | } |
2450 | |
2451 | // TODO: Handle extensions, shifted masks etc. |
2452 | return N; |
2453 | } |
2454 | |
2455 | bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
2456 | unsigned Depth) { |
2457 | SDLoc dl(N); |
2458 | LLVM_DEBUG({ |
2459 | dbgs() << "MatchAddress: " ; |
2460 | AM.dump(CurDAG); |
2461 | }); |
2462 | // Limit recursion. |
2463 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2464 | return matchAddressBase(N, AM); |
2465 | |
2466 | // If this is already a %rip relative address, we can only merge immediates |
2467 | // into it. Instead of handling this in every case, we handle it here. |
2468 | // RIP relative addressing: %rip + 32-bit displacement! |
2469 | if (AM.isRIPRelative()) { |
2470 | // FIXME: JumpTable and ExternalSymbol address currently don't like |
2471 | // displacements. It isn't very important, but this should be fixed for |
2472 | // consistency. |
2473 | if (!(AM.ES || AM.MCSym) && AM.JT != -1) |
2474 | return true; |
2475 | |
2476 | if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N)) |
2477 | if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM)) |
2478 | return false; |
2479 | return true; |
2480 | } |
2481 | |
2482 | switch (N.getOpcode()) { |
2483 | default: break; |
2484 | case ISD::LOCAL_RECOVER: { |
2485 | if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) |
2486 | if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) { |
2487 | // Use the symbol and don't prefix it. |
2488 | AM.MCSym = ESNode->getMCSymbol(); |
2489 | return false; |
2490 | } |
2491 | break; |
2492 | } |
2493 | case ISD::Constant: { |
2494 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2495 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2496 | return false; |
2497 | break; |
2498 | } |
2499 | |
2500 | case X86ISD::Wrapper: |
2501 | case X86ISD::WrapperRIP: |
2502 | if (!matchWrapper(N, AM)) |
2503 | return false; |
2504 | break; |
2505 | |
2506 | case ISD::LOAD: |
2507 | if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM)) |
2508 | return false; |
2509 | break; |
2510 | |
2511 | case ISD::FrameIndex: |
2512 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2513 | AM.Base_Reg.getNode() == nullptr && |
2514 | (!Subtarget->is64Bit() || isDispSafeForFrameIndex(Val: AM.Disp))) { |
2515 | AM.BaseType = X86ISelAddressMode::FrameIndexBase; |
2516 | AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex(); |
2517 | return false; |
2518 | } |
2519 | break; |
2520 | |
2521 | case ISD::SHL: |
2522 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2523 | break; |
2524 | |
2525 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) { |
2526 | unsigned Val = CN->getZExtValue(); |
2527 | // Note that we handle x<<1 as (,x,2) rather than (x,x) here so |
2528 | // that the base operand remains free for further matching. If |
2529 | // the base doesn't end up getting used, a post-processing step |
2530 | // in MatchAddress turns (,x,2) into (x,x), which is cheaper. |
2531 | if (Val == 1 || Val == 2 || Val == 3) { |
2532 | SDValue ShVal = N.getOperand(i: 0); |
2533 | AM.Scale = 1 << Val; |
2534 | AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1); |
2535 | return false; |
2536 | } |
2537 | } |
2538 | break; |
2539 | |
2540 | case ISD::SRL: { |
2541 | // Scale must not be used already. |
2542 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2543 | |
2544 | // We only handle up to 64-bit values here as those are what matter for |
2545 | // addressing mode optimizations. |
2546 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2547 | "Unexpected value size!" ); |
2548 | |
2549 | SDValue And = N.getOperand(i: 0); |
2550 | if (And.getOpcode() != ISD::AND) break; |
2551 | SDValue X = And.getOperand(i: 0); |
2552 | |
2553 | // The mask used for the transform is expected to be post-shift, but we |
2554 | // found the shift first so just apply the shift to the mask before passing |
2555 | // it down. |
2556 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) || |
2557 | !isa<ConstantSDNode>(Val: And.getOperand(i: 1))) |
2558 | break; |
2559 | uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1); |
2560 | |
2561 | // Try to fold the mask and shift into the scale, and return false if we |
2562 | // succeed. |
2563 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM)) |
2564 | return false; |
2565 | break; |
2566 | } |
2567 | |
2568 | case ISD::SMUL_LOHI: |
2569 | case ISD::UMUL_LOHI: |
2570 | // A mul_lohi where we need the low part can be folded as a plain multiply. |
2571 | if (N.getResNo() != 0) break; |
2572 | [[fallthrough]]; |
2573 | case ISD::MUL: |
2574 | case X86ISD::MUL_IMM: |
2575 | // X*[3,5,9] -> X+X*[2,4,8] |
2576 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2577 | AM.Base_Reg.getNode() == nullptr && |
2578 | AM.IndexReg.getNode() == nullptr) { |
2579 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2580 | if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || |
2581 | CN->getZExtValue() == 9) { |
2582 | AM.Scale = unsigned(CN->getZExtValue())-1; |
2583 | |
2584 | SDValue MulVal = N.getOperand(i: 0); |
2585 | SDValue Reg; |
2586 | |
2587 | // Okay, we know that we have a scale by now. However, if the scaled |
2588 | // value is an add of something and a constant, we can fold the |
2589 | // constant into the disp field here. |
2590 | if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && |
2591 | isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) { |
2592 | Reg = MulVal.getOperand(i: 0); |
2593 | auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1)); |
2594 | uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); |
2595 | if (foldOffsetIntoAddress(Offset: Disp, AM)) |
2596 | Reg = N.getOperand(i: 0); |
2597 | } else { |
2598 | Reg = N.getOperand(i: 0); |
2599 | } |
2600 | |
2601 | AM.IndexReg = AM.Base_Reg = Reg; |
2602 | return false; |
2603 | } |
2604 | } |
2605 | break; |
2606 | |
2607 | case ISD::SUB: { |
// Given A-B, if A can be completely folded into the address, leaving the
// index field unused, use -B as the index. This is a win if A has multiple
// parts that can be folded into the address. Also, this saves a mov if the
// base register has other uses, since it avoids a two-address sub
// instruction; however,
2613 | // it costs an additional mov if the index register has other uses. |
2614 | |
2615 | // Add an artificial use to this node so that we can keep track of |
2616 | // it if it gets CSE'd with a different node. |
2617 | HandleSDNode Handle(N); |
2618 | |
2619 | // Test if the LHS of the sub can be folded. |
2620 | X86ISelAddressMode Backup = AM; |
2621 | if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) { |
2622 | N = Handle.getValue(); |
2623 | AM = Backup; |
2624 | break; |
2625 | } |
2626 | N = Handle.getValue(); |
2627 | // Test if the index field is free for use. |
2628 | if (AM.IndexReg.getNode() || AM.isRIPRelative()) { |
2629 | AM = Backup; |
2630 | break; |
2631 | } |
2632 | |
2633 | int Cost = 0; |
2634 | SDValue RHS = N.getOperand(i: 1); |
2635 | // If the RHS involves a register with multiple uses, this |
2636 | // transformation incurs an extra mov, due to the neg instruction |
2637 | // clobbering its operand. |
2638 | if (!RHS.getNode()->hasOneUse() || |
2639 | RHS.getNode()->getOpcode() == ISD::CopyFromReg || |
2640 | RHS.getNode()->getOpcode() == ISD::TRUNCATE || |
2641 | RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || |
2642 | (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && |
2643 | RHS.getOperand(i: 0).getValueType() == MVT::i32)) |
2644 | ++Cost; |
2645 | // If the base is a register with multiple uses, this |
2646 | // transformation may save a mov. |
2647 | if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && |
2648 | !AM.Base_Reg.getNode()->hasOneUse()) || |
2649 | AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
2650 | --Cost; |
2651 | // If the folded LHS was interesting, this transformation saves |
2652 | // address arithmetic. |
2653 | if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + |
2654 | ((AM.Disp != 0) && (Backup.Disp == 0)) + |
2655 | (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) |
2656 | --Cost; |
2657 | // If it doesn't look like it may be an overall win, don't do it. |
2658 | if (Cost >= 0) { |
2659 | AM = Backup; |
2660 | break; |
2661 | } |
2662 | |
2663 | // Ok, the transformation is legal and appears profitable. Go for it. |
2664 | // Negation will be emitted later to avoid creating dangling nodes if this |
2665 | // was an unprofitable LEA. |
2666 | AM.IndexReg = RHS; |
2667 | AM.NegateIndex = true; |
2668 | AM.Scale = 1; |
2669 | return false; |
2670 | } |
2671 | |
2672 | case ISD::OR: |
2673 | case ISD::XOR: |
2674 | // See if we can treat the OR/XOR node as an ADD node. |
2675 | if (!CurDAG->isADDLike(Op: N)) |
2676 | break; |
2677 | [[fallthrough]]; |
2678 | case ISD::ADD: |
2679 | if (!matchAdd(N, AM, Depth)) |
2680 | return false; |
2681 | break; |
2682 | |
2683 | case ISD::AND: { |
2684 | // Perform some heroic transforms on an and of a constant-count shift |
2685 | // with a constant to enable use of the scaled offset field. |
2686 | |
2687 | // Scale must not be used already. |
2688 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2689 | |
2690 | // We only handle up to 64-bit values here as those are what matter for |
2691 | // addressing mode optimizations. |
2692 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2693 | "Unexpected value size!" ); |
2694 | |
2695 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2696 | break; |
2697 | |
2698 | if (N.getOperand(i: 0).getOpcode() == ISD::SRL) { |
2699 | SDValue Shift = N.getOperand(i: 0); |
2700 | SDValue X = Shift.getOperand(i: 0); |
2701 | |
2702 | uint64_t Mask = N.getConstantOperandVal(i: 1); |
2703 | |
2704 | // Try to fold the mask and shift into an extract and scale. |
2705 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2706 | return false; |
2707 | |
2708 | // Try to fold the mask and shift directly into the scale. |
2709 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2710 | return false; |
2711 | |
2712 | // Try to fold the mask and shift into BEXTR and scale. |
2713 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget)) |
2714 | return false; |
2715 | } |
2716 | |
2717 | // Try to swap the mask and shift to place shifts which can be done as |
2718 | // a scale on the outside of the mask. |
2719 | if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM)) |
2720 | return false; |
2721 | |
2722 | break; |
2723 | } |
2724 | case ISD::ZERO_EXTEND: { |
2725 | // Try to widen a zexted shift left to the same size as its use, so we can |
2726 | // match the shift as a scale factor. |
2727 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2728 | break; |
2729 | |
2730 | SDValue Src = N.getOperand(i: 0); |
2731 | |
2732 | // See if we can match a zext(addlike(x,c)). |
2733 | // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively. |
2734 | if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR) |
2735 | if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1)) |
2736 | if (Index != N) { |
2737 | AM.IndexReg = Index; |
2738 | return false; |
2739 | } |
2740 | |
2741 | // Peek through mask: zext(and(shl(x,c1),c2)) |
2742 | APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits()); |
2743 | if (Src.getOpcode() == ISD::AND && Src.hasOneUse()) |
2744 | if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) { |
2745 | Mask = MaskC->getAPIntValue(); |
2746 | Src = Src.getOperand(i: 0); |
2747 | } |
2748 | |
2749 | if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) { |
2750 | // Give up if the shift is not a valid scale factor [1,2,3]. |
2751 | SDValue ShlSrc = Src.getOperand(i: 0); |
2752 | SDValue ShlAmt = Src.getOperand(i: 1); |
2753 | auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt); |
2754 | if (!ShAmtC) |
2755 | break; |
2756 | unsigned ShAmtV = ShAmtC->getZExtValue(); |
2757 | if (ShAmtV > 3) |
2758 | break; |
2759 | |
2760 | // The narrow shift must only shift out zero bits (it must be 'nuw'). |
2761 | // That makes it safe to widen to the destination type. |
2762 | APInt HighZeros = |
2763 | APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV); |
2764 | if (!Src->getFlags().hasNoUnsignedWrap() && |
2765 | !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask)) |
2766 | break; |
2767 | |
2768 | // zext (shl nuw i8 %x, C1) to i32 |
2769 | // --> shl (zext i8 %x to i32), (zext C1) |
2770 | // zext (and (shl nuw i8 %x, C1), C2) to i32 |
2771 | // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1) |
2772 | MVT SrcVT = ShlSrc.getSimpleValueType(); |
2773 | MVT VT = N.getSimpleValueType(); |
2774 | SDLoc DL(N); |
2775 | |
2776 | SDValue Res = ShlSrc; |
2777 | if (!Mask.isAllOnes()) { |
2778 | Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT); |
2779 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2780 | Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res); |
2781 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2782 | } |
2783 | SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res); |
2784 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext); |
2785 | SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt); |
2786 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl); |
2787 | CurDAG->ReplaceAllUsesWith(From: N, To: NewShl); |
2788 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2789 | |
2790 | // Convert the shift to scale factor. |
2791 | AM.Scale = 1 << ShAmtV; |
2792 |       // Call matchIndexRecursively here; otherwise Zext may be replaced by
2793 |       // other nodes but still be used later when calling a builder method,
2794 |       // leaving a stale reference.
2795 | AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1); |
2796 | return false; |
2797 | } |
2798 | |
2799 | if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) { |
2800 | // Try to fold the mask and shift into an extract and scale. |
2801 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2802 | X: Src.getOperand(i: 0), AM)) |
2803 | return false; |
2804 | |
2805 | // Try to fold the mask and shift directly into the scale. |
2806 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2807 | X: Src.getOperand(i: 0), AM)) |
2808 | return false; |
2809 | |
2810 | // Try to fold the mask and shift into BEXTR and scale. |
2811 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2812 | X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget)) |
2813 | return false; |
2814 | } |
2815 | |
2816 | break; |
2817 | } |
2818 | } |
2819 | |
2820 | return matchAddressBase(N, AM); |
2821 | } |
2822 | |
2823 | /// Helper for MatchAddress. Add the specified node to the |
2824 | /// specified addressing mode without any further recursion. |
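     | /// For example, if the base register is already occupied but the index is
     | /// free, N becomes the index with scale 1; if both base and index are
     | /// already taken, this returns true (failure to match).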
2825 | bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { |
2826 | // Is the base register already occupied? |
2827 | if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { |
2828 | // If so, check to see if the scale index register is set. |
2829 | if (!AM.IndexReg.getNode()) { |
2830 | AM.IndexReg = N; |
2831 | AM.Scale = 1; |
2832 | return false; |
2833 | } |
2834 | |
2835 | // Otherwise, we cannot select it. |
2836 | return true; |
2837 | } |
2838 | |
2839 | // Default, generate it as a register. |
2840 | AM.BaseType = X86ISelAddressMode::RegBase; |
2841 | AM.Base_Reg = N; |
2842 | return false; |
2843 | } |
2844 | |
2845 | bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N, |
2846 | X86ISelAddressMode &AM, |
2847 | unsigned Depth) { |
2848 | SDLoc dl(N); |
2849 | LLVM_DEBUG({ |
2850 | dbgs() << "MatchVectorAddress: " ; |
2851 | AM.dump(CurDAG); |
2852 | }); |
2853 | // Limit recursion. |
2854 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2855 | return matchAddressBase(N, AM); |
2856 | |
2857 | // TODO: Support other operations. |
2858 | switch (N.getOpcode()) { |
2859 | case ISD::Constant: { |
2860 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2861 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2862 | return false; |
2863 | break; |
2864 | } |
2865 | case X86ISD::Wrapper: |
2866 | if (!matchWrapper(N, AM)) |
2867 | return false; |
2868 | break; |
2869 | case ISD::ADD: { |
2870 | // Add an artificial use to this node so that we can keep track of |
2871 | // it if it gets CSE'd with a different node. |
2872 | HandleSDNode Handle(N); |
2873 | |
2874 | X86ISelAddressMode Backup = AM; |
2875 | if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) && |
2876 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2877 | Depth: Depth + 1)) |
2878 | return false; |
2879 | AM = Backup; |
2880 | |
2881 | // Try again after commuting the operands. |
2882 | if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2883 | Depth: Depth + 1) && |
2884 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, |
2885 | Depth: Depth + 1)) |
2886 | return false; |
2887 | AM = Backup; |
2888 | |
2889 | N = Handle.getValue(); |
2890 | break; |
2891 | } |
2892 | } |
2893 | |
2894 | return matchAddressBase(N, AM); |
2895 | } |
2896 | |
2897 | /// Helper for selectVectorAddr. Handles things that can be folded into a |
2898 | /// gather/scatter address. The index register and scale should have already |
2899 | /// been handled. |
2900 | bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { |
2901 | return matchVectorAddressRecursively(N, AM, Depth: 0); |
2902 | } |
2903 | |
2904 | bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, |
2905 | SDValue IndexOp, SDValue ScaleOp, |
2906 | SDValue &Base, SDValue &Scale, |
2907 | SDValue &Index, SDValue &Disp, |
2908 | SDValue &Segment) { |
2909 | X86ISelAddressMode AM; |
2910 | AM.Scale = ScaleOp->getAsZExtVal(); |
2911 | |
2912 | // Attempt to match index patterns, as long as we're not relying on implicit |
2913 | // sign-extension, which is performed BEFORE scale. |
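     |   // For example, a gather with a v4i32 index and a 64-bit base pointer keeps
     |   // IndexOp untouched here: the hardware sign-extends each 32-bit element
     |   // before applying the scale, and rewriting the index could change that.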
2914 | if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits()) |
2915 | AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0); |
2916 | else |
2917 | AM.IndexReg = IndexOp; |
2918 | |
2919 | unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); |
2920 | if (AddrSpace == X86AS::GS) |
2921 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
2922 | if (AddrSpace == X86AS::FS) |
2923 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
2924 | if (AddrSpace == X86AS::SS) |
2925 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
2926 | |
2927 | SDLoc DL(BasePtr); |
2928 | MVT VT = BasePtr.getSimpleValueType(); |
2929 | |
2930 | // Try to match into the base and displacement fields. |
2931 | if (matchVectorAddress(N: BasePtr, AM)) |
2932 | return false; |
2933 | |
2934 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
2935 | return true; |
2936 | } |
2937 | |
2938 | /// Returns true if it is able to pattern match an addressing mode. |
2939 | /// It returns the operands which make up the maximal addressing mode it can |
2940 | /// match by reference. |
2941 | /// |
2942 | /// Parent is the parent node of the addr operand that is being matched. It |
2943 | /// is always a load, store, atomic node, or null. It is only null when |
2944 | /// checking memory operands for inline asm nodes. |
2945 | bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
2946 | SDValue &Scale, SDValue &Index, |
2947 | SDValue &Disp, SDValue &Segment) { |
2948 | X86ISelAddressMode AM; |
2949 | |
2950 | if (Parent && |
2951 | // This list of opcodes are all the nodes that have an "addr:$ptr" operand |
2952 | // that are not a MemSDNode, and thus don't have proper addrspace info. |
2953 | Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme |
2954 | Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores |
2955 | Parent->getOpcode() != X86ISD::TLSCALL && // Fixme |
2956 | Parent->getOpcode() != X86ISD::ENQCMD && // Fixme |
2957 | Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme |
2958 | Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp |
2959 | Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp |
2960 | unsigned AddrSpace = |
2961 | cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace(); |
2962 | if (AddrSpace == X86AS::GS) |
2963 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
2964 | if (AddrSpace == X86AS::FS) |
2965 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
2966 | if (AddrSpace == X86AS::SS) |
2967 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
2968 | } |
2969 | |
2970 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
2971 | SDLoc DL(N); |
2972 | MVT VT = N.getSimpleValueType(); |
2973 | |
2974 | if (matchAddress(N, AM)) |
2975 | return false; |
2976 | |
2977 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
2978 | return true; |
2979 | } |
2980 | |
2981 | bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { |
2982 | // Cannot use 32 bit constants to reference objects in kernel/large code |
2983 | // model. |
2984 | if (TM.getCodeModel() == CodeModel::Kernel || |
2985 | TM.getCodeModel() == CodeModel::Large) |
2986 | return false; |
2987 | |
2988 |   // In static codegen with small code model, we can get the address of a
2989 |   // label into a register with 'movl'.
2990 | if (N->getOpcode() != X86ISD::Wrapper) |
2991 | return false; |
2992 | |
2993 | N = N.getOperand(i: 0); |
2994 | |
2995 | // At least GNU as does not accept 'movl' for TPOFF relocations. |
2996 | // FIXME: We could use 'movl' when we know we are targeting MC. |
2997 | if (N->getOpcode() == ISD::TargetGlobalTLSAddress) |
2998 | return false; |
2999 | |
3000 | Imm = N; |
3001 | // Small/medium code model can reference non-TargetGlobalAddress objects with |
3002 | // 32 bit constants. |
3003 | if (N->getOpcode() != ISD::TargetGlobalAddress) { |
3004 | return TM.getCodeModel() == CodeModel::Small || |
3005 | TM.getCodeModel() == CodeModel::Medium; |
3006 | } |
3007 | |
3008 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal(); |
3009 | if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) |
3010 | return CR->getUnsignedMax().ult(RHS: 1ull << 32); |
3011 | |
3012 | return !TM.isLargeGlobalValue(GV); |
3013 | } |
3014 | |
3015 | bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, |
3016 | SDValue &Scale, SDValue &Index, |
3017 | SDValue &Disp, SDValue &Segment) { |
3018 | // Save the debug loc before calling selectLEAAddr, in case it invalidates N. |
3019 | SDLoc DL(N); |
3020 | |
3021 | if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) |
3022 | return false; |
3023 | |
3024 | auto *RN = dyn_cast<RegisterSDNode>(Val&: Base); |
3025 | if (RN && RN->getReg() == 0) |
3026 | Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3027 | else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Val: Base)) { |
3028 | // Base could already be %rip, particularly in the x32 ABI. |
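     |     // Roughly, the widening below emits
     |     //   %t = IMPLICIT_DEF (i64); %base64 = INSERT_SUBREG %t, %base, sub_32bit
     |     // so the 64-bit LEA can consume the value; the high bits stay undefined,
     |     // which is fine because LEA64_32 only keeps the low 32 bits of the result.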
3029 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3030 | VT: MVT::i64), 0); |
3031 | Base = CurDAG->getTargetInsertSubreg(SRIdx: X86::sub_32bit, DL, VT: MVT::i64, Operand: ImplDef, |
3032 | Subreg: Base); |
3033 | } |
3034 | |
3035 | RN = dyn_cast<RegisterSDNode>(Val&: Index); |
3036 | if (RN && RN->getReg() == 0) |
3037 | Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3038 | else { |
3039 | assert(Index.getValueType() == MVT::i32 && |
3040 | "Expect to be extending 32-bit registers for use in LEA" ); |
3041 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3042 | VT: MVT::i64), 0); |
3043 | Index = CurDAG->getTargetInsertSubreg(SRIdx: X86::sub_32bit, DL, VT: MVT::i64, Operand: ImplDef, |
3044 | Subreg: Index); |
3045 | } |
3046 | |
3047 | return true; |
3048 | } |
3049 | |
3050 | /// Calls SelectAddr and determines if the maximal addressing |
3051 | /// mode it matches can be cost effectively emitted as an LEA instruction. |
3052 | bool X86DAGToDAGISel::selectLEAAddr(SDValue N, |
3053 | SDValue &Base, SDValue &Scale, |
3054 | SDValue &Index, SDValue &Disp, |
3055 | SDValue &Segment) { |
3056 | X86ISelAddressMode AM; |
3057 | |
3058 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
3059 | SDLoc DL(N); |
3060 | MVT VT = N.getSimpleValueType(); |
3061 | |
3062 | // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support |
3063 | // segments. |
3064 | SDValue Copy = AM.Segment; |
3065 | SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32); |
3066 | AM.Segment = T; |
3067 | if (matchAddress(N, AM)) |
3068 | return false; |
3069 | assert (T == AM.Segment); |
3070 | AM.Segment = Copy; |
3071 | |
3072 | unsigned Complexity = 0; |
3073 | if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) |
3074 | Complexity = 1; |
3075 | else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
3076 | Complexity = 4; |
3077 | |
3078 | if (AM.IndexReg.getNode()) |
3079 | Complexity++; |
3080 | |
3081 |   // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or
3082 |   // to use a simple shift.
3083 | if (AM.Scale > 1) |
3084 | Complexity++; |
3085 | |
3086 | // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA |
3087 | // to a LEA. This is determined with some experimentation but is by no means |
3088 | // optimal (especially for code size consideration). LEA is nice because of |
3089 | // its three-address nature. Tweak the cost function again when we can run |
3090 | // convertToThreeAddress() at register allocation time. |
3091 | if (AM.hasSymbolicDisplacement()) { |
3092 | // For X86-64, always use LEA to materialize RIP-relative addresses. |
3093 | if (Subtarget->is64Bit()) |
3094 | Complexity = 4; |
3095 | else |
3096 | Complexity += 2; |
3097 | } |
3098 | |
3099 | // Heuristic: try harder to form an LEA from ADD if the operands set flags. |
3100 | // Unlike ADD, LEA does not affect flags, so we will be less likely to require |
3101 | // duplicating flag-producing instructions later in the pipeline. |
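     |   // For example, if one operand of this add is an X86ISD::SUB whose EFLAGS
     |   // result still feeds a branch, selecting an LEA here keeps those flags
     |   // live across this node, whereas an ADD would clobber them and could force
     |   // the flag-producing instruction to be duplicated.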
3102 | if (N.getOpcode() == ISD::ADD) { |
3103 | auto isMathWithFlags = [](SDValue V) { |
3104 | switch (V.getOpcode()) { |
3105 | case X86ISD::ADD: |
3106 | case X86ISD::SUB: |
3107 | case X86ISD::ADC: |
3108 | case X86ISD::SBB: |
3109 | case X86ISD::SMUL: |
3110 | case X86ISD::UMUL: |
3111 | /* TODO: These opcodes can be added safely, but we may want to justify |
3112 | their inclusion for different reasons (better for reg-alloc). |
3113 | case X86ISD::OR: |
3114 | case X86ISD::XOR: |
3115 | case X86ISD::AND: |
3116 | */ |
3117 | // Value 1 is the flag output of the node - verify it's not dead. |
3118 | return !SDValue(V.getNode(), 1).use_empty(); |
3119 | default: |
3120 | return false; |
3121 | } |
3122 | }; |
3123 | // TODO: We might want to factor in whether there's a load folding |
3124 | // opportunity for the math op that disappears with LEA. |
3125 | if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1))) |
3126 | Complexity++; |
3127 | } |
3128 | |
3129 | if (AM.Disp) |
3130 | Complexity++; |
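     |   // For example, an address like 4(%rdi,%rsi,2) scores base + index +
     |   // scale>1 + disp = 4 and is kept, while a bare (,%rsi,2) scores only 2 and
     |   // is rejected below in favor of an add or shift.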
3131 | |
3132 | // If it isn't worth using an LEA, reject it. |
3133 | if (Complexity <= 2) |
3134 | return false; |
3135 | |
3136 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
3137 | return true; |
3138 | } |
3139 | |
3140 | /// This is only run on TargetGlobalTLSAddress nodes. |
3141 | bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, |
3142 | SDValue &Scale, SDValue &Index, |
3143 | SDValue &Disp, SDValue &Segment) { |
3144 | assert(N.getOpcode() == ISD::TargetGlobalTLSAddress || |
3145 | N.getOpcode() == ISD::TargetExternalSymbol); |
3146 | |
3147 | X86ISelAddressMode AM; |
3148 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) { |
3149 | AM.GV = GA->getGlobal(); |
3150 | AM.Disp += GA->getOffset(); |
3151 | AM.SymbolFlags = GA->getTargetFlags(); |
3152 | } else { |
3153 | auto *SA = cast<ExternalSymbolSDNode>(Val&: N); |
3154 | AM.ES = SA->getSymbol(); |
3155 | AM.SymbolFlags = SA->getTargetFlags(); |
3156 | } |
3157 | |
3158 | if (Subtarget->is32Bit()) { |
3159 | AM.Scale = 1; |
3160 | AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32); |
3161 | } |
3162 | |
3163 | MVT VT = N.getSimpleValueType(); |
3164 | getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment); |
3165 | return true; |
3166 | } |
3167 | |
3168 | bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { |
3169 | // Keep track of the original value type and whether this value was |
3170 | // truncated. If we see a truncation from pointer type to VT that truncates |
3171 | // bits that are known to be zero, we can use a narrow reference. |
3172 | EVT VT = N.getValueType(); |
3173 | bool WasTruncated = false; |
3174 | if (N.getOpcode() == ISD::TRUNCATE) { |
3175 | WasTruncated = true; |
3176 | N = N.getOperand(i: 0); |
3177 | } |
3178 | |
3179 | if (N.getOpcode() != X86ISD::Wrapper) |
3180 | return false; |
3181 | |
3182 | // We can only use non-GlobalValues as immediates if they were not truncated, |
3183 | // as we do not have any range information. If we have a GlobalValue and the |
3184 | // address was not truncated, we can select it as an operand directly. |
3185 | unsigned Opc = N.getOperand(i: 0)->getOpcode(); |
3186 | if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { |
3187 | Op = N.getOperand(i: 0); |
3188 | // We can only select the operand directly if we didn't have to look past a |
3189 | // truncate. |
3190 | return !WasTruncated; |
3191 | } |
3192 | |
3193 | // Check that the global's range fits into VT. |
3194 | auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0)); |
3195 | std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); |
3196 | if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits())) |
3197 | return false; |
3198 | |
3199 | // Okay, we can use a narrow reference. |
3200 | Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT, |
3201 | offset: GA->getOffset(), TargetFlags: GA->getTargetFlags()); |
3202 | return true; |
3203 | } |
3204 | |
3205 | bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
3206 | SDValue &Base, SDValue &Scale, |
3207 | SDValue &Index, SDValue &Disp, |
3208 | SDValue &Segment) { |
3209 | assert(Root && P && "Unknown root/parent nodes" ); |
3210 | if (!ISD::isNON_EXTLoad(N: N.getNode()) || |
3211 | !IsProfitableToFold(N, U: P, Root) || |
3212 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3213 | return false; |
3214 | |
3215 | return selectAddr(Parent: N.getNode(), |
3216 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3217 | } |
3218 | |
3219 | bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
3220 | SDValue &Base, SDValue &Scale, |
3221 | SDValue &Index, SDValue &Disp, |
3222 | SDValue &Segment) { |
3223 | assert(Root && P && "Unknown root/parent nodes" ); |
3224 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || |
3225 | !IsProfitableToFold(N, U: P, Root) || |
3226 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3227 | return false; |
3228 | |
3229 | return selectAddr(Parent: N.getNode(), |
3230 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3231 | } |
3232 | |
3233 | /// Return an SDNode that returns the value of the global base register. |
3234 | /// Output instructions required to initialize the global base register, |
3235 | /// if necessary. |
3236 | SDNode *X86DAGToDAGISel::getGlobalBaseReg() { |
3237 | unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); |
3238 | auto &DL = MF->getDataLayout(); |
3239 | return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode(); |
3240 | } |
3241 | |
3242 | bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { |
3243 | if (N->getOpcode() == ISD::TRUNCATE) |
3244 | N = N->getOperand(Num: 0).getNode(); |
3245 | if (N->getOpcode() != X86ISD::Wrapper) |
3246 | return false; |
3247 | |
3248 | auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0)); |
3249 | if (!GA) |
3250 | return false; |
3251 | |
3252 | auto *GV = GA->getGlobal(); |
3253 | std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange(); |
3254 | if (CR) |
3255 | return CR->getSignedMin().sge(RHS: -1ull << Width) && |
3256 | CR->getSignedMax().slt(RHS: 1ull << Width); |
3257 | // In the kernel code model, globals are in the negative 2GB of the address |
3258 | // space, so globals can be a sign extended 32-bit immediate. |
3259 | // In other code models, small globals are in the low 2GB of the address |
3260 | // space, so sign extending them is equivalent to zero extending them. |
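     |   // For example, with Width == 32, a small-code-model global known to live
     |   // in the low 2GB yields the same value whether sign- or zero-extended
     |   // from 32 bits, so it is accepted here.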
3261 | return Width == 32 && !TM.isLargeGlobalValue(GV); |
3262 | } |
3263 | |
3264 | X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { |
3265 | assert(N->isMachineOpcode() && "Unexpected node" ); |
3266 | unsigned Opc = N->getMachineOpcode(); |
3267 | const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc); |
3268 | int CondNo = X86::getCondSrcNoFromDesc(MCID); |
3269 | if (CondNo < 0) |
3270 | return X86::COND_INVALID; |
3271 | |
3272 | return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo)); |
3273 | } |
3274 | |
3275 | /// Test whether the given X86ISD::CMP node has any users that use a flag |
3276 | /// other than ZF. |
3277 | bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { |
3278 | // Examine each user of the node. |
3279 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3280 | UI != UE; ++UI) { |
3281 | // Only check things that use the flags. |
3282 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3283 | continue; |
3284 | // Only examine CopyToReg uses that copy to EFLAGS. |
3285 | if (UI->getOpcode() != ISD::CopyToReg || |
3286 | cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3287 | return false; |
3288 | // Examine each user of the CopyToReg use. |
3289 | for (SDNode::use_iterator FlagUI = UI->use_begin(), |
3290 | FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { |
3291 | // Only examine the Flag result. |
3292 | if (FlagUI.getUse().getResNo() != 1) continue; |
3293 | // Anything unusual: assume conservatively. |
3294 | if (!FlagUI->isMachineOpcode()) return false; |
3295 | // Examine the condition code of the user. |
3296 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3297 | |
3298 | switch (CC) { |
3299 | // Comparisons which only use the zero flag. |
3300 | case X86::COND_E: case X86::COND_NE: |
3301 | continue; |
3302 | // Anything else: assume conservatively. |
3303 | default: |
3304 | return false; |
3305 | } |
3306 | } |
3307 | } |
3308 | return true; |
3309 | } |
3310 | |
3311 | /// Test whether the given X86ISD::CMP node has any uses which require the SF |
3312 | /// flag to be accurate. |
3313 | bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { |
3314 | // Examine each user of the node. |
3315 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3316 | UI != UE; ++UI) { |
3317 | // Only check things that use the flags. |
3318 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3319 | continue; |
3320 | // Only examine CopyToReg uses that copy to EFLAGS. |
3321 | if (UI->getOpcode() != ISD::CopyToReg || |
3322 | cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3323 | return false; |
3324 | // Examine each user of the CopyToReg use. |
3325 | for (SDNode::use_iterator FlagUI = UI->use_begin(), |
3326 | FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { |
3327 | // Only examine the Flag result. |
3328 | if (FlagUI.getUse().getResNo() != 1) continue; |
3329 | // Anything unusual: assume conservatively. |
3330 | if (!FlagUI->isMachineOpcode()) return false; |
3331 | // Examine the condition code of the user. |
3332 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3333 | |
3334 | switch (CC) { |
3335 | // Comparisons which don't examine the SF flag. |
3336 | case X86::COND_A: case X86::COND_AE: |
3337 | case X86::COND_B: case X86::COND_BE: |
3338 | case X86::COND_E: case X86::COND_NE: |
3339 | case X86::COND_O: case X86::COND_NO: |
3340 | case X86::COND_P: case X86::COND_NP: |
3341 | continue; |
3342 | // Anything else: assume conservatively. |
3343 | default: |
3344 | return false; |
3345 | } |
3346 | } |
3347 | } |
3348 | return true; |
3349 | } |
3350 | |
3351 | static bool mayUseCarryFlag(X86::CondCode CC) { |
3352 | switch (CC) { |
3353 | // Comparisons which don't examine the CF flag. |
3354 | case X86::COND_O: case X86::COND_NO: |
3355 | case X86::COND_E: case X86::COND_NE: |
3356 | case X86::COND_S: case X86::COND_NS: |
3357 | case X86::COND_P: case X86::COND_NP: |
3358 | case X86::COND_L: case X86::COND_GE: |
3359 | case X86::COND_G: case X86::COND_LE: |
3360 | return false; |
3361 | // Anything else: assume conservatively. |
3362 | default: |
3363 | return true; |
3364 | } |
3365 | } |
3366 | |
3367 | /// Test whether the given node which sets flags has any uses which require the |
3368 | /// CF flag to be accurate. |
3369 | bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { |
3370 | // Examine each user of the node. |
3371 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3372 | UI != UE; ++UI) { |
3373 | // Only check things that use the flags. |
3374 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3375 | continue; |
3376 | |
3377 | unsigned UIOpc = UI->getOpcode(); |
3378 | |
3379 | if (UIOpc == ISD::CopyToReg) { |
3380 | // Only examine CopyToReg uses that copy to EFLAGS. |
3381 | if (cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3382 | return false; |
3383 | // Examine each user of the CopyToReg use. |
3384 | for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); |
3385 | FlagUI != FlagUE; ++FlagUI) { |
3386 | // Only examine the Flag result. |
3387 | if (FlagUI.getUse().getResNo() != 1) |
3388 | continue; |
3389 | // Anything unusual: assume conservatively. |
3390 | if (!FlagUI->isMachineOpcode()) |
3391 | return false; |
3392 | // Examine the condition code of the user. |
3393 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3394 | |
3395 | if (mayUseCarryFlag(CC)) |
3396 | return false; |
3397 | } |
3398 | |
3399 | // This CopyToReg is ok. Move on to the next user. |
3400 | continue; |
3401 | } |
3402 | |
3403 | // This might be an unselected node. So look for the pre-isel opcodes that |
3404 | // use flags. |
3405 | unsigned CCOpNo; |
3406 | switch (UIOpc) { |
3407 | default: |
3408 | // Something unusual. Be conservative. |
3409 | return false; |
3410 | case X86ISD::SETCC: CCOpNo = 0; break; |
3411 | case X86ISD::SETCC_CARRY: CCOpNo = 0; break; |
3412 | case X86ISD::CMOV: CCOpNo = 2; break; |
3413 | case X86ISD::BRCOND: CCOpNo = 2; break; |
3414 | } |
3415 | |
3416 | X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(Num: CCOpNo); |
3417 | if (mayUseCarryFlag(CC)) |
3418 | return false; |
3419 | } |
3420 | return true; |
3421 | } |
3422 | |
3423 | /// Check whether or not the chain ending in StoreNode is suitable for doing |
3424 | /// the {load; op; store} to modify transformation. |
3425 | static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, |
3426 | SDValue StoredVal, SelectionDAG *CurDAG, |
3427 | unsigned LoadOpNo, |
3428 | LoadSDNode *&LoadNode, |
3429 | SDValue &InputChain) { |
3430 | // Is the stored value result 0 of the operation? |
3431 | if (StoredVal.getResNo() != 0) return false; |
3432 | |
3433 | // Are there other uses of the operation other than the store? |
3434 | if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false; |
3435 | |
3436 | // Is the store non-extending and non-indexed? |
3437 | if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal()) |
3438 | return false; |
3439 | |
3440 | SDValue Load = StoredVal->getOperand(Num: LoadOpNo); |
3441 | // Is the stored value a non-extending and non-indexed load? |
3442 | if (!ISD::isNormalLoad(N: Load.getNode())) return false; |
3443 | |
3444 | // Return LoadNode by reference. |
3445 | LoadNode = cast<LoadSDNode>(Val&: Load); |
3446 | |
3447 | // Is store the only read of the loaded value? |
3448 | if (!Load.hasOneUse()) |
3449 | return false; |
3450 | |
3451 | // Is the address of the store the same as the load? |
3452 | if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || |
3453 | LoadNode->getOffset() != StoreNode->getOffset()) |
3454 | return false; |
3455 | |
3456 | bool FoundLoad = false; |
3457 | SmallVector<SDValue, 4> ChainOps; |
3458 | SmallVector<const SDNode *, 4> LoopWorklist; |
3459 | SmallPtrSet<const SDNode *, 16> Visited; |
3460 | const unsigned int Max = 1024; |
3461 | |
3462 | // Visualization of Load-Op-Store fusion: |
3463 | // ------------------------- |
3464 | // Legend: |
3465 | // *-lines = Chain operand dependencies. |
3466 | // |-lines = Normal operand dependencies. |
3467 | // Dependencies flow down and right. n-suffix references multiple nodes. |
3468 | // |
3469 | // C Xn C |
3470 | // * * * |
3471 | // * * * |
3472 | // Xn A-LD Yn TF Yn |
3473 | // * * \ | * | |
3474 | // * * \ | * | |
3475 | // * * \ | => A--LD_OP_ST |
3476 | // * * \| \ |
3477 | // TF OP \ |
3478 | // * | \ Zn |
3479 | // * | \ |
3480 | // A-ST Zn |
3481 | // |
3482 | |
3483 | // This merge induced dependences from: #1: Xn -> LD, OP, Zn |
3484 | // #2: Yn -> LD |
3485 | // #3: ST -> Zn |
3486 | |
3487 | // Ensure the transform is safe by checking for the dual |
3488 | // dependencies to make sure we do not induce a loop. |
3489 | |
3490 | // As LD is a predecessor to both OP and ST we can do this by checking: |
3491 | // a). if LD is a predecessor to a member of Xn or Yn. |
3492 | // b). if a Zn is a predecessor to ST. |
3493 | |
3494 | // However, (b) can only occur through being a chain predecessor to |
3495 | // ST, which is the same as Zn being a member or predecessor of Xn, |
3496 | // which is a subset of LD being a predecessor of Xn. So it's |
3497 | // subsumed by check (a). |
3498 | |
3499 | SDValue Chain = StoreNode->getChain(); |
3500 | |
3501 | // Gather X elements in ChainOps. |
3502 | if (Chain == Load.getValue(R: 1)) { |
3503 | FoundLoad = true; |
3504 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3505 | } else if (Chain.getOpcode() == ISD::TokenFactor) { |
3506 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { |
3507 | SDValue Op = Chain.getOperand(i); |
3508 | if (Op == Load.getValue(R: 1)) { |
3509 | FoundLoad = true; |
3510 | // Drop Load, but keep its chain. No cycle check necessary. |
3511 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3512 | continue; |
3513 | } |
3514 | LoopWorklist.push_back(Elt: Op.getNode()); |
3515 | ChainOps.push_back(Elt: Op); |
3516 | } |
3517 | } |
3518 | |
3519 | if (!FoundLoad) |
3520 | return false; |
3521 | |
3522 | // Worklist is currently Xn. Add Yn to worklist. |
3523 | for (SDValue Op : StoredVal->ops()) |
3524 | if (Op.getNode() != LoadNode) |
3525 | LoopWorklist.push_back(Elt: Op.getNode()); |
3526 | |
3527 | // Check (a) if Load is a predecessor to Xn + Yn |
3528 | if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max, |
3529 | TopologicalPrune: true)) |
3530 | return false; |
3531 | |
3532 | InputChain = |
3533 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps); |
3534 | return true; |
3535 | } |
3536 | |
3537 | // Change a chain of {load; op; store} of the same value into a simple op |
3538 | // through memory of that value, if the uses of the modified value and its |
3539 | // address are suitable. |
3540 | // |
3541 | // The tablegen memory-operand pattern is currently not able to match the case
3542 | // where the EFLAGS produced by the original operation are used.
3543 | // |
3544 | // To move this to tablegen, we'll need to improve tablegen to allow flags to |
3545 | // be transferred from a node in the pattern to the result node, probably with |
3546 | // a new keyword. For example, we have this |
3547 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3548 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst), |
3549 | // (implicit EFLAGS)]>; |
3550 | // but maybe need something like this |
3551 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3552 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst), |
3553 | // (transferrable EFLAGS)]>; |
3554 | // |
3555 | // Until then, we manually fold these and instruction select the operation |
3556 | // here. |
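     | //
     | // For example, an i32 chain of (store (X86ISD::ADD (load [p]), 5), [p]) is
     | // selected below as a single ADD32mi to [p], provided the chain and use
     | // checks in isFusableLoadOpStorePattern succeed.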
3557 | bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { |
3558 | auto *StoreNode = cast<StoreSDNode>(Val: Node); |
3559 | SDValue StoredVal = StoreNode->getOperand(Num: 1); |
3560 | unsigned Opc = StoredVal->getOpcode(); |
3561 | |
3562 | // Before we try to select anything, make sure this is memory operand size |
3563 | // and opcode we can handle. Note that this must match the code below that |
3564 | // actually lowers the opcodes. |
3565 | EVT MemVT = StoreNode->getMemoryVT(); |
3566 | if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && |
3567 | MemVT != MVT::i8) |
3568 | return false; |
3569 | |
3570 | bool IsCommutable = false; |
3571 | bool IsNegate = false; |
3572 | switch (Opc) { |
3573 | default: |
3574 | return false; |
3575 | case X86ISD::SUB: |
3576 | IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0)); |
3577 | break; |
3578 | case X86ISD::SBB: |
3579 | break; |
3580 | case X86ISD::ADD: |
3581 | case X86ISD::ADC: |
3582 | case X86ISD::AND: |
3583 | case X86ISD::OR: |
3584 | case X86ISD::XOR: |
3585 | IsCommutable = true; |
3586 | break; |
3587 | } |
3588 | |
3589 | unsigned LoadOpNo = IsNegate ? 1 : 0; |
3590 | LoadSDNode *LoadNode = nullptr; |
3591 | SDValue InputChain; |
3592 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3593 | LoadNode, InputChain)) { |
3594 | if (!IsCommutable) |
3595 | return false; |
3596 | |
3597 | // This operation is commutable, try the other operand. |
3598 | LoadOpNo = 1; |
3599 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3600 | LoadNode, InputChain)) |
3601 | return false; |
3602 | } |
3603 | |
3604 | SDValue Base, Scale, Index, Disp, Segment; |
3605 | if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp, |
3606 | Segment)) |
3607 | return false; |
3608 | |
3609 | auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, |
3610 | unsigned Opc8) { |
3611 | switch (MemVT.getSimpleVT().SimpleTy) { |
3612 | case MVT::i64: |
3613 | return Opc64; |
3614 | case MVT::i32: |
3615 | return Opc32; |
3616 | case MVT::i16: |
3617 | return Opc16; |
3618 | case MVT::i8: |
3619 | return Opc8; |
3620 | default: |
3621 | llvm_unreachable("Invalid size!" ); |
3622 | } |
3623 | }; |
3624 | |
3625 | MachineSDNode *Result; |
3626 | switch (Opc) { |
3627 | case X86ISD::SUB: |
3628 | // Handle negate. |
3629 | if (IsNegate) { |
3630 | unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, |
3631 | X86::NEG8m); |
3632 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3633 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3634 | VT2: MVT::Other, Ops); |
3635 | break; |
3636 | } |
3637 | [[fallthrough]]; |
3638 | case X86ISD::ADD: |
3639 | // Try to match inc/dec. |
3640 | if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { |
3641 | bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1)); |
3642 | bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1)); |
3643 |       // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
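     |       // For example, an i32 ADD of the loaded value and 1 with a dead carry
     |       // flag becomes INC32m; an ADD of -1 (or a SUB of 1) becomes DEC32m.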
3644 | if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3645 | unsigned NewOpc = |
3646 | ((Opc == X86ISD::ADD) == IsOne) |
3647 | ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) |
3648 | : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); |
3649 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3650 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3651 | VT2: MVT::Other, Ops); |
3652 | break; |
3653 | } |
3654 | } |
3655 | [[fallthrough]]; |
3656 | case X86ISD::ADC: |
3657 | case X86ISD::SBB: |
3658 | case X86ISD::AND: |
3659 | case X86ISD::OR: |
3660 | case X86ISD::XOR: { |
3661 | auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { |
3662 | switch (Opc) { |
3663 | case X86ISD::ADD: |
3664 | return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, |
3665 | X86::ADD8mr); |
3666 | case X86ISD::ADC: |
3667 | return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, |
3668 | X86::ADC8mr); |
3669 | case X86ISD::SUB: |
3670 | return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, |
3671 | X86::SUB8mr); |
3672 | case X86ISD::SBB: |
3673 | return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, |
3674 | X86::SBB8mr); |
3675 | case X86ISD::AND: |
3676 | return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, |
3677 | X86::AND8mr); |
3678 | case X86ISD::OR: |
3679 | return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); |
3680 | case X86ISD::XOR: |
3681 | return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, |
3682 | X86::XOR8mr); |
3683 | default: |
3684 | llvm_unreachable("Invalid opcode!" ); |
3685 | } |
3686 | }; |
3687 | auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { |
3688 | switch (Opc) { |
3689 | case X86ISD::ADD: |
3690 | return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, |
3691 | X86::ADD8mi); |
3692 | case X86ISD::ADC: |
3693 | return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, |
3694 | X86::ADC8mi); |
3695 | case X86ISD::SUB: |
3696 | return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, |
3697 | X86::SUB8mi); |
3698 | case X86ISD::SBB: |
3699 | return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, |
3700 | X86::SBB8mi); |
3701 | case X86ISD::AND: |
3702 | return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, |
3703 | X86::AND8mi); |
3704 | case X86ISD::OR: |
3705 | return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, |
3706 | X86::OR8mi); |
3707 | case X86ISD::XOR: |
3708 | return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, |
3709 | X86::XOR8mi); |
3710 | default: |
3711 | llvm_unreachable("Invalid opcode!" ); |
3712 | } |
3713 | }; |
3714 | |
3715 | unsigned NewOpc = SelectRegOpcode(Opc); |
3716 | SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo); |
3717 | |
3718 | // See if the operand is a constant that we can fold into an immediate |
3719 | // operand. |
3720 | if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) { |
3721 | int64_t OperandV = OperandC->getSExtValue(); |
3722 | |
3723 | // Check if we can shrink the operand enough to fit in an immediate (or |
3724 | // fit into a smaller immediate) by negating it and switching the |
3725 | // operation. |
3726 | if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && |
3727 | ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) || |
3728 | (MemVT == MVT::i64 && !isInt<32>(x: OperandV) && |
3729 | isInt<32>(x: -OperandV))) && |
3730 | hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3731 | OperandV = -OperandV; |
3732 | Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; |
3733 | } |
3734 | |
3735 | if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) { |
3736 | Operand = CurDAG->getTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT); |
3737 | NewOpc = SelectImmOpcode(Opc); |
3738 | } |
3739 | } |
3740 | |
3741 | if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { |
3742 | SDValue CopyTo = |
3743 | CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS, |
3744 | N: StoredVal.getOperand(i: 2), Glue: SDValue()); |
3745 | |
3746 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3747 | Segment, Operand, CopyTo, CopyTo.getValue(R: 1)}; |
3748 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3749 | Ops); |
3750 | } else { |
3751 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3752 | Segment, Operand, InputChain}; |
3753 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3754 | Ops); |
3755 | } |
3756 | break; |
3757 | } |
3758 | default: |
3759 | llvm_unreachable("Invalid opcode!" ); |
3760 | } |
3761 | |
3762 | MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), |
3763 | LoadNode->getMemOperand()}; |
3764 | CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps); |
3765 | |
3766 | // Update Load Chain uses as well. |
3767 | ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1)); |
3768 | ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1)); |
3769 | ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0)); |
3770 | CurDAG->RemoveDeadNode(N: Node); |
3771 | return true; |
3772 | } |
3773 | |
3774 | // See if this is an X & Mask that we can match to BEXTR/BZHI. |
3775 | // Where Mask is one of the following patterns: |
3776 | // a) x & (1 << nbits) - 1 |
3777 | // b) x & ~(-1 << nbits) |
3778 | // c) x & (-1 >> (32 - y)) |
3779 | // d) x << (32 - y) >> (32 - y) |
3780 | // e) (1 << nbits) - 1 |
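     | //
     | // For example, with BMI2 an i32 (and %x, (add (shl 1, %n), -1)) is lowered
     | // here to an X86ISD::BZHI of %x by %n; with only BMI it becomes an
     | // X86ISD::BEXTR whose control word is (%n << 8).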
3781 | bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3782 | assert( |
3783 | (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND || |
3784 | Node->getOpcode() == ISD::SRL) && |
3785 | "Should be either an and-mask, or right-shift after clearing high bits." ); |
3786 | |
3787 | // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. |
3788 | if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) |
3789 | return false; |
3790 | |
3791 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
3792 | |
3793 | // Only supported for 32 and 64 bits. |
3794 | if (NVT != MVT::i32 && NVT != MVT::i64) |
3795 | return false; |
3796 | |
3797 | SDValue NBits; |
3798 | bool NegateNBits; |
3799 | |
3800 |   // If we have BMI2's BZHI, we are ok with multi-use patterns.
3801 | // Else, if we only have BMI1's BEXTR, we require one-use. |
3802 |   const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3803 | auto checkUses = [AllowExtraUsesByDefault]( |
3804 | SDValue Op, unsigned NUses, |
3805 |                        std::optional<bool> AllowExtraUses) {
3806 | return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) || |
3807 | Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo()); |
3808 | }; |
3809 | auto checkOneUse = [checkUses](SDValue Op, |
3810 |                                  std::optional<bool> AllowExtraUses =
3811 | std::nullopt) { |
3812 | return checkUses(Op, 1, AllowExtraUses); |
3813 | }; |
3814 | auto checkTwoUse = [checkUses](SDValue Op, |
3815 |                                  std::optional<bool> AllowExtraUses =
3816 | std::nullopt) { |
3817 | return checkUses(Op, 2, AllowExtraUses); |
3818 | }; |
3819 | |
3820 | auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { |
3821 | if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { |
3822 | assert(V.getSimpleValueType() == MVT::i32 && |
3823 | V.getOperand(0).getSimpleValueType() == MVT::i64 && |
3824 | "Expected i64 -> i32 truncation" ); |
3825 | V = V.getOperand(i: 0); |
3826 | } |
3827 | return V; |
3828 | }; |
3829 | |
3830 | // a) x & ((1 << nbits) + (-1)) |
3831 | auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, |
3832 | &NegateNBits](SDValue Mask) -> bool { |
3833 | // Match `add`. Must only have one use! |
3834 | if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) |
3835 | return false; |
3836 | // We should be adding all-ones constant (i.e. subtracting one.) |
3837 | if (!isAllOnesConstant(V: Mask->getOperand(Num: 1))) |
3838 | return false; |
3839 | // Match `1 << nbits`. Might be truncated. Must only have one use! |
3840 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3841 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3842 | return false; |
3843 | if (!isOneConstant(V: M0->getOperand(Num: 0))) |
3844 | return false; |
3845 | NBits = M0->getOperand(Num: 1); |
3846 | NegateNBits = false; |
3847 | return true; |
3848 | }; |
3849 | |
3850 | auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { |
3851 | V = peekThroughOneUseTruncation(V); |
3852 | return CurDAG->MaskedValueIsAllOnes( |
3853 | Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(), |
3854 | loBitsSet: NVT.getSizeInBits())); |
3855 | }; |
3856 | |
3857 | // b) x & ~(-1 << nbits) |
3858 | auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, |
3859 | &NBits, &NegateNBits](SDValue Mask) -> bool { |
3860 | // Match `~()`. Must only have one use! |
3861 | if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) |
3862 | return false; |
3863 | // The -1 only has to be all-ones for the final Node's NVT. |
3864 | if (!isAllOnes(Mask->getOperand(Num: 1))) |
3865 | return false; |
3866 | // Match `-1 << nbits`. Might be truncated. Must only have one use! |
3867 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3868 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3869 | return false; |
3870 | // The -1 only has to be all-ones for the final Node's NVT. |
3871 | if (!isAllOnes(M0->getOperand(Num: 0))) |
3872 | return false; |
3873 | NBits = M0->getOperand(Num: 1); |
3874 | NegateNBits = false; |
3875 | return true; |
3876 | }; |
3877 | |
3878 | // Try to match potentially-truncated shift amount as `(bitwidth - y)`, |
3879 | // or leave the shift amount as-is, but then we'll have to negate it. |
3880 | auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, |
3881 | unsigned Bitwidth) { |
3882 | NBits = ShiftAmt; |
3883 | NegateNBits = true; |
3884 | // Skip over a truncate of the shift amount, if any. |
3885 | if (NBits.getOpcode() == ISD::TRUNCATE) |
3886 | NBits = NBits.getOperand(i: 0); |
3887 | // Try to match the shift amount as (bitwidth - y). It should go away, too. |
3888 | // If it doesn't match, that's fine, we'll just negate it ourselves. |
3889 | if (NBits.getOpcode() != ISD::SUB) |
3890 | return; |
3891 | auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0)); |
3892 | if (!V0 || V0->getZExtValue() != Bitwidth) |
3893 | return; |
3894 | NBits = NBits.getOperand(i: 1); |
3895 | NegateNBits = false; |
3896 | }; |
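     |   // For example, on i32 a shift amount of (sub 32, %y) canonicalizes to
     |   // NBits = %y with NegateNBits = false, while a plain %z stays as
     |   // NBits = %z with NegateNBits = true (32 - %z is materialized later).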
3897 | |
3898 | // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth |
3899 | // or |
3900 | // c) x & (-1 >> (32 - y)) |
3901 | auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, |
3902 | canonicalizeShiftAmt](SDValue Mask) -> bool { |
3903 | // The mask itself may be truncated. |
3904 | Mask = peekThroughOneUseTruncation(Mask); |
3905 | unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); |
3906 | // Match `l>>`. Must only have one use! |
3907 | if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) |
3908 | return false; |
3909 |     // We should be shifting a truly all-ones constant.
3910 | if (!isAllOnesConstant(V: Mask.getOperand(i: 0))) |
3911 | return false; |
3912 | SDValue M1 = Mask.getOperand(i: 1); |
3913 | // The shift amount should not be used externally. |
3914 | if (!checkOneUse(M1)) |
3915 | return false; |
3916 | canonicalizeShiftAmt(M1, Bitwidth); |
3917 | // Pattern c. is non-canonical, and is expanded into pattern d. iff there |
3918 | // is no extra use of the mask. Clearly, there was one since we are here. |
3919 | // But at the same time, if we need to negate the shift amount, |
3920 | // then we don't want the mask to stick around, else it's unprofitable. |
3921 | return !NegateNBits; |
3922 | }; |
3923 | |
3924 | SDValue X; |
3925 | |
3926 | // d) x << z >> z but then we'll have to subtract z from bitwidth |
3927 | // or |
3928 | // d) x << (32 - y) >> (32 - y) |
3929 | auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, |
3930 | AllowExtraUsesByDefault, &NegateNBits, |
3931 | &X](SDNode *Node) -> bool { |
3932 | if (Node->getOpcode() != ISD::SRL) |
3933 | return false; |
3934 | SDValue N0 = Node->getOperand(Num: 0); |
3935 | if (N0->getOpcode() != ISD::SHL) |
3936 | return false; |
3937 | unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); |
3938 | SDValue N1 = Node->getOperand(Num: 1); |
3939 | SDValue N01 = N0->getOperand(Num: 1); |
3940 | // Both of the shifts must be by the exact same value. |
3941 | if (N1 != N01) |
3942 | return false; |
3943 | canonicalizeShiftAmt(N1, Bitwidth); |
3944 | // There should not be any external uses of the inner shift / shift amount. |
3945 | // Note that while we are generally okay with external uses given BMI2, |
3946 | // iff we need to negate the shift amount, we are not okay with extra uses. |
3947 |     const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3948 | if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) |
3949 | return false; |
3950 | X = N0->getOperand(Num: 0); |
3951 | return true; |
3952 | }; |
3953 | |
3954 | auto matchLowBitMask = [matchPatternA, matchPatternB, |
3955 | matchPatternC](SDValue Mask) -> bool { |
3956 | return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); |
3957 | }; |
3958 | |
3959 | if (Node->getOpcode() == ISD::AND) { |
3960 | X = Node->getOperand(Num: 0); |
3961 | SDValue Mask = Node->getOperand(Num: 1); |
3962 | |
3963 | if (matchLowBitMask(Mask)) { |
3964 | // Great. |
3965 | } else { |
3966 | std::swap(a&: X, b&: Mask); |
3967 | if (!matchLowBitMask(Mask)) |
3968 | return false; |
3969 | } |
3970 | } else if (matchLowBitMask(SDValue(Node, 0))) { |
3971 | X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT); |
3972 | } else if (!matchPatternD(Node)) |
3973 | return false; |
3974 | |
3975 | // If we need to negate the shift amount, require BMI2 BZHI support. |
3976 | // It's just too unprofitable for BMI1 BEXTR. |
3977 | if (NegateNBits && !Subtarget->hasBMI2()) |
3978 | return false; |
3979 | |
3980 | SDLoc DL(Node); |
3981 | |
3982 | // Truncate the shift amount. |
3983 | NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits); |
3984 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3985 | |
3986 | // Insert 8-bit NBits into lowest 8 bits of 32-bit register. |
3987 | // All the other bits are undefined, we do not care about them. |
3988 | SDValue ImplDef = SDValue( |
3989 | CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0); |
3990 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef); |
3991 | |
3992 | SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32); |
3993 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal); |
3994 | NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL, |
3995 | VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal), |
3996 | 0); |
3997 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3998 | |
3999 | // We might have matched the amount of high bits to be cleared, |
4000 | // but we want the amount of low bits to be kept, so negate it then. |
4001 | if (NegateNBits) { |
4002 | SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32); |
4003 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC); |
4004 | |
4005 | NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits); |
4006 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4007 | } |
4008 | |
4009 | if (Subtarget->hasBMI2()) { |
4010 |     // Great, just emit the BZHI.
4011 | if (NVT != MVT::i32) { |
4012 | // But have to place the bit count into the wide-enough register first. |
4013 | NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits); |
4014 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4015 | } |
4016 | |
4017 |     SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4018 | ReplaceNode(F: Node, T: Extract.getNode()); |
4019 | SelectCode(N: Extract.getNode()); |
4020 | return true; |
4021 | } |
4022 | |
4023 |   // Else, if we do *NOT* have BMI2, find out whether 'X' is *logically*
4024 |   // shifted (potentially with a one-use trunc in between), whether the
4025 |   // truncation was the only use of the shift, and if so, look past the
4026 |   // one-use truncation.
4027 | { |
4028 | SDValue RealX = peekThroughOneUseTruncation(X); |
4029 | // FIXME: only if the shift is one-use? |
4030 | if (RealX != X && RealX.getOpcode() == ISD::SRL) |
4031 | X = RealX; |
4032 | } |
4033 | |
4034 | MVT XVT = X.getSimpleValueType(); |
4035 | |
4036 | // Else, emitting BEXTR requires one more step. |
4037 | // The 'control' of BEXTR has the pattern of: |
4038 | // [15...8 bit][ 7...0 bit] location |
4039 | // [ bit count][ shift] name |
4040 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
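// For example, to keep the low 5 bits of (x >> 3), the control would be
// (5 << 8) | 3 == 0x0503: bits 15..8 hold the bit count, bits 7..0 the shift.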
4041 | |
4042 | // Shift NBits left by 8 bits, thus producing 'control'. |
4043 | // This leaves the low 8 bits zero.
4044 | SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8); |
4045 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8); |
4046 | SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8); |
4047 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4048 | |
4049 | // If the 'X' is *logically* shifted, we can fold that shift into 'control'. |
4050 | // FIXME: only if the shift is one-use? |
4051 | if (X.getOpcode() == ISD::SRL) { |
4052 | SDValue ShiftAmt = X.getOperand(i: 1); |
4053 | X = X.getOperand(i: 0); |
4054 | |
4055 | assert(ShiftAmt.getValueType() == MVT::i8 && |
4056 | "Expected shift amount to be i8" ); |
4057 | |
4058 | // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! |
4059 | // We could zext to i16 in some form, but we intentionally don't do that. |
4060 | SDValue OrigShiftAmt = ShiftAmt; |
4061 | ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt); |
4062 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt); |
4063 | |
4064 | // And now 'or' these low 8 bits of shift amount into the 'control'. |
4065 | Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt); |
4066 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4067 | } |
4068 | |
4069 | // But have to place the 'control' into the wide-enough register first. |
4070 | if (XVT != MVT::i32) { |
4071 | Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control); |
4072 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4073 | } |
4074 | |
4075 | // And finally, form the BEXTR itself. |
4076 | SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4077 | |
4078 | // 'X' was originally truncated to 'NVT'; re-apply that truncation to the result now.
4079 | if (XVT != NVT) { |
4080 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract); |
4081 | Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract); |
4082 | } |
4083 | |
4084 | ReplaceNode(F: Node, T: Extract.getNode()); |
4085 | SelectCode(N: Extract.getNode()); |
4086 | |
4087 | return true; |
4088 | } |
4089 | |
4090 | // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. |
4091 | MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { |
4092 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
4093 | SDLoc dl(Node); |
4094 | |
4095 | SDValue N0 = Node->getOperand(Num: 0); |
4096 | SDValue N1 = Node->getOperand(Num: 1); |
4097 | |
4098 | // If we have TBM we can use an immediate for the control. If we have BMI |
4099 | // we should only do this if the BEXTR instruction is implemented well. |
4100 | // Otherwise moving the control into a register makes this more costly. |
4101 | // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM |
4102 | // hoisting the move immediate would make it worthwhile with a less optimal |
4103 | // BEXTR? |
4104 | bool PreferBEXTR = |
4105 | Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); |
4106 | if (!PreferBEXTR && !Subtarget->hasBMI2()) |
4107 | return nullptr; |
4108 | |
4109 | // Must have a shift right. |
4110 | if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) |
4111 | return nullptr; |
4112 | |
4113 | // Shift can't have additional users. |
4114 | if (!N0->hasOneUse()) |
4115 | return nullptr; |
4116 | |
4117 | // Only supported for 32 and 64 bits. |
4118 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4119 | return nullptr; |
4120 | |
4121 | // Shift amount and RHS of and must be constant. |
4122 | auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1); |
4123 | auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1)); |
4124 | if (!MaskCst || !ShiftCst) |
4125 | return nullptr; |
4126 | |
4127 | // And RHS must be a mask. |
4128 | uint64_t Mask = MaskCst->getZExtValue(); |
4129 | if (!isMask_64(Value: Mask)) |
4130 | return nullptr; |
4131 | |
4132 | uint64_t Shift = ShiftCst->getZExtValue(); |
4133 | uint64_t MaskSize = llvm::popcount(Value: Mask); |
4134 | |
4135 | // Don't interfere with something that can be handled by extracting AH. |
4136 | // TODO: If we are able to fold a load, BEXTR might still be better than AH. |
4137 | if (Shift == 8 && MaskSize == 8) |
4138 | return nullptr; |
4139 | |
4140 | // Make sure we are only using bits that were in the original value, not |
4141 | // shifted in. |
4142 | if (Shift + MaskSize > NVT.getSizeInBits()) |
4143 | return nullptr; |
4144 | |
4145 | // BZHI, if available, is always fast, unlike BEXTR. But even if we decide |
4146 | // that we can't use BEXTR, it is only worthwhile using BZHI if the mask |
4147 | // does not fit into 32 bits. Load folding is not a sufficient reason. |
4148 | if (!PreferBEXTR && MaskSize <= 32) |
4149 | return nullptr; |
4150 | |
4151 | SDValue Control; |
4152 | unsigned ROpc, MOpc; |
4153 | |
4154 | #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC) |
4155 | if (!PreferBEXTR) { |
4156 | assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then." ); |
4157 | // If we can't make use of BEXTR then we can't fuse shift+mask stages. |
4158 | // Let's perform the mask first, and apply shift later. Note that we need to |
4159 | // widen the mask to account for the fact that we'll apply shift afterwards! |
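// For example, for (x >> 4) & 0xff (Shift == 4, MaskSize == 8) we emit BZHI
// with a control of 12 to keep the low 12 bits, and then shift right by 4,
// which yields the same 8 bits that BEXTR would have extracted.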
4160 | Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT); |
4161 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr) |
4162 | : GET_EGPR_IF_ENABLED(X86::BZHI32rr); |
4163 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm) |
4164 | : GET_EGPR_IF_ENABLED(X86::BZHI32rm); |
4165 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4166 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4167 | } else { |
4168 | // The 'control' of BEXTR has the pattern of: |
4169 | // [15...8 bit][ 7...0 bit] location |
4170 | // [ bit count][ shift] name |
4171 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
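// For example, (x >> 4) & 0xff (Shift == 4, MaskSize == 8) uses the control
// Shift | (MaskSize << 8) == 4 | (8 << 8) == 0x0804.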
4172 | Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT); |
4173 | if (Subtarget->hasTBM()) { |
4174 | ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; |
4175 | MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; |
4176 | } else { |
4177 | assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then." ); |
4178 | // BMI requires the immediate to be placed in a register.
4179 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr) |
4180 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rr); |
4181 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm) |
4182 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rm); |
4183 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4184 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4185 | } |
4186 | } |
4187 | |
4188 | MachineSDNode *NewNode; |
4189 | SDValue Input = N0->getOperand(Num: 0); |
4190 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4191 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4192 | SDValue Ops[] = { |
4193 | Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)}; |
4194 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
4195 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4196 | // Update the chain. |
4197 | ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2)); |
4198 | // Record the mem-refs |
4199 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()}); |
4200 | } else { |
4201 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control); |
4202 | } |
4203 | |
4204 | if (!PreferBEXTR) { |
4205 | // We still need to apply the shift. |
4206 | SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT); |
4207 | unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri) |
4208 | : GET_ND_IF_ENABLED(X86::SHR32ri); |
4209 | NewNode = |
4210 | CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt); |
4211 | } |
4212 | |
4213 | return NewNode; |
4214 | } |
4215 | |
4216 | // Emit a PCMPISTR(I/M) instruction.
4217 | MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, |
4218 | bool MayFoldLoad, const SDLoc &dl, |
4219 | MVT VT, SDNode *Node) { |
4220 | SDValue N0 = Node->getOperand(Num: 0); |
4221 | SDValue N1 = Node->getOperand(Num: 1); |
4222 | SDValue Imm = Node->getOperand(Num: 2); |
4223 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4224 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4225 | |
4226 | // Try to fold a load. No need to check alignment. |
4227 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4228 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4229 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4230 | N1.getOperand(i: 0) }; |
4231 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other); |
4232 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4233 | // Update the chain. |
4234 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2)); |
4235 | // Record the mem-refs |
4236 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
4237 | return CNode; |
4238 | } |
4239 | |
4240 | SDValue Ops[] = { N0, N1, Imm }; |
4241 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32); |
4242 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4243 | return CNode; |
4244 | } |
4245 | |
4246 | // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4247 | // to emit a second instruction after this one. This is needed since we have two |
4248 | // copyToReg nodes glued before this and we need to continue that glue through. |
4249 | MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, |
4250 | bool MayFoldLoad, const SDLoc &dl, |
4251 | MVT VT, SDNode *Node, |
4252 | SDValue &InGlue) { |
4253 | SDValue N0 = Node->getOperand(Num: 0); |
4254 | SDValue N2 = Node->getOperand(Num: 2); |
4255 | SDValue Imm = Node->getOperand(Num: 4); |
4256 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4257 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4258 | |
4259 | // Try to fold a load. No need to check alignment. |
4260 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4261 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4262 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4263 | N2.getOperand(i: 0), InGlue }; |
4264 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); |
4265 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4266 | InGlue = SDValue(CNode, 3); |
4267 | // Update the chain. |
4268 | ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2)); |
4269 | // Record the mem-refs |
4270 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()}); |
4271 | return CNode; |
4272 | } |
4273 | |
4274 | SDValue Ops[] = { N0, N2, Imm, InGlue }; |
4275 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue); |
4276 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4277 | InGlue = SDValue(CNode, 2); |
4278 | return CNode; |
4279 | } |
4280 | |
4281 | bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { |
4282 | EVT VT = N->getValueType(ResNo: 0); |
4283 | |
4284 | // Only handle scalar shifts. |
4285 | if (VT.isVector()) |
4286 | return false; |
4287 | |
4288 | // Narrower shifts only mask to 5 bits in hardware. |
4289 | unsigned Size = VT == MVT::i64 ? 64 : 32; |
4290 | |
4291 | SDValue OrigShiftAmt = N->getOperand(Num: 1); |
4292 | SDValue ShiftAmt = OrigShiftAmt; |
4293 | SDLoc DL(N); |
4294 | |
4295 | // Skip over a truncate of the shift amount. |
4296 | if (ShiftAmt->getOpcode() == ISD::TRUNCATE) |
4297 | ShiftAmt = ShiftAmt->getOperand(Num: 0); |
4298 | |
4299 | // This function is called after X86DAGToDAGISel::matchBitExtract(), |
4300 | // so we are not afraid of messing up a BZHI/BEXTR pattern.
4301 | |
4302 | SDValue NewShiftAmt; |
4303 | if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB || |
4304 | ShiftAmt->getOpcode() == ISD::XOR) { |
4305 | SDValue Add0 = ShiftAmt->getOperand(Num: 0); |
4306 | SDValue Add1 = ShiftAmt->getOperand(Num: 1); |
4307 | auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0); |
4308 | auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1); |
4309 | // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X |
4310 | // to avoid the ADD/SUB/XOR. |
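// E.g. for a 32-bit shift, x << (amt + 32) is equivalent to x << amt because
// the hardware masks the count to the low 5 bits anyway.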
4311 | if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) { |
4312 | NewShiftAmt = Add0; |
4313 | |
4314 | } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() && |
4315 | ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) || |
4316 | (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) { |
4317 | // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X |
4318 | // we can replace it with a NOT. In the XOR case it may save some code |
4319 | // size, in the SUB case it also may save a move. |
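// E.g. for a 32-bit shift, an amount of (31 - amt) is congruent to ~amt
// modulo 32, so a single NOT of the amount replaces the SUB/XOR against 31.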
4320 | assert(Add0C == nullptr || Add1C == nullptr); |
4321 | |
4322 | // We can only do N-X, not X-N |
4323 | if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr) |
4324 | return false; |
4325 | |
4326 | EVT OpVT = ShiftAmt.getValueType(); |
4327 | |
4328 | SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT); |
4329 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT, |
4330 | N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes); |
4331 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes); |
4332 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4333 | // If we are shifting by N-X where N == 0 mod Size, then just shift by |
4334 | // -X to generate a NEG instead of a SUB of a constant. |
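// E.g. for a 64-bit shift, an amount of (64 - amt) is congruent to -amt
// modulo 64, so a NEG of amt avoids materializing 64 and subtracting.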
4335 | } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C && |
4336 | Add0C->getZExtValue() != 0) { |
4337 | EVT SubVT = ShiftAmt.getValueType(); |
4338 | SDValue X; |
4339 | if (Add0C->getZExtValue() % Size == 0) |
4340 | X = Add1; |
4341 | else if (ShiftAmt.hasOneUse() && Size == 64 && |
4342 | Add0C->getZExtValue() % 32 == 0) { |
4343 | // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32). |
4344 | // This is mainly beneficial if we already compute (x+n*32). |
4345 | if (Add1.getOpcode() == ISD::TRUNCATE) { |
4346 | Add1 = Add1.getOperand(i: 0); |
4347 | SubVT = Add1.getValueType(); |
4348 | } |
4349 | if (Add0.getValueType() != SubVT) { |
4350 | Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT); |
4351 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0); |
4352 | } |
4353 | |
4354 | X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0); |
4355 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X); |
4356 | } else |
4357 | return false; |
4358 | // Insert a negate op. |
4359 | // TODO: This isn't guaranteed to replace the sub if there is a logic cone |
4360 | // that uses it that's not a shift. |
4361 | SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT); |
4362 | SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X); |
4363 | NewShiftAmt = Neg; |
4364 | |
4365 | // Insert these operands into a valid topological order so they can |
4366 | // get selected independently. |
4367 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero); |
4368 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg); |
4369 | } else |
4370 | return false; |
4371 | } else |
4372 | return false; |
4373 | |
4374 | if (NewShiftAmt.getValueType() != MVT::i8) { |
4375 | // Need to truncate the shift amount. |
4376 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt); |
4377 | // Add to a correct topological ordering. |
4378 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4379 | } |
4380 | |
4381 | // Insert a new mask to keep the shift amount legal. This should be removed |
4382 | // by isel patterns. |
4383 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt, |
4384 | N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8)); |
4385 | // Place in a correct topological ordering. |
4386 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4387 | |
4388 | SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), |
4389 | Op2: NewShiftAmt); |
4390 | if (UpdatedNode != N) { |
4391 | // If we found an existing node, we should replace ourselves with that node |
4392 | // and wait for it to be selected after its other users. |
4393 | ReplaceNode(F: N, T: UpdatedNode); |
4394 | return true; |
4395 | } |
4396 | |
4397 | // If the original shift amount is now dead, delete it so that we don't run |
4398 | // it through isel. |
4399 | if (OrigShiftAmt.getNode()->use_empty()) |
4400 | CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode()); |
4401 | |
4402 | // Now that we've optimized the shift amount, defer to normal isel to get |
4403 | // load folding and legacy vs BMI2 selection without repeating it here. |
4404 | SelectCode(N); |
4405 | return true; |
4406 | } |
4407 | |
4408 | bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { |
4409 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4410 | unsigned Opcode = N->getOpcode(); |
4411 | SDLoc dl(N); |
4412 | |
4413 | // For operations of the form (x << C1) op C2, check if we can use a smaller |
4414 | // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. |
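// For example, (x << 8) | 0x1f00 becomes (x | 0x1f) << 8: 0x1f fits in a
// sign-extended 8-bit immediate, whereas 0x1f00 needs a 32-bit immediate.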
4415 | SDValue Shift = N->getOperand(Num: 0); |
4416 | SDValue N1 = N->getOperand(Num: 1); |
4417 | |
4418 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
4419 | if (!Cst) |
4420 | return false; |
4421 | |
4422 | int64_t Val = Cst->getSExtValue(); |
4423 | |
4424 | // If we have an any_extend feeding the AND, look through it to see if there |
4425 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
4426 | // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
4427 | bool FoundAnyExtend = false; |
4428 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
4429 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
4430 | isUInt<32>(x: Val)) { |
4431 | FoundAnyExtend = true; |
4432 | Shift = Shift.getOperand(i: 0); |
4433 | } |
4434 | |
4435 | if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) |
4436 | return false; |
4437 | |
4438 | // i8 is unshrinkable, i16 should be promoted to i32. |
4439 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4440 | return false; |
4441 | |
4442 | auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)); |
4443 | if (!ShlCst) |
4444 | return false; |
4445 | |
4446 | uint64_t ShAmt = ShlCst->getZExtValue(); |
4447 | |
4448 | // Make sure that we don't change the operation by removing bits. |
4449 | // This only matters for OR and XOR, AND is unaffected. |
4450 | uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; |
4451 | if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) |
4452 | return false; |
4453 | |
4454 | // Check the minimum bitwidth for the new constant. |
4455 | // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. |
4456 | auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { |
4457 | if (Opcode == ISD::AND) { |
4458 | // AND32ri is the same as AND64ri32 with zext imm. |
4459 | // Try this before sign extended immediates below. |
4460 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4461 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4462 | return true; |
4463 | // Also swap order when the AND can become MOVZX. |
4464 | if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) |
4465 | return true; |
4466 | } |
4467 | ShiftedVal = Val >> ShAmt; |
4468 | if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) || |
4469 | (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal))) |
4470 | return true; |
4471 | if (Opcode != ISD::AND) { |
4472 | // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr |
4473 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4474 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4475 | return true; |
4476 | } |
4477 | return false; |
4478 | }; |
4479 | |
4480 | int64_t ShiftedVal; |
4481 | if (!CanShrinkImmediate(ShiftedVal)) |
4482 | return false; |
4483 | |
4484 | // Ok, we can reorder to get a smaller immediate. |
4485 | |
4486 | // But, it's possible the original immediate allowed an AND to become MOVZX.
4487 | // Doing this late to keep the MaskedValueIsZero call as late as
4488 | // possible.
4489 | if (Opcode == ISD::AND) { |
4490 | // Find the smallest zext this could possibly be. |
4491 | unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); |
4492 | ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U)); |
4493 | |
4494 | // Figure out which bits need to be zero to achieve that mask. |
4495 | APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(), |
4496 | loBitsSet: ZExtWidth); |
4497 | NeededMask &= ~Cst->getAPIntValue(); |
4498 | |
4499 | if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask)) |
4500 | return false; |
4501 | } |
4502 | |
4503 | SDValue X = Shift.getOperand(i: 0); |
4504 | if (FoundAnyExtend) { |
4505 | SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X); |
4506 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX); |
4507 | X = NewX; |
4508 | } |
4509 | |
4510 | SDValue NewCst = CurDAG->getConstant(Val: ShiftedVal, DL: dl, VT: NVT); |
4511 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst); |
4512 | SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst); |
4513 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp); |
4514 | SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp, |
4515 | N2: Shift.getOperand(i: 1)); |
4516 | ReplaceNode(F: N, T: NewSHL.getNode()); |
4517 | SelectCode(N: NewSHL.getNode()); |
4518 | return true; |
4519 | } |
4520 | |
4521 | bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, |
4522 | SDNode *ParentB, SDNode *ParentC, |
4523 | SDValue A, SDValue B, SDValue C, |
4524 | uint8_t Imm) { |
4525 | assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) && |
4526 | C.isOperandOf(ParentC) && "Incorrect parent node" ); |
4527 | |
4528 | auto tryFoldLoadOrBCast = |
4529 | [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, |
4530 | SDValue &Index, SDValue &Disp, SDValue &Segment) { |
4531 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4532 | return true; |
4533 | |
4534 | // Not a load, check for broadcast which may be behind a bitcast. |
4535 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4536 | P = L.getNode(); |
4537 | L = L.getOperand(i: 0); |
4538 | } |
4539 | |
4540 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4541 | return false; |
4542 | |
4543 | // Only 32 and 64 bit broadcasts are supported. |
4544 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4545 | unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); |
4546 | if (Size != 32 && Size != 64) |
4547 | return false; |
4548 | |
4549 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
4550 | }; |
4551 | |
4552 | bool FoldedLoad = false; |
4553 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4554 | if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { |
4555 | FoldedLoad = true; |
4556 | } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, |
4557 | Tmp4)) { |
4558 | FoldedLoad = true; |
4559 | std::swap(a&: A, b&: C); |
4560 | // Swap bits 1/4 and 3/6. |
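// (Bit i of the immediate is the result for the input combination where A,
// B, C supply bits 2, 1, 0 of i, so swapping operands A and C swaps
// immediate bits 0b001<->0b100 and 0b011<->0b110; the rest stay in place.)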
4561 | uint8_t OldImm = Imm; |
4562 | Imm = OldImm & 0xa5; |
4563 | if (OldImm & 0x02) Imm |= 0x10; |
4564 | if (OldImm & 0x10) Imm |= 0x02; |
4565 | if (OldImm & 0x08) Imm |= 0x40; |
4566 | if (OldImm & 0x40) Imm |= 0x08; |
4567 | } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3, |
4568 | Tmp4)) { |
4569 | FoldedLoad = true; |
4570 | std::swap(a&: B, b&: C); |
4571 | // Swap bits 1/2 and 5/6. |
4572 | uint8_t OldImm = Imm; |
4573 | Imm = OldImm & 0x99; |
4574 | if (OldImm & 0x02) Imm |= 0x04; |
4575 | if (OldImm & 0x04) Imm |= 0x02; |
4576 | if (OldImm & 0x20) Imm |= 0x40; |
4577 | if (OldImm & 0x40) Imm |= 0x20; |
4578 | } |
4579 | |
4580 | SDLoc DL(Root); |
4581 | |
4582 | SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
4583 | |
4584 | MVT NVT = Root->getSimpleValueType(ResNo: 0); |
4585 | |
4586 | MachineSDNode *MNode; |
4587 | if (FoldedLoad) { |
4588 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
4589 | |
4590 | unsigned Opc; |
4591 | if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { |
4592 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C); |
4593 | unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); |
4594 | assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!" ); |
4595 | |
4596 | bool UseD = EltSize == 32; |
4597 | if (NVT.is128BitVector()) |
4598 | Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; |
4599 | else if (NVT.is256BitVector()) |
4600 | Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; |
4601 | else if (NVT.is512BitVector()) |
4602 | Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; |
4603 | else |
4604 | llvm_unreachable("Unexpected vector size!" ); |
4605 | } else { |
4606 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4607 | if (NVT.is128BitVector()) |
4608 | Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; |
4609 | else if (NVT.is256BitVector()) |
4610 | Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; |
4611 | else if (NVT.is512BitVector()) |
4612 | Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; |
4613 | else |
4614 | llvm_unreachable("Unexpected vector size!" ); |
4615 | } |
4616 | |
4617 | SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)}; |
4618 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops); |
4619 | |
4620 | // Update the chain. |
4621 | ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1)); |
4622 | // Record the mem-refs |
4623 | CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()}); |
4624 | } else { |
4625 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4626 | unsigned Opc; |
4627 | if (NVT.is128BitVector()) |
4628 | Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; |
4629 | else if (NVT.is256BitVector()) |
4630 | Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; |
4631 | else if (NVT.is512BitVector()) |
4632 | Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; |
4633 | else |
4634 | llvm_unreachable("Unexpected vector size!" ); |
4635 | |
4636 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm}); |
4637 | } |
4638 | |
4639 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0)); |
4640 | CurDAG->RemoveDeadNode(N: Root); |
4641 | return true; |
4642 | } |
4643 | |
4644 | // Try to match two logic ops to a VPTERNLOG. |
4645 | // FIXME: Handle more complex patterns that use an operand more than once? |
4646 | bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { |
4647 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4648 | |
4649 | // Make sure we support VPTERNLOG. |
4650 | if (!NVT.isVector() || !Subtarget->hasAVX512() || |
4651 | NVT.getVectorElementType() == MVT::i1) |
4652 | return false; |
4653 | |
4654 | // We need VLX for 128/256-bit. |
4655 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
4656 | return false; |
4657 | |
4658 | SDValue N0 = N->getOperand(Num: 0); |
4659 | SDValue N1 = N->getOperand(Num: 1); |
4660 | |
4661 | auto getFoldableLogicOp = [](SDValue Op) { |
4662 | // Peek through single use bitcast. |
4663 | if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) |
4664 | Op = Op.getOperand(i: 0); |
4665 | |
4666 | if (!Op.hasOneUse()) |
4667 | return SDValue(); |
4668 | |
4669 | unsigned Opc = Op.getOpcode(); |
4670 | if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || |
4671 | Opc == X86ISD::ANDNP) |
4672 | return Op; |
4673 | |
4674 | return SDValue(); |
4675 | }; |
4676 | |
4677 | SDValue A, FoldableOp; |
4678 | if ((FoldableOp = getFoldableLogicOp(N1))) { |
4679 | A = N0; |
4680 | } else if ((FoldableOp = getFoldableLogicOp(N0))) { |
4681 | A = N1; |
4682 | } else |
4683 | return false; |
4684 | |
4685 | SDValue B = FoldableOp.getOperand(i: 0); |
4686 | SDValue C = FoldableOp.getOperand(i: 1); |
4687 | SDNode *ParentA = N; |
4688 | SDNode *ParentB = FoldableOp.getNode(); |
4689 | SDNode *ParentC = FoldableOp.getNode(); |
4690 | |
4691 | // We can build the appropriate control immediate by performing the logic |
4692 | // operation we're matching using these constants for A, B, and C. |
4693 | uint8_t TernlogMagicA = 0xf0; |
4694 | uint8_t TernlogMagicB = 0xcc; |
4695 | uint8_t TernlogMagicC = 0xaa; |
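// For example, matching A & (B | C) evaluates 0xf0 & (0xcc | 0xaa) == 0xe0,
// which is exactly the VPTERNLOG immediate for that expression.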
4696 | |
4697 | // Some of the inputs may be inverted, peek through them and invert the |
4698 | // magic values accordingly. |
4699 | // TODO: There may be a bitcast before the xor that we should peek through. |
4700 | auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { |
4701 | if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && |
4702 | ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) { |
4703 | Magic = ~Magic; |
4704 | Parent = Op.getNode(); |
4705 | Op = Op.getOperand(i: 0); |
4706 | } |
4707 | }; |
4708 | |
4709 | PeekThroughNot(A, ParentA, TernlogMagicA); |
4710 | PeekThroughNot(B, ParentB, TernlogMagicB); |
4711 | PeekThroughNot(C, ParentC, TernlogMagicC); |
4712 | |
4713 | uint8_t Imm; |
4714 | switch (FoldableOp.getOpcode()) { |
4715 | default: llvm_unreachable("Unexpected opcode!" ); |
4716 | case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; |
4717 | case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; |
4718 | case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; |
4719 | case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; |
4720 | } |
4721 | |
4722 | switch (N->getOpcode()) { |
4723 | default: llvm_unreachable("Unexpected opcode!" ); |
4724 | case X86ISD::ANDNP: |
4725 | if (A == N0) |
4726 | Imm &= ~TernlogMagicA; |
4727 | else |
4728 | Imm = ~(Imm) & TernlogMagicA; |
4729 | break; |
4730 | case ISD::AND: Imm &= TernlogMagicA; break; |
4731 | case ISD::OR: Imm |= TernlogMagicA; break; |
4732 | case ISD::XOR: Imm ^= TernlogMagicA; break; |
4733 | } |
4734 | |
4735 | return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm); |
4736 | } |
4737 | |
4738 | /// If the high bits of an 'and' operand are known zero, try setting the |
4739 | /// high bits of an 'and' constant operand to produce a smaller encoding by |
4740 | /// creating a small, sign-extended negative immediate rather than a large |
4741 | /// positive one. This reverses a transform in SimplifyDemandedBits that |
4742 | /// shrinks mask constants by clearing bits. There is also a possibility that |
4743 | /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that |
4744 | /// case, just replace the 'and'. Return 'true' if the node is replaced. |
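/// For example, if the upper bits of the source are known zero, (x & 0xf0)
/// can instead use the mask -16 (0xfffffff0), which encodes as a
/// sign-extended 8-bit immediate while 0xf0 would need a 32-bit immediate.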
4745 | bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { |
4746 | // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't |
4747 | // have immediate operands. |
4748 | MVT VT = And->getSimpleValueType(ResNo: 0); |
4749 | if (VT != MVT::i32 && VT != MVT::i64) |
4750 | return false; |
4751 | |
4752 | auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1)); |
4753 | if (!And1C) |
4754 | return false; |
4755 | |
4756 | // Bail out if the mask constant is already negative. It can't shrink any more.
4757 | // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel |
4758 | // patterns to use a 32-bit and instead of a 64-bit and by relying on the |
4759 | // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits |
4760 | // are negative too. |
4761 | APInt MaskVal = And1C->getAPIntValue(); |
4762 | unsigned MaskLZ = MaskVal.countl_zero(); |
4763 | if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) |
4764 | return false; |
4765 | |
4766 | // Don't extend into the upper 32 bits of a 64 bit mask. |
4767 | if (VT == MVT::i64 && MaskLZ >= 32) { |
4768 | MaskLZ -= 32; |
4769 | MaskVal = MaskVal.trunc(width: 32); |
4770 | } |
4771 | |
4772 | SDValue And0 = And->getOperand(Num: 0); |
4773 | APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ); |
4774 | APInt NegMaskVal = MaskVal | HighZeros; |
4775 | |
4776 | // If a negative constant would not allow a smaller encoding, there's no need |
4777 | // to continue. Only change the constant when we know it's a win. |
4778 | unsigned MinWidth = NegMaskVal.getSignificantBits(); |
4779 | if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32)) |
4780 | return false; |
4781 | |
4782 | // Extend masks if we truncated above. |
4783 | if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { |
4784 | NegMaskVal = NegMaskVal.zext(width: 64); |
4785 | HighZeros = HighZeros.zext(width: 64); |
4786 | } |
4787 | |
4788 | // The variable operand must be all zeros in the top bits to allow using the |
4789 | // new, negative constant as the mask. |
4790 | if (!CurDAG->MaskedValueIsZero(Op: And0, Mask: HighZeros)) |
4791 | return false; |
4792 | |
4793 | // Check if the mask is -1. In that case, this is an unnecessary instruction |
4794 | // that escaped earlier analysis. |
4795 | if (NegMaskVal.isAllOnes()) { |
4796 | ReplaceNode(F: And, T: And0.getNode()); |
4797 | return true; |
4798 | } |
4799 | |
4800 | // A negative mask allows a smaller encoding. Create a new 'and' node. |
4801 | SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT); |
4802 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask); |
4803 | SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask); |
4804 | ReplaceNode(F: And, T: NewAnd.getNode()); |
4805 | SelectCode(N: NewAnd.getNode()); |
4806 | return true; |
4807 | } |
4808 | |
4809 | static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, |
4810 | bool FoldedBCast, bool Masked) { |
4811 | #define VPTESTM_CASE(VT, SUFFIX) \ |
4812 | case MVT::VT: \ |
4813 | if (Masked) \ |
4814 | return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ |
4815 | return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; |
4816 | |
4817 | |
4818 | #define VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4819 | default: llvm_unreachable("Unexpected VT!"); \ |
4820 | VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ |
4821 | VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ |
4822 | VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ |
4823 | VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ |
4824 | VPTESTM_CASE(v16i32, DZ##SUFFIX) \ |
4825 | VPTESTM_CASE(v8i64, QZ##SUFFIX) |
4826 | |
4827 | #define VPTESTM_FULL_CASES(SUFFIX) \ |
4828 | VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4829 | VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ |
4830 | VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ |
4831 | VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ |
4832 | VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ |
4833 | VPTESTM_CASE(v64i8, BZ##SUFFIX) \ |
4834 | VPTESTM_CASE(v32i16, WZ##SUFFIX) |
4835 | |
4836 | if (FoldedBCast) { |
4837 | switch (TestVT.SimpleTy) { |
4838 | VPTESTM_BROADCAST_CASES(rmb) |
4839 | } |
4840 | } |
4841 | |
4842 | if (FoldedLoad) { |
4843 | switch (TestVT.SimpleTy) { |
4844 | VPTESTM_FULL_CASES(rm) |
4845 | } |
4846 | } |
4847 | |
4848 | switch (TestVT.SimpleTy) { |
4849 | VPTESTM_FULL_CASES(rr) |
4850 | } |
4851 | |
4852 | #undef VPTESTM_FULL_CASES |
4853 | #undef VPTESTM_BROADCAST_CASES |
4854 | #undef VPTESTM_CASE |
4855 | } |
4856 | |
4857 | // Try to create VPTESTM instruction. If InMask is not null, it will be used |
4858 | // to form a masked operation. |
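// VPTESTM sets mask bit i when (Src0[i] & Src1[i]) != 0, so a SETNE compare
// of an AND against zero maps directly to it, and SETEQ maps to VPTESTNM.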
4859 | bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, |
4860 | SDValue InMask) { |
4861 | assert(Subtarget->hasAVX512() && "Expected AVX512!" ); |
4862 | assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && |
4863 | "Unexpected VT!" ); |
4864 | |
4865 | // Look for equal and not equal compares. |
4866 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get(); |
4867 | if (CC != ISD::SETEQ && CC != ISD::SETNE) |
4868 | return false; |
4869 | |
4870 | SDValue SetccOp0 = Setcc.getOperand(i: 0); |
4871 | SDValue SetccOp1 = Setcc.getOperand(i: 1); |
4872 | |
4873 | // Canonicalize the all zero vector to the RHS. |
4874 | if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode())) |
4875 | std::swap(a&: SetccOp0, b&: SetccOp1); |
4876 | |
4877 | // See if we're comparing against zero. |
4878 | if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode())) |
4879 | return false; |
4880 | |
4881 | SDValue N0 = SetccOp0; |
4882 | |
4883 | MVT CmpVT = N0.getSimpleValueType(); |
4884 | MVT CmpSVT = CmpVT.getVectorElementType(); |
4885 | |
4886 | // Start with both operands the same. We'll try to refine this. |
4887 | SDValue Src0 = N0; |
4888 | SDValue Src1 = N0; |
4889 | |
4890 | { |
4891 | // Look through single use bitcasts. |
4892 | SDValue N0Temp = N0; |
4893 | if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) |
4894 | N0Temp = N0.getOperand(i: 0); |
4895 | |
4896 | // Look for single use AND. |
4897 | if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { |
4898 | Src0 = N0Temp.getOperand(i: 0); |
4899 | Src1 = N0Temp.getOperand(i: 1); |
4900 | } |
4901 | } |
4902 | |
4903 | // Without VLX we need to widen the operation. |
4904 | bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); |
4905 | |
4906 | auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, |
4907 | SDValue &Base, SDValue &Scale, SDValue &Index, |
4908 | SDValue &Disp, SDValue &Segment) { |
4909 | // If we need to widen, we can't fold the load. |
4910 | if (!Widen) |
4911 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4912 | return true; |
4913 | |
4914 | // If we didn't fold a load, try to match broadcast. No widening limitation |
4915 | // for this. But only 32 and 64 bit types are supported. |
4916 | if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) |
4917 | return false; |
4918 | |
4919 | // Look through single use bitcasts. |
4920 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4921 | P = L.getNode(); |
4922 | L = L.getOperand(i: 0); |
4923 | } |
4924 | |
4925 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4926 | return false; |
4927 | |
4928 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4929 | if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) |
4930 | return false; |
4931 | |
4932 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
4933 | }; |
4934 | |
4935 | // We can only fold loads if the sources are unique. |
4936 | bool CanFoldLoads = Src0 != Src1; |
4937 | |
4938 | bool FoldedLoad = false; |
4939 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4940 | if (CanFoldLoads) { |
4941 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, |
4942 | Tmp3, Tmp4); |
4943 | if (!FoldedLoad) { |
4944 | // And is commutative. |
4945 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, |
4946 | Tmp2, Tmp3, Tmp4); |
4947 | if (FoldedLoad) |
4948 | std::swap(a&: Src0, b&: Src1); |
4949 | } |
4950 | } |
4951 | |
4952 | bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; |
4953 | |
4954 | bool IsMasked = InMask.getNode() != nullptr; |
4955 | |
4956 | SDLoc dl(Root); |
4957 | |
4958 | MVT ResVT = Setcc.getSimpleValueType(); |
4959 | MVT MaskVT = ResVT; |
4960 | if (Widen) { |
4961 | // Widen the inputs using insert_subreg or copy_to_regclass. |
4962 | unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; |
4963 | unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; |
4964 | unsigned NumElts = CmpVT.getVectorNumElements() * Scale; |
4965 | CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts); |
4966 | MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts); |
4967 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl, |
4968 | VT: CmpVT), 0); |
4969 | Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0); |
4970 | |
4971 | if (!FoldedBCast) |
4972 | Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1); |
4973 | |
4974 | if (IsMasked) { |
4975 | // Widen the mask. |
4976 | unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID(); |
4977 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
4978 | InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
4979 | dl, VT: MaskVT, Op1: InMask, Op2: RC), 0); |
4980 | } |
4981 | } |
4982 | |
4983 | bool IsTestN = CC == ISD::SETEQ; |
4984 | unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast, |
4985 | Masked: IsMasked); |
4986 | |
4987 | MachineSDNode *CNode; |
4988 | if (FoldedLoad) { |
4989 | SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other); |
4990 | |
4991 | if (IsMasked) { |
4992 | SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
4993 | Src1.getOperand(i: 0) }; |
4994 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
4995 | } else { |
4996 | SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
4997 | Src1.getOperand(i: 0) }; |
4998 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
4999 | } |
5000 | |
5001 | // Update the chain. |
5002 | ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1)); |
5003 | // Record the mem-refs |
5004 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()}); |
5005 | } else { |
5006 | if (IsMasked) |
5007 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1); |
5008 | else |
5009 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1); |
5010 | } |
5011 | |
5012 | // If we widened, we need to shrink the mask VT. |
5013 | if (Widen) { |
5014 | unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID(); |
5015 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
5016 | CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
5017 | dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC); |
5018 | } |
5019 | |
5020 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0)); |
5021 | CurDAG->RemoveDeadNode(N: Root); |
5022 | return true; |
5023 | } |
5024 | |
5025 | // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it |
5026 | // into vpternlog. |
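// The immediate 0xCA used below is the bitselect truth table: with the usual
// ternlog magics A = 0xf0, B = 0xcc, C = 0xaa, (A & B) | (~A & C) evaluates
// to (0xf0 & 0xcc) | (0x0f & 0xaa) == 0xc0 | 0x0a == 0xca.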
5027 | bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { |
5028 | assert(N->getOpcode() == ISD::OR && "Unexpected opcode!" ); |
5029 | |
5030 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
5031 | |
5032 | // Make sure we support VPTERNLOG. |
5033 | if (!NVT.isVector() || !Subtarget->hasAVX512()) |
5034 | return false; |
5035 | |
5036 | // We need VLX for 128/256-bit. |
5037 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
5038 | return false; |
5039 | |
5040 | SDValue N0 = N->getOperand(Num: 0); |
5041 | SDValue N1 = N->getOperand(Num: 1); |
5042 | |
5043 | // Canonicalize AND to LHS. |
5044 | if (N1.getOpcode() == ISD::AND) |
5045 | std::swap(a&: N0, b&: N1); |
5046 | |
5047 | if (N0.getOpcode() != ISD::AND || |
5048 | N1.getOpcode() != X86ISD::ANDNP || |
5049 | !N0.hasOneUse() || !N1.hasOneUse()) |
5050 | return false; |
5051 | |
5052 | // ANDN is not commutable, use it to pin down A and C.
5053 | SDValue A = N1.getOperand(i: 0); |
5054 | SDValue C = N1.getOperand(i: 1); |
5055 | |
5056 | // AND is commutable, if one operand matches A, the other operand is B. |
5057 | // Otherwise this isn't a match. |
5058 | SDValue B; |
5059 | if (N0.getOperand(i: 0) == A) |
5060 | B = N0.getOperand(i: 1); |
5061 | else if (N0.getOperand(i: 1) == A) |
5062 | B = N0.getOperand(i: 0); |
5063 | else |
5064 | return false; |
5065 | |
5066 | SDLoc dl(N); |
5067 | SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8); |
5068 | SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm); |
5069 | ReplaceNode(F: N, T: Ternlog.getNode()); |
5070 | |
5071 | return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(), |
5072 | ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA); |
5073 | } |
5074 | |
5075 | void X86DAGToDAGISel::Select(SDNode *Node) { |
5076 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
5077 | unsigned Opcode = Node->getOpcode(); |
5078 | SDLoc dl(Node); |
5079 | |
5080 | if (Node->isMachineOpcode()) { |
5081 | LLVM_DEBUG(dbgs() << "== " ; Node->dump(CurDAG); dbgs() << '\n'); |
5082 | Node->setNodeId(-1); |
5083 | return; // Already selected. |
5084 | } |
5085 | |
5086 | switch (Opcode) { |
5087 | default: break; |
5088 | case ISD::INTRINSIC_W_CHAIN: { |
5089 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5090 | switch (IntNo) { |
5091 | default: break; |
5092 | case Intrinsic::x86_encodekey128: |
5093 | case Intrinsic::x86_encodekey256: { |
5094 | if (!Subtarget->hasKL()) |
5095 | break; |
5096 | |
5097 | unsigned Opcode; |
5098 | switch (IntNo) { |
5099 | default: llvm_unreachable("Impossible intrinsic" ); |
5100 | case Intrinsic::x86_encodekey128: |
5101 | Opcode = X86::ENCODEKEY128; |
5102 | break; |
5103 | case Intrinsic::x86_encodekey256: |
5104 | Opcode = X86::ENCODEKEY256; |
5105 | break; |
5106 | } |
5107 | |
5108 | SDValue Chain = Node->getOperand(Num: 0); |
5109 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3), |
5110 | Glue: SDValue()); |
5111 | if (Opcode == X86::ENCODEKEY256) |
5112 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4), |
5113 | Glue: Chain.getValue(R: 1)); |
5114 | |
5115 | MachineSDNode *Res = CurDAG->getMachineNode( |
5116 | Opcode, dl, VTs: Node->getVTList(), |
5117 | Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)}); |
5118 | ReplaceNode(F: Node, T: Res); |
5119 | return; |
5120 | } |
5121 | case Intrinsic::x86_tileloadd64_internal: |
5122 | case Intrinsic::x86_tileloaddt164_internal: { |
5123 | if (!Subtarget->hasAMXTILE()) |
5124 | break; |
5125 | auto *MFI = |
5126 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5127 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5128 | unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal |
5129 | ? X86::PTILELOADDV |
5130 | : X86::PTILELOADDT1V; |
5131 | // _tile_loadd_internal(row, col, buf, STRIDE) |
5132 | SDValue Base = Node->getOperand(Num: 4); |
5133 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5134 | SDValue Index = Node->getOperand(Num: 5); |
5135 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5136 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5137 | SDValue Chain = Node->getOperand(Num: 0); |
5138 | MachineSDNode *CNode; |
5139 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5140 | Node->getOperand(Num: 3), |
5141 | Base, |
5142 | Scale, |
5143 | Index, |
5144 | Disp, |
5145 | Segment, |
5146 | Chain}; |
5147 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops); |
5148 | ReplaceNode(F: Node, T: CNode); |
5149 | return; |
5150 | } |
5151 | } |
5152 | break; |
5153 | } |
5154 | case ISD::INTRINSIC_VOID: { |
5155 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5156 | switch (IntNo) { |
5157 | default: break; |
5158 | case Intrinsic::x86_sse3_monitor: |
5159 | case Intrinsic::x86_monitorx: |
5160 | case Intrinsic::x86_clzero: { |
5161 | bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64; |
5162 | |
5163 | unsigned Opc = 0; |
5164 | switch (IntNo) { |
5165 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5166 | case Intrinsic::x86_sse3_monitor: |
5167 | if (!Subtarget->hasSSE3()) |
5168 | break; |
5169 | Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; |
5170 | break; |
5171 | case Intrinsic::x86_monitorx: |
5172 | if (!Subtarget->hasMWAITX()) |
5173 | break; |
5174 | Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; |
5175 | break; |
5176 | case Intrinsic::x86_clzero: |
5177 | if (!Subtarget->hasCLZERO()) |
5178 | break; |
5179 | Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; |
5180 | break; |
5181 | } |
5182 | |
5183 | if (Opc) { |
5184 | unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; |
5185 | SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg, |
5186 | N: Node->getOperand(Num: 2), Glue: SDValue()); |
5187 | SDValue InGlue = Chain.getValue(R: 1); |
5188 | |
5189 | if (IntNo == Intrinsic::x86_sse3_monitor || |
5190 | IntNo == Intrinsic::x86_monitorx) { |
5191 | // Copy the other two operands to ECX and EDX. |
5192 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3), |
5193 | Glue: InGlue); |
5194 | InGlue = Chain.getValue(R: 1); |
5195 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4), |
5196 | Glue: InGlue); |
5197 | InGlue = Chain.getValue(R: 1); |
5198 | } |
5199 | |
5200 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, |
5201 | Ops: { Chain, InGlue}); |
5202 | ReplaceNode(F: Node, T: CNode); |
5203 | return; |
5204 | } |
5205 | |
5206 | break; |
5207 | } |
5208 | case Intrinsic::x86_tilestored64_internal: { |
5209 | auto *MFI = |
5210 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5211 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5212 | unsigned Opc = X86::PTILESTOREDV; |
5213 | // _tile_stored_internal(row, col, buf, STRIDE, c) |
5214 | SDValue Base = Node->getOperand(Num: 4); |
5215 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5216 | SDValue Index = Node->getOperand(Num: 5); |
5217 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5218 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5219 | SDValue Chain = Node->getOperand(Num: 0); |
5220 | MachineSDNode *CNode; |
5221 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5222 | Node->getOperand(Num: 3), |
5223 | Base, |
5224 | Scale, |
5225 | Index, |
5226 | Disp, |
5227 | Segment, |
5228 | Node->getOperand(Num: 6), |
5229 | Chain}; |
5230 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5231 | ReplaceNode(F: Node, T: CNode); |
5232 | return; |
5233 | } |
5234 | case Intrinsic::x86_tileloadd64: |
5235 | case Intrinsic::x86_tileloaddt164: |
5236 | case Intrinsic::x86_tilestored64: { |
5237 | if (!Subtarget->hasAMXTILE()) |
5238 | break; |
5239 | auto *MFI = |
5240 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5241 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
5242 | unsigned Opc; |
5243 | switch (IntNo) { |
5244 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5245 | case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; |
5246 | case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; |
5247 | case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; |
5248 | } |
5249 | // FIXME: Match displacement and scale. |
5250 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5251 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5252 | SDValue Base = Node->getOperand(Num: 3); |
5253 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5254 | SDValue Index = Node->getOperand(Num: 4); |
5255 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5256 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5257 | SDValue Chain = Node->getOperand(Num: 0); |
5258 | MachineSDNode *CNode; |
5259 | if (Opc == X86::PTILESTORED) { |
5260 | SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; |
5261 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5262 | } else { |
5263 | SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; |
5264 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5265 | } |
5266 | ReplaceNode(F: Node, T: CNode); |
5267 | return; |
5268 | } |
5269 | } |
5270 | break; |
5271 | } |
5272 | case ISD::BRIND: |
5273 | case X86ISD::NT_BRIND: { |
5274 | if (Subtarget->isTargetNaCl()) |
5275 | // NaCl has its own pass where jmp %r32 instructions are converted to
5276 | // jmp %r64. We leave the instruction alone.
5277 | break; |
5278 | if (Subtarget->isTarget64BitILP32()) { |
5279 | // Converts a 32-bit register to a 64-bit, zero-extended version of |
5280 | // it. This is needed because x86-64 can do many things, but jmp %r32 |
5281 | // ain't one of them. |
5282 | SDValue Target = Node->getOperand(Num: 1); |
5283 | assert(Target.getValueType() == MVT::i32 && "Unexpected VT!" ); |
5284 | SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64); |
5285 | SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other, |
5286 | N1: Node->getOperand(Num: 0), N2: ZextTarget); |
5287 | ReplaceNode(F: Node, T: Brind.getNode()); |
5288 | SelectCode(N: ZextTarget.getNode()); |
5289 | SelectCode(N: Brind.getNode()); |
5290 | return; |
5291 | } |
5292 | break; |
5293 | } |
5294 | case X86ISD::GlobalBaseReg: |
5295 | ReplaceNode(F: Node, T: getGlobalBaseReg()); |
5296 | return; |
5297 | |
5298 | case ISD::BITCAST: |
5299 | // Just drop all 128/256/512-bit bitcasts. |
5300 | if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || |
5301 | NVT == MVT::f128) { |
5302 | ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0)); |
5303 | CurDAG->RemoveDeadNode(N: Node); |
5304 | return; |
5305 | } |
5306 | break; |
5307 | |
5308 | case ISD::SRL: |
5309 | if (matchBitExtract(Node)) |
5310 | return; |
5311 | [[fallthrough]]; |
5312 | case ISD::SRA: |
5313 | case ISD::SHL: |
5314 | if (tryShiftAmountMod(N: Node)) |
5315 | return; |
5316 | break; |
5317 | |
5318 | case X86ISD::VPTERNLOG: { |
5319 | uint8_t Imm = Node->getConstantOperandVal(Num: 3); |
5320 | if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0), |
5321 | B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm)) |
5322 | return; |
5323 | break; |
5324 | } |
5325 | |
5326 | case X86ISD::ANDNP: |
5327 | if (tryVPTERNLOG(N: Node)) |
5328 | return; |
5329 | break; |
5330 | |
5331 | case ISD::AND: |
5332 | if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { |
5333 | // Try to form a masked VPTESTM. Operands can be in either order. |
5334 | SDValue N0 = Node->getOperand(Num: 0); |
5335 | SDValue N1 = Node->getOperand(Num: 1); |
5336 | if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && |
5337 | tryVPTESTM(Root: Node, Setcc: N0, InMask: N1)) |
5338 | return; |
5339 | if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && |
5340 | tryVPTESTM(Root: Node, Setcc: N1, InMask: N0)) |
5341 | return; |
5342 | } |
5343 | |
5344 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { |
5345 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
5346 | CurDAG->RemoveDeadNode(N: Node); |
5347 | return; |
5348 | } |
5349 | if (matchBitExtract(Node)) |
5350 | return; |
5351 | if (AndImmShrink && shrinkAndImmediate(And: Node)) |
5352 | return; |
5353 | |
5354 | [[fallthrough]]; |
5355 | case ISD::OR: |
5356 | case ISD::XOR: |
5357 | if (tryShrinkShlLogicImm(N: Node)) |
5358 | return; |
5359 | if (Opcode == ISD::OR && tryMatchBitSelect(N: Node)) |
5360 | return; |
5361 | if (tryVPTERNLOG(N: Node)) |
5362 | return; |
5363 | |
5364 | [[fallthrough]]; |
5365 | case ISD::ADD: |
5366 | if (Opcode == ISD::ADD && matchBitExtract(Node)) |
5367 | return; |
5368 | [[fallthrough]]; |
5369 | case ISD::SUB: { |
5370 | // Try to avoid folding immediates with multiple uses for optsize. |
5371 | // This code tries to select to register form directly to avoid going |
5372 | // through the isel table which might fold the immediate. We can't change |
5373 | // the patterns on the add/sub/and/or/xor with immediate paterns in the |
5374 | // tablegen files to check immediate use count without making the patterns |
5375 | // unavailable to the fast-isel table. |
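// For example, if the same large immediate is used by several adds under
// optsize, materializing it once in a register and selecting ADD32rr here is
// smaller than repeating a 4-byte imm32 in every ADD32ri.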
5376 | if (!CurDAG->shouldOptForSize()) |
5377 | break; |
5378 | |
5379 | // Only handle i8/i16/i32/i64. |
5380 | if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) |
5381 | break; |
5382 | |
5383 | SDValue N0 = Node->getOperand(Num: 0); |
5384 | SDValue N1 = Node->getOperand(Num: 1); |
5385 | |
5386 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
5387 | if (!Cst) |
5388 | break; |
5389 | |
5390 | int64_t Val = Cst->getSExtValue(); |
5391 | |
// Make sure it's an immediate that is considered foldable.
5393 | // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. |
5394 | if (!isInt<8>(x: Val) && !isInt<32>(x: Val)) |
5395 | break; |
5396 | |
5397 | // If this can match to INC/DEC, let it go. |
5398 | if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) |
5399 | break; |
5400 | |
5401 | // Check if we should avoid folding this immediate. |
5402 | if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode())) |
5403 | break; |
5404 | |
5405 | // We should not fold the immediate. So we need a register form instead. |
5406 | unsigned ROpc, MOpc; |
5407 | switch (NVT.SimpleTy) { |
5408 | default: llvm_unreachable("Unexpected VT!" ); |
5409 | case MVT::i8: |
5410 | switch (Opcode) { |
5411 | default: llvm_unreachable("Unexpected opcode!" ); |
5412 | case ISD::ADD: |
5413 | ROpc = GET_ND_IF_ENABLED(X86::ADD8rr); |
5414 | MOpc = GET_ND_IF_ENABLED(X86::ADD8rm); |
5415 | break; |
5416 | case ISD::SUB: |
5417 | ROpc = GET_ND_IF_ENABLED(X86::SUB8rr); |
5418 | MOpc = GET_ND_IF_ENABLED(X86::SUB8rm); |
5419 | break; |
5420 | case ISD::AND: |
5421 | ROpc = GET_ND_IF_ENABLED(X86::AND8rr); |
5422 | MOpc = GET_ND_IF_ENABLED(X86::AND8rm); |
5423 | break; |
5424 | case ISD::OR: |
5425 | ROpc = GET_ND_IF_ENABLED(X86::OR8rr); |
5426 | MOpc = GET_ND_IF_ENABLED(X86::OR8rm); |
5427 | break; |
5428 | case ISD::XOR: |
5429 | ROpc = GET_ND_IF_ENABLED(X86::XOR8rr); |
5430 | MOpc = GET_ND_IF_ENABLED(X86::XOR8rm); |
5431 | break; |
5432 | } |
5433 | break; |
5434 | case MVT::i16: |
5435 | switch (Opcode) { |
5436 | default: llvm_unreachable("Unexpected opcode!" ); |
5437 | case ISD::ADD: |
5438 | ROpc = GET_ND_IF_ENABLED(X86::ADD16rr); |
5439 | MOpc = GET_ND_IF_ENABLED(X86::ADD16rm); |
5440 | break; |
5441 | case ISD::SUB: |
5442 | ROpc = GET_ND_IF_ENABLED(X86::SUB16rr); |
5443 | MOpc = GET_ND_IF_ENABLED(X86::SUB16rm); |
5444 | break; |
5445 | case ISD::AND: |
5446 | ROpc = GET_ND_IF_ENABLED(X86::AND16rr); |
5447 | MOpc = GET_ND_IF_ENABLED(X86::AND16rm); |
5448 | break; |
5449 | case ISD::OR: |
5450 | ROpc = GET_ND_IF_ENABLED(X86::OR16rr); |
5451 | MOpc = GET_ND_IF_ENABLED(X86::OR16rm); |
5452 | break; |
5453 | case ISD::XOR: |
5454 | ROpc = GET_ND_IF_ENABLED(X86::XOR16rr); |
5455 | MOpc = GET_ND_IF_ENABLED(X86::XOR16rm); |
5456 | break; |
5457 | } |
5458 | break; |
5459 | case MVT::i32: |
5460 | switch (Opcode) { |
5461 | default: llvm_unreachable("Unexpected opcode!" ); |
5462 | case ISD::ADD: |
5463 | ROpc = GET_ND_IF_ENABLED(X86::ADD32rr); |
5464 | MOpc = GET_ND_IF_ENABLED(X86::ADD32rm); |
5465 | break; |
5466 | case ISD::SUB: |
5467 | ROpc = GET_ND_IF_ENABLED(X86::SUB32rr); |
5468 | MOpc = GET_ND_IF_ENABLED(X86::SUB32rm); |
5469 | break; |
5470 | case ISD::AND: |
5471 | ROpc = GET_ND_IF_ENABLED(X86::AND32rr); |
5472 | MOpc = GET_ND_IF_ENABLED(X86::AND32rm); |
5473 | break; |
5474 | case ISD::OR: |
5475 | ROpc = GET_ND_IF_ENABLED(X86::OR32rr); |
5476 | MOpc = GET_ND_IF_ENABLED(X86::OR32rm); |
5477 | break; |
5478 | case ISD::XOR: |
5479 | ROpc = GET_ND_IF_ENABLED(X86::XOR32rr); |
5480 | MOpc = GET_ND_IF_ENABLED(X86::XOR32rm); |
5481 | break; |
5482 | } |
5483 | break; |
5484 | case MVT::i64: |
5485 | switch (Opcode) { |
5486 | default: llvm_unreachable("Unexpected opcode!" ); |
5487 | case ISD::ADD: |
5488 | ROpc = GET_ND_IF_ENABLED(X86::ADD64rr); |
5489 | MOpc = GET_ND_IF_ENABLED(X86::ADD64rm); |
5490 | break; |
5491 | case ISD::SUB: |
5492 | ROpc = GET_ND_IF_ENABLED(X86::SUB64rr); |
5493 | MOpc = GET_ND_IF_ENABLED(X86::SUB64rm); |
5494 | break; |
5495 | case ISD::AND: |
5496 | ROpc = GET_ND_IF_ENABLED(X86::AND64rr); |
5497 | MOpc = GET_ND_IF_ENABLED(X86::AND64rm); |
5498 | break; |
5499 | case ISD::OR: |
5500 | ROpc = GET_ND_IF_ENABLED(X86::OR64rr); |
5501 | MOpc = GET_ND_IF_ENABLED(X86::OR64rm); |
5502 | break; |
5503 | case ISD::XOR: |
5504 | ROpc = GET_ND_IF_ENABLED(X86::XOR64rr); |
5505 | MOpc = GET_ND_IF_ENABLED(X86::XOR64rm); |
5506 | break; |
5507 | } |
5508 | break; |
5509 | } |
5510 | |
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5512 | |
// If this is not a subtract, we can still try to fold a load.
5514 | if (Opcode != ISD::SUB) { |
5515 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5516 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5517 | SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5518 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5519 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5520 | // Update the chain. |
5521 | ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2)); |
5522 | // Record the mem-refs |
5523 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5524 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5525 | CurDAG->RemoveDeadNode(N: Node); |
5526 | return; |
5527 | } |
5528 | } |
5529 | |
5530 | CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1); |
5531 | return; |
5532 | } |
5533 | |
5534 | case X86ISD::SMUL: |
5535 | // i16/i32/i64 are handled with isel patterns. |
5536 | if (NVT != MVT::i8) |
5537 | break; |
5538 | [[fallthrough]]; |
5539 | case X86ISD::UMUL: { |
5540 | SDValue N0 = Node->getOperand(Num: 0); |
5541 | SDValue N1 = Node->getOperand(Num: 1); |
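// MUL/IMUL take a single explicit operand; the other factor is implicit in
// AL/AX/EAX/RAX, so N0 is copied into that register below and the copy is
// glued to the multiply node.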
5542 | |
5543 | unsigned LoReg, ROpc, MOpc; |
5544 | switch (NVT.SimpleTy) { |
5545 | default: llvm_unreachable("Unsupported VT!" ); |
5546 | case MVT::i8: |
5547 | LoReg = X86::AL; |
5548 | ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; |
5549 | MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; |
5550 | break; |
5551 | case MVT::i16: |
5552 | LoReg = X86::AX; |
5553 | ROpc = X86::MUL16r; |
5554 | MOpc = X86::MUL16m; |
5555 | break; |
5556 | case MVT::i32: |
5557 | LoReg = X86::EAX; |
5558 | ROpc = X86::MUL32r; |
5559 | MOpc = X86::MUL32m; |
5560 | break; |
5561 | case MVT::i64: |
5562 | LoReg = X86::RAX; |
5563 | ROpc = X86::MUL64r; |
5564 | MOpc = X86::MUL64m; |
5565 | break; |
5566 | } |
5567 | |
5568 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5569 | bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5570 | // Multiply is commutative. |
5571 | if (!FoldedLoad) { |
5572 | FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5573 | if (FoldedLoad) |
5574 | std::swap(a&: N0, b&: N1); |
5575 | } |
5576 | |
5577 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5578 | N: N0, Glue: SDValue()).getValue(R: 1); |
5579 | |
5580 | MachineSDNode *CNode; |
5581 | if (FoldedLoad) { |
5582 | // i16/i32/i64 use an instruction that produces a low and high result even |
5583 | // though only the low result is used. |
5584 | SDVTList VTs; |
5585 | if (NVT == MVT::i8) |
5586 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5587 | else |
5588 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other); |
5589 | |
5590 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5591 | InGlue }; |
5592 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5593 | |
5594 | // Update the chain. |
5595 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); |
5596 | // Record the mem-refs |
5597 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5598 | } else { |
5599 | // i16/i32/i64 use an instruction that produces a low and high result even |
5600 | // though only the low result is used. |
5601 | SDVTList VTs; |
5602 | if (NVT == MVT::i8) |
5603 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32); |
5604 | else |
5605 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32); |
5606 | |
5607 | CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue}); |
5608 | } |
5609 | |
5610 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5611 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); |
5612 | CurDAG->RemoveDeadNode(N: Node); |
5613 | return; |
5614 | } |
5615 | |
5616 | case ISD::SMUL_LOHI: |
5617 | case ISD::UMUL_LOHI: { |
5618 | SDValue N0 = Node->getOperand(Num: 0); |
5619 | SDValue N1 = Node->getOperand(Num: 1); |
5620 | |
5621 | unsigned Opc, MOpc; |
5622 | unsigned LoReg, HiReg; |
5623 | bool IsSigned = Opcode == ISD::SMUL_LOHI; |
5624 | bool UseMULX = !IsSigned && Subtarget->hasBMI2(); |
5625 | bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); |
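// With BMI2 we can use MULX, which takes one source implicitly in EDX/RDX
// and writes both halves of the product to explicit destinations without
// clobbering EFLAGS. The Hrr/Hrm variants are selected when only the high
// half of the result is live.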
5626 | switch (NVT.SimpleTy) { |
5627 | default: llvm_unreachable("Unsupported VT!" ); |
5628 | case MVT::i32: |
5629 | Opc = UseMULXHi ? X86::MULX32Hrr |
5630 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr) |
5631 | : IsSigned ? X86::IMUL32r |
5632 | : X86::MUL32r; |
5633 | MOpc = UseMULXHi ? X86::MULX32Hrm |
5634 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm) |
5635 | : IsSigned ? X86::IMUL32m |
5636 | : X86::MUL32m; |
5637 | LoReg = UseMULX ? X86::EDX : X86::EAX; |
5638 | HiReg = X86::EDX; |
5639 | break; |
5640 | case MVT::i64: |
5641 | Opc = UseMULXHi ? X86::MULX64Hrr |
5642 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr) |
5643 | : IsSigned ? X86::IMUL64r |
5644 | : X86::MUL64r; |
5645 | MOpc = UseMULXHi ? X86::MULX64Hrm |
5646 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm) |
5647 | : IsSigned ? X86::IMUL64m |
5648 | : X86::MUL64m; |
5649 | LoReg = UseMULX ? X86::RDX : X86::RAX; |
5650 | HiReg = X86::RDX; |
5651 | break; |
5652 | } |
5653 | |
5654 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5655 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5656 | // Multiply is commutative. |
5657 | if (!foldedLoad) { |
5658 | foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5659 | if (foldedLoad) |
5660 | std::swap(a&: N0, b&: N1); |
5661 | } |
5662 | |
5663 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5664 | N: N0, Glue: SDValue()).getValue(R: 1); |
5665 | SDValue ResHi, ResLo; |
5666 | if (foldedLoad) { |
5667 | SDValue Chain; |
5668 | MachineSDNode *CNode = nullptr; |
5669 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5670 | InGlue }; |
5671 | if (UseMULXHi) { |
5672 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
5673 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5674 | ResHi = SDValue(CNode, 0); |
5675 | Chain = SDValue(CNode, 1); |
5676 | } else if (UseMULX) { |
5677 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other); |
5678 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5679 | ResHi = SDValue(CNode, 0); |
5680 | ResLo = SDValue(CNode, 1); |
5681 | Chain = SDValue(CNode, 2); |
5682 | } else { |
5683 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
5684 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5685 | Chain = SDValue(CNode, 0); |
5686 | InGlue = SDValue(CNode, 1); |
5687 | } |
5688 | |
5689 | // Update the chain. |
5690 | ReplaceUses(F: N1.getValue(R: 1), T: Chain); |
5691 | // Record the mem-refs |
5692 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5693 | } else { |
5694 | SDValue Ops[] = { N1, InGlue }; |
5695 | if (UseMULXHi) { |
5696 | SDVTList VTs = CurDAG->getVTList(VT: NVT); |
5697 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5698 | ResHi = SDValue(CNode, 0); |
5699 | } else if (UseMULX) { |
5700 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT); |
5701 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5702 | ResHi = SDValue(CNode, 0); |
5703 | ResLo = SDValue(CNode, 1); |
5704 | } else { |
5705 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue); |
5706 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5707 | InGlue = SDValue(CNode, 0); |
5708 | } |
5709 | } |
5710 | |
5711 | // Copy the low half of the result, if it is needed. |
5712 | if (!SDValue(Node, 0).use_empty()) { |
5713 | if (!ResLo) { |
5714 | assert(LoReg && "Register for low half is not defined!" ); |
5715 | ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5716 | VT: NVT, Glue: InGlue); |
5717 | InGlue = ResLo.getValue(R: 2); |
5718 | } |
5719 | ReplaceUses(F: SDValue(Node, 0), T: ResLo); |
5720 | LLVM_DEBUG(dbgs() << "=> " ; ResLo.getNode()->dump(CurDAG); |
5721 | dbgs() << '\n'); |
5722 | } |
5723 | // Copy the high half of the result, if it is needed. |
5724 | if (!SDValue(Node, 1).use_empty()) { |
5725 | if (!ResHi) { |
5726 | assert(HiReg && "Register for high half is not defined!" ); |
5727 | ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg, |
5728 | VT: NVT, Glue: InGlue); |
5729 | InGlue = ResHi.getValue(R: 2); |
5730 | } |
5731 | ReplaceUses(F: SDValue(Node, 1), T: ResHi); |
5732 | LLVM_DEBUG(dbgs() << "=> " ; ResHi.getNode()->dump(CurDAG); |
5733 | dbgs() << '\n'); |
5734 | } |
5735 | |
5736 | CurDAG->RemoveDeadNode(N: Node); |
5737 | return; |
5738 | } |
5739 | |
5740 | case ISD::SDIVREM: |
5741 | case ISD::UDIVREM: { |
5742 | SDValue N0 = Node->getOperand(Num: 0); |
5743 | SDValue N1 = Node->getOperand(Num: 1); |
5744 | |
5745 | unsigned ROpc, MOpc; |
5746 | bool isSigned = Opcode == ISD::SDIVREM; |
5747 | if (!isSigned) { |
5748 | switch (NVT.SimpleTy) { |
5749 | default: llvm_unreachable("Unsupported VT!" ); |
5750 | case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; |
5751 | case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; |
5752 | case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; |
5753 | case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; |
5754 | } |
5755 | } else { |
5756 | switch (NVT.SimpleTy) { |
5757 | default: llvm_unreachable("Unsupported VT!" ); |
5758 | case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; |
5759 | case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; |
5760 | case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; |
5761 | case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; |
5762 | } |
5763 | } |
5764 | |
5765 | unsigned LoReg, HiReg, ClrReg; |
5766 | unsigned SExtOpcode; |
5767 | switch (NVT.SimpleTy) { |
5768 | default: llvm_unreachable("Unsupported VT!" ); |
5769 | case MVT::i8: |
5770 | LoReg = X86::AL; ClrReg = HiReg = X86::AH; |
5771 | SExtOpcode = 0; // Not used. |
5772 | break; |
5773 | case MVT::i16: |
5774 | LoReg = X86::AX; HiReg = X86::DX; |
5775 | ClrReg = X86::DX; |
5776 | SExtOpcode = X86::CWD; |
5777 | break; |
5778 | case MVT::i32: |
5779 | LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; |
5780 | SExtOpcode = X86::CDQ; |
5781 | break; |
5782 | case MVT::i64: |
5783 | LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; |
5784 | SExtOpcode = X86::CQO; |
5785 | break; |
5786 | } |
5787 | |
5788 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5789 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5790 | bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0); |
5791 | |
5792 | SDValue InGlue; |
5793 | if (NVT == MVT::i8) { |
// Special case for div8: widen the dividend into AX with a zero extension
// (or a sign extension for signed divides) so the upper 8 bits (AH) are set
// up correctly.
5796 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; |
5797 | MachineSDNode *Move; |
5798 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5799 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5800 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 |
5801 | : X86::MOVZX16rm8; |
5802 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops); |
5803 | Chain = SDValue(Move, 1); |
5804 | ReplaceUses(F: N0.getValue(R: 1), T: Chain); |
5805 | // Record the mem-refs |
5806 | CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5807 | } else { |
5808 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 |
5809 | : X86::MOVZX16rr8; |
5810 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0); |
5811 | Chain = CurDAG->getEntryNode(); |
5812 | } |
5813 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0), |
5814 | Glue: SDValue()); |
5815 | InGlue = Chain.getValue(R: 1); |
5816 | } else { |
5817 | InGlue = |
5818 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, |
5819 | Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1); |
5820 | if (isSigned && !signBitIsZero) { |
5821 | // Sign extend the low part into the high part. |
5822 | InGlue = |
5823 | SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0); |
5824 | } else { |
5825 | // Zero out the high part, effectively zero extending the input. |
5826 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
5827 | SDValue ClrNode = SDValue( |
5828 | CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: std::nullopt), 0); |
5829 | switch (NVT.SimpleTy) { |
5830 | case MVT::i16: |
5831 | ClrNode = |
5832 | SDValue(CurDAG->getMachineNode( |
5833 | Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode, |
5834 | Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl, |
5835 | VT: MVT::i32)), |
5836 | 0); |
5837 | break; |
5838 | case MVT::i32: |
5839 | break; |
5840 | case MVT::i64: |
5841 | ClrNode = |
5842 | SDValue(CurDAG->getMachineNode( |
5843 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
5844 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode, |
5845 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, |
5846 | VT: MVT::i32)), |
5847 | 0); |
5848 | break; |
5849 | default: |
5850 | llvm_unreachable("Unexpected division source" ); |
5851 | } |
5852 | |
5853 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg, |
5854 | N: ClrNode, Glue: InGlue).getValue(R: 1); |
5855 | } |
5856 | } |
5857 | |
5858 | if (foldedLoad) { |
5859 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5860 | InGlue }; |
5861 | MachineSDNode *CNode = |
5862 | CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops); |
5863 | InGlue = SDValue(CNode, 1); |
5864 | // Update the chain. |
5865 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0)); |
5866 | // Record the mem-refs |
5867 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5868 | } else { |
5869 | InGlue = |
5870 | SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0); |
5871 | } |
5872 | |
5873 | // Prevent use of AH in a REX instruction by explicitly copying it to |
5874 | // an ABCD_L register. |
5875 | // |
5876 | // The current assumption of the register allocator is that isel |
5877 | // won't generate explicit references to the GR8_ABCD_H registers. If |
5878 | // the allocator and/or the backend get enhanced to be more robust in |
5879 | // that regard, this can be, and should be, removed. |
5880 | if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { |
5881 | SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8); |
5882 | unsigned AHExtOpcode = |
5883 | isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; |
5884 | |
5885 | SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32, |
5886 | VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue); |
5887 | SDValue Result(RNode, 0); |
5888 | InGlue = SDValue(RNode, 1); |
5889 | |
5890 | Result = |
5891 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result); |
5892 | |
5893 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
5894 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
5895 | dbgs() << '\n'); |
5896 | } |
5897 | // Copy the division (low) result, if it is needed. |
5898 | if (!SDValue(Node, 0).use_empty()) { |
5899 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
5900 | Reg: LoReg, VT: NVT, Glue: InGlue); |
5901 | InGlue = Result.getValue(R: 2); |
5902 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
5903 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
5904 | dbgs() << '\n'); |
5905 | } |
5906 | // Copy the remainder (high) result, if it is needed. |
5907 | if (!SDValue(Node, 1).use_empty()) { |
5908 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
5909 | Reg: HiReg, VT: NVT, Glue: InGlue); |
5910 | InGlue = Result.getValue(R: 2); |
5911 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
5912 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
5913 | dbgs() << '\n'); |
5914 | } |
5915 | CurDAG->RemoveDeadNode(N: Node); |
5916 | return; |
5917 | } |
5918 | |
5919 | case X86ISD::FCMP: |
5920 | case X86ISD::STRICT_FCMP: |
5921 | case X86ISD::STRICT_FCMPS: { |
5922 | bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || |
5923 | Node->getOpcode() == X86ISD::STRICT_FCMPS; |
5924 | SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0); |
5925 | SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1); |
5926 | |
5927 | // Save the original VT of the compare. |
5928 | MVT CmpVT = N0.getSimpleValueType(); |
5929 | |
5930 | // Floating point needs special handling if we don't have FCOMI. |
5931 | if (Subtarget->canUseCMOV()) |
5932 | break; |
5933 | |
5934 | bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; |
5935 | |
5936 | unsigned Opc; |
5937 | switch (CmpVT.SimpleTy) { |
5938 | default: llvm_unreachable("Unexpected type!" ); |
5939 | case MVT::f32: |
5940 | Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; |
5941 | break; |
5942 | case MVT::f64: |
5943 | Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; |
5944 | break; |
5945 | case MVT::f80: |
5946 | Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; |
5947 | break; |
5948 | } |
5949 | |
5950 | SDValue Chain = |
5951 | IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode(); |
5952 | SDValue Glue; |
5953 | if (IsStrictCmp) { |
5954 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
5955 | Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0); |
5956 | Glue = Chain.getValue(R: 1); |
5957 | } else { |
5958 | Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0); |
5959 | } |
5960 | |
5961 | // Move FPSW to AX. |
5962 | SDValue FNSTSW = |
5963 | SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0); |
5964 | |
5965 | // Extract upper 8-bits of AX. |
SDValue Extract =
5967 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW); |
5968 | |
5969 | // Move AH into flags. |
5970 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
5971 | assert(Subtarget->canUseLAHFSAHF() && |
5972 | "Target doesn't support SAHF or FCOMI?" ); |
5973 | SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue()); |
5974 | Chain = AH; |
5975 | SDValue SAHF = SDValue( |
5976 | CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0); |
5977 | |
5978 | if (IsStrictCmp) |
5979 | ReplaceUses(F: SDValue(Node, 1), T: Chain); |
5980 | |
5981 | ReplaceUses(F: SDValue(Node, 0), T: SAHF); |
5982 | CurDAG->RemoveDeadNode(N: Node); |
5983 | return; |
5984 | } |
5985 | |
5986 | case X86ISD::CMP: { |
5987 | SDValue N0 = Node->getOperand(Num: 0); |
5988 | SDValue N1 = Node->getOperand(Num: 1); |
5989 | |
5990 | // Optimizations for TEST compares. |
5991 | if (!isNullConstant(V: N1)) |
5992 | break; |
5993 | |
5994 | // Save the original VT of the compare. |
5995 | MVT CmpVT = N0.getSimpleValueType(); |
5996 | |
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5998 | // by a test instruction. The test should be removed later by |
5999 | // analyzeCompare if we are using only the zero flag. |
6000 | // TODO: Should we check the users and use the BEXTR flags directly? |
6001 | if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { |
6002 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) { |
6003 | unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr |
6004 | : X86::TEST32rr; |
6005 | SDValue BEXTR = SDValue(NewNode, 0); |
6006 | NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR); |
6007 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6008 | CurDAG->RemoveDeadNode(N: Node); |
6009 | return; |
6010 | } |
6011 | } |
6012 | |
6013 | // We can peek through truncates, but we need to be careful below. |
6014 | if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) |
6015 | N0 = N0.getOperand(i: 0); |
6016 | |
6017 | // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to |
6018 | // use a smaller encoding. |
6019 | // Look past the truncate if CMP is the only use of it. |
6020 | if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && |
6021 | N0.getValueType() != MVT::i8) { |
6022 | auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
6023 | if (!MaskC) |
6024 | break; |
6025 | |
6026 | // We may have looked through a truncate so mask off any bits that |
6027 | // shouldn't be part of the compare. |
6028 | uint64_t Mask = MaskC->getZExtValue(); |
6029 | Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits()); |
6030 | |
6031 | // Check if we can replace AND+IMM{32,64} with a shift. This is possible |
6032 | // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the |
6033 | // zero flag. |
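// For example, a 64-bit mask such as 0x0000FFFF00000000 (which would
// otherwise need a movabsq) can instead be handled with 'shrq $32' followed
// by a 'testw' on the low 16 bits of the shifted value.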
6034 | if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) && |
6035 | onlyUsesZeroFlag(Flags: SDValue(Node, 0))) { |
6036 | unsigned ShiftOpcode = ISD::DELETED_NODE; |
6037 | unsigned ShiftAmt; |
6038 | unsigned SubRegIdx; |
6039 | MVT SubRegVT; |
6040 | unsigned TestOpcode; |
6041 | unsigned LeadingZeros = llvm::countl_zero(Val: Mask); |
6042 | unsigned TrailingZeros = llvm::countr_zero(Val: Mask); |
6043 | |
6044 | // With leading/trailing zeros, the transform is profitable if we can |
6045 | // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without |
6046 | // incurring any extra register moves. |
6047 | bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse(); |
6048 | if (LeadingZeros == 0 && SavesBytes) { |
6049 | // If the mask covers the most significant bit, then we can replace |
6050 | // TEST+AND with a SHR and check eflags. |
6051 | // This emits a redundant TEST which is subsequently eliminated. |
6052 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6053 | ShiftAmt = TrailingZeros; |
6054 | SubRegIdx = 0; |
6055 | TestOpcode = X86::TEST64rr; |
6056 | } else if (TrailingZeros == 0 && SavesBytes) { |
6057 | // If the mask covers the least significant bit, then we can replace |
6058 | // TEST+AND with a SHL and check eflags. |
6059 | // This emits a redundant TEST which is subsequently eliminated. |
6060 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri); |
6061 | ShiftAmt = LeadingZeros; |
6062 | SubRegIdx = 0; |
6063 | TestOpcode = X86::TEST64rr; |
6064 | } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) { |
6065 | // If the shifted mask extends into the high half and is 8/16/32 bits |
6066 | // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. |
6067 | unsigned PopCount = 64 - LeadingZeros - TrailingZeros; |
6068 | if (PopCount == 8) { |
6069 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6070 | ShiftAmt = TrailingZeros; |
6071 | SubRegIdx = X86::sub_8bit; |
6072 | SubRegVT = MVT::i8; |
6073 | TestOpcode = X86::TEST8rr; |
6074 | } else if (PopCount == 16) { |
6075 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6076 | ShiftAmt = TrailingZeros; |
6077 | SubRegIdx = X86::sub_16bit; |
6078 | SubRegVT = MVT::i16; |
6079 | TestOpcode = X86::TEST16rr; |
6080 | } else if (PopCount == 32) { |
6081 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6082 | ShiftAmt = TrailingZeros; |
6083 | SubRegIdx = X86::sub_32bit; |
6084 | SubRegVT = MVT::i32; |
6085 | TestOpcode = X86::TEST32rr; |
6086 | } |
6087 | } |
6088 | if (ShiftOpcode != ISD::DELETED_NODE) { |
6089 | SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64); |
6090 | SDValue Shift = SDValue( |
6091 | CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32, |
6092 | Op1: N0.getOperand(i: 0), Op2: ShiftC), |
6093 | 0); |
6094 | if (SubRegIdx != 0) { |
6095 | Shift = |
6096 | CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift); |
6097 | } |
6098 | MachineSDNode *Test = |
6099 | CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift); |
6100 | ReplaceNode(F: Node, T: Test); |
6101 | return; |
6102 | } |
6103 | } |
6104 | |
6105 | MVT VT; |
6106 | int SubRegOp; |
6107 | unsigned ROpc, MOpc; |
6108 | |
6109 | // For each of these checks we need to be careful if the sign flag is |
6110 | // being used. It is only safe to use the sign flag in two conditions, |
6111 | // either the sign bit in the shrunken mask is zero or the final test |
6112 | // size is equal to the original compare size. |
6113 | |
6114 | if (isUInt<8>(x: Mask) && |
6115 | (!(Mask & 0x80) || CmpVT == MVT::i8 || |
6116 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6117 | // For example, convert "testl %eax, $8" to "testb %al, $8" |
6118 | VT = MVT::i8; |
6119 | SubRegOp = X86::sub_8bit; |
6120 | ROpc = X86::TEST8ri; |
6121 | MOpc = X86::TEST8mi; |
6122 | } else if (OptForMinSize && isUInt<16>(x: Mask) && |
6123 | (!(Mask & 0x8000) || CmpVT == MVT::i16 || |
6124 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6125 | // For example, "testl %eax, $32776" to "testw %ax, $32776". |
6126 | // NOTE: We only want to form TESTW instructions if optimizing for |
6127 | // min size. Otherwise we only save one byte and possibly get a length |
6128 | // changing prefix penalty in the decoders. |
6129 | VT = MVT::i16; |
6130 | SubRegOp = X86::sub_16bit; |
6131 | ROpc = X86::TEST16ri; |
6132 | MOpc = X86::TEST16mi; |
6133 | } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 && |
6134 | ((!(Mask & 0x80000000) && |
6135 | // Without minsize 16-bit Cmps can get here so we need to |
6136 | // be sure we calculate the correct sign flag if needed. |
6137 | (CmpVT != MVT::i16 || !(Mask & 0x8000))) || |
6138 | CmpVT == MVT::i32 || |
6139 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6140 | // For example, "testq %rax, $268468232" to "testl %eax, $268468232". |
6141 | // NOTE: We only want to run that transform if N0 is 32 or 64 bits. |
// Otherwise, we find ourselves in a position where we have to do
6143 | // promotion. If previous passes did not promote the and, we assume |
6144 | // they had a good reason not to and do not promote here. |
6145 | VT = MVT::i32; |
6146 | SubRegOp = X86::sub_32bit; |
6147 | ROpc = X86::TEST32ri; |
6148 | MOpc = X86::TEST32mi; |
6149 | } else { |
6150 | // No eligible transformation was found. |
6151 | break; |
6152 | } |
6153 | |
6154 | SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT); |
6155 | SDValue Reg = N0.getOperand(i: 0); |
6156 | |
6157 | // Emit a testl or testw. |
6158 | MachineSDNode *NewNode; |
6159 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
6160 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
6161 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) { |
6162 | if (!LoadN->isSimple()) { |
6163 | unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits(); |
6164 | if ((MOpc == X86::TEST8mi && NumVolBits != 8) || |
6165 | (MOpc == X86::TEST16mi && NumVolBits != 16) || |
6166 | (MOpc == X86::TEST32mi && NumVolBits != 32)) |
6167 | break; |
6168 | } |
6169 | } |
6170 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
6171 | Reg.getOperand(i: 0) }; |
6172 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops); |
6173 | // Update the chain. |
6174 | ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1)); |
6175 | // Record the mem-refs |
6176 | CurDAG->setNodeMemRefs(N: NewNode, |
6177 | NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()}); |
6178 | } else { |
6179 | // Extract the subregister if necessary. |
6180 | if (N0.getValueType() != VT) |
6181 | Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg); |
6182 | |
6183 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm); |
6184 | } |
6185 | // Replace CMP with TEST. |
6186 | ReplaceNode(F: Node, T: NewNode); |
6187 | return; |
6188 | } |
6189 | break; |
6190 | } |
6191 | case X86ISD::PCMPISTR: { |
6192 | if (!Subtarget->hasSSE42()) |
6193 | break; |
6194 | |
6195 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6196 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6197 | // We can't fold a load if we are going to make two instructions. |
6198 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6199 | |
6200 | MachineSDNode *CNode; |
6201 | if (NeedMask) { |
6202 | unsigned ROpc = |
6203 | Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri; |
6204 | unsigned MOpc = |
6205 | Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi; |
6206 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node); |
6207 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6208 | } |
6209 | if (NeedIndex || !NeedMask) { |
6210 | unsigned ROpc = |
6211 | Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri; |
6212 | unsigned MOpc = |
6213 | Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi; |
6214 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node); |
6215 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6216 | } |
6217 | |
6218 | // Connect the flag usage to the last instruction created. |
6219 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6220 | CurDAG->RemoveDeadNode(N: Node); |
6221 | return; |
6222 | } |
6223 | case X86ISD::PCMPESTR: { |
6224 | if (!Subtarget->hasSSE42()) |
6225 | break; |
6226 | |
6227 | // Copy the two implicit register inputs. |
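// PCMPESTRI/PCMPESTRM take their explicit string lengths in EAX (length of
// the first source) and EDX (length of the second).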
6228 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX, |
6229 | N: Node->getOperand(Num: 1), |
6230 | Glue: SDValue()).getValue(R: 1); |
6231 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX, |
6232 | N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1); |
6233 | |
6234 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6235 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6236 | // We can't fold a load if we are going to make two instructions. |
6237 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6238 | |
6239 | MachineSDNode *CNode; |
6240 | if (NeedMask) { |
6241 | unsigned ROpc = |
6242 | Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri; |
6243 | unsigned MOpc = |
6244 | Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi; |
6245 | CNode = |
6246 | emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue); |
6247 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6248 | } |
6249 | if (NeedIndex || !NeedMask) { |
6250 | unsigned ROpc = |
6251 | Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri; |
6252 | unsigned MOpc = |
6253 | Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi; |
6254 | CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue); |
6255 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6256 | } |
6257 | // Connect the flag usage to the last instruction created. |
6258 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6259 | CurDAG->RemoveDeadNode(N: Node); |
6260 | return; |
6261 | } |
6262 | |
6263 | case ISD::SETCC: { |
6264 | if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue())) |
6265 | return; |
6266 | |
6267 | break; |
6268 | } |
6269 | |
6270 | case ISD::STORE: |
6271 | if (foldLoadStoreIntoMemOperand(Node)) |
6272 | return; |
6273 | break; |
6274 | |
6275 | case X86ISD::SETCC_CARRY: { |
6276 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6277 | SDValue Result; |
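// SETB_C32r/SETB_C64r are pseudos that expand to 'sbb reg, reg', producing 0
// or all-ones from the carry flag. Without SBB dependency breaking we instead
// go through getSBBZero, which zeroes the input so the sbb doesn't pick up a
// false dependency on the register's previous value.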
6278 | if (Subtarget->hasSBBDepBreaking()) { |
6279 | // We have to do this manually because tblgen will put the eflags copy in |
6280 | // the wrong place if we use an extract_subreg in the pattern. |
6281 | // Copy flags to the EFLAGS register and glue it to next node. |
6282 | SDValue EFLAGS = |
6283 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
6284 | N: Node->getOperand(Num: 1), Glue: SDValue()); |
6285 | |
6286 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
6287 | // 32-bit version. |
6288 | unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; |
6289 | MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
6290 | Result = SDValue( |
6291 | CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)), |
6292 | 0); |
6293 | } else { |
6294 | // The target does not recognize sbb with the same reg operand as a |
6295 | // no-source idiom, so we explicitly zero the input values. |
6296 | Result = getSBBZero(N: Node); |
6297 | } |
6298 | |
6299 | // For less than 32-bits we need to extract from the 32-bit node. |
6300 | if (VT == MVT::i8 || VT == MVT::i16) { |
6301 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6302 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6303 | } |
6304 | |
6305 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6306 | CurDAG->RemoveDeadNode(N: Node); |
6307 | return; |
6308 | } |
6309 | case X86ISD::SBB: { |
6310 | if (isNullConstant(V: Node->getOperand(Num: 0)) && |
6311 | isNullConstant(V: Node->getOperand(Num: 1))) { |
6312 | SDValue Result = getSBBZero(N: Node); |
6313 | |
6314 | // Replace the flag use. |
6315 | ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1)); |
6316 | |
6317 | // Replace the result use. |
6318 | if (!SDValue(Node, 0).use_empty()) { |
6319 | // For less than 32-bits we need to extract from the 32-bit node. |
6320 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6321 | if (VT == MVT::i8 || VT == MVT::i16) { |
6322 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6323 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6324 | } |
6325 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6326 | } |
6327 | |
6328 | CurDAG->RemoveDeadNode(N: Node); |
6329 | return; |
6330 | } |
6331 | break; |
6332 | } |
6333 | case X86ISD::MGATHER: { |
6334 | auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node); |
6335 | SDValue IndexOp = Mgt->getIndex(); |
6336 | SDValue Mask = Mgt->getMask(); |
6337 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6338 | MVT ValueVT = Node->getSimpleValueType(ResNo: 0); |
6339 | MVT MaskVT = Mask.getSimpleValueType(); |
6340 | |
6341 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking in here, based on what a type
// constraint would say, just like table-based isel.
6344 | if (!ValueVT.isVector() || !MaskVT.isVector()) |
6345 | break; |
6346 | |
6347 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6348 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6349 | |
6350 | bool IsFP = ValueSVT.isFloatingPoint(); |
6351 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6352 | |
6353 | unsigned Opc = 0; |
6354 | bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; |
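// AVX-512 gathers take an i1 vector (k-register) mask, while the AVX/AVX2
// forms take a full-width vector mask as an ordinary operand; this also
// changes the operand order used when building the machine node below.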
6355 | if (AVX512Gather) { |
6356 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6357 | Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; |
6358 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6359 | Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; |
6360 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6361 | Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; |
6362 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6363 | Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; |
6364 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6365 | Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; |
6366 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6367 | Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; |
6368 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6369 | Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; |
6370 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6371 | Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; |
6372 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6373 | Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; |
6374 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6375 | Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; |
6376 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6377 | Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; |
6378 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6379 | Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; |
6380 | } else { |
6381 | assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && |
6382 | "Unexpected mask VT!" ); |
6383 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6384 | Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; |
6385 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6386 | Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; |
6387 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6388 | Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; |
6389 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6390 | Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; |
6391 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6392 | Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; |
6393 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6394 | Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; |
6395 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6396 | Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; |
6397 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6398 | Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; |
6399 | } |
6400 | |
6401 | if (!Opc) |
6402 | break; |
6403 | |
6404 | SDValue Base, Scale, Index, Disp, Segment; |
6405 | if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(), |
6406 | Base, Scale, Index, Disp, Segment)) |
6407 | break; |
6408 | |
6409 | SDValue PassThru = Mgt->getPassThru(); |
6410 | SDValue Chain = Mgt->getChain(); |
6411 | // Gather instructions have a mask output not in the ISD node. |
6412 | SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other); |
6413 | |
6414 | MachineSDNode *NewNode; |
6415 | if (AVX512Gather) { |
6416 | SDValue Ops[] = {PassThru, Mask, Base, Scale, |
6417 | Index, Disp, Segment, Chain}; |
6418 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6419 | } else { |
6420 | SDValue Ops[] = {PassThru, Base, Scale, Index, |
6421 | Disp, Segment, Mask, Chain}; |
6422 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6423 | } |
6424 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()}); |
6425 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6426 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2)); |
6427 | CurDAG->RemoveDeadNode(N: Node); |
6428 | return; |
6429 | } |
6430 | case X86ISD::MSCATTER: { |
6431 | auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node); |
6432 | SDValue Value = Sc->getValue(); |
6433 | SDValue IndexOp = Sc->getIndex(); |
6434 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6435 | MVT ValueVT = Value.getSimpleValueType(); |
6436 | |
6437 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking in here, based on what a type
// constraint would say, just like table-based isel.
6440 | if (!ValueVT.isVector()) |
6441 | break; |
6442 | |
6443 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6444 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6445 | |
6446 | bool IsFP = ValueSVT.isFloatingPoint(); |
6447 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6448 | |
6449 | unsigned Opc; |
6450 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6451 | Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; |
6452 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6453 | Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; |
6454 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6455 | Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; |
6456 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6457 | Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; |
6458 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6459 | Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; |
6460 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6461 | Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; |
6462 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6463 | Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; |
6464 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6465 | Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; |
6466 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6467 | Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; |
6468 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6469 | Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; |
6470 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6471 | Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; |
6472 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6473 | Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; |
6474 | else |
6475 | break; |
6476 | |
6477 | SDValue Base, Scale, Index, Disp, Segment; |
6478 | if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(), |
6479 | Base, Scale, Index, Disp, Segment)) |
6480 | break; |
6481 | |
6482 | SDValue Mask = Sc->getMask(); |
6483 | SDValue Chain = Sc->getChain(); |
6484 | // Scatter instructions have a mask output not in the ISD node. |
6485 | SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other); |
6486 | SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; |
6487 | |
6488 | MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6489 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()}); |
6490 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1)); |
6491 | CurDAG->RemoveDeadNode(N: Node); |
6492 | return; |
6493 | } |
6494 | case ISD::PREALLOCATED_SETUP: { |
6495 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6496 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6497 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6498 | SDValue Chain = Node->getOperand(Num: 0); |
6499 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6500 | MachineSDNode *New = CurDAG->getMachineNode( |
6501 | Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain); |
6502 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain |
6503 | CurDAG->RemoveDeadNode(N: Node); |
6504 | return; |
6505 | } |
6506 | case ISD::PREALLOCATED_ARG: { |
6507 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6508 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6509 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6510 | SDValue Chain = Node->getOperand(Num: 0); |
6511 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6512 | SDValue ArgIndex = Node->getOperand(Num: 2); |
6513 | SDValue Ops[3]; |
6514 | Ops[0] = CallIdValue; |
6515 | Ops[1] = ArgIndex; |
6516 | Ops[2] = Chain; |
6517 | MachineSDNode *New = CurDAG->getMachineNode( |
6518 | Opcode: TargetOpcode::PREALLOCATED_ARG, dl, |
6519 | VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()), |
6520 | VT2: MVT::Other), |
6521 | Ops); |
6522 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer |
6523 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain |
6524 | CurDAG->RemoveDeadNode(N: Node); |
6525 | return; |
6526 | } |
6527 | case X86ISD::AESENCWIDE128KL: |
6528 | case X86ISD::AESDECWIDE128KL: |
6529 | case X86ISD::AESENCWIDE256KL: |
6530 | case X86ISD::AESDECWIDE256KL: { |
6531 | if (!Subtarget->hasWIDEKL()) |
6532 | break; |
6533 | |
6534 | unsigned Opcode; |
6535 | switch (Node->getOpcode()) { |
6536 | default: |
6537 | llvm_unreachable("Unexpected opcode!" ); |
6538 | case X86ISD::AESENCWIDE128KL: |
6539 | Opcode = X86::AESENCWIDE128KL; |
6540 | break; |
6541 | case X86ISD::AESDECWIDE128KL: |
6542 | Opcode = X86::AESDECWIDE128KL; |
6543 | break; |
6544 | case X86ISD::AESENCWIDE256KL: |
6545 | Opcode = X86::AESENCWIDE256KL; |
6546 | break; |
6547 | case X86ISD::AESDECWIDE256KL: |
6548 | Opcode = X86::AESDECWIDE256KL; |
6549 | break; |
6550 | } |
6551 | |
6552 | SDValue Chain = Node->getOperand(Num: 0); |
6553 | SDValue Addr = Node->getOperand(Num: 1); |
6554 | |
6555 | SDValue Base, Scale, Index, Disp, Segment; |
6556 | if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment)) |
6557 | break; |
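// The wide Key Locker instructions implicitly operate on XMM0..XMM7, so the
// eight data operands are copied into those registers with glued CopyToReg
// nodes before the machine node is built.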
6558 | |
6559 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2), |
6560 | Glue: SDValue()); |
6561 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3), |
6562 | Glue: Chain.getValue(R: 1)); |
6563 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4), |
6564 | Glue: Chain.getValue(R: 1)); |
6565 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5), |
6566 | Glue: Chain.getValue(R: 1)); |
6567 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6), |
6568 | Glue: Chain.getValue(R: 1)); |
6569 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7), |
6570 | Glue: Chain.getValue(R: 1)); |
6571 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8), |
6572 | Glue: Chain.getValue(R: 1)); |
6573 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9), |
6574 | Glue: Chain.getValue(R: 1)); |
6575 | |
6576 | MachineSDNode *Res = CurDAG->getMachineNode( |
6577 | Opcode, dl, VTs: Node->getVTList(), |
6578 | Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)}); |
6579 | CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand()); |
6580 | ReplaceNode(F: Node, T: Res); |
6581 | return; |
6582 | } |
6583 | } |
6584 | |
6585 | SelectCode(N: Node); |
6586 | } |
6587 | |
6588 | bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand( |
6589 | const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, |
6590 | std::vector<SDValue> &OutOps) { |
6591 | SDValue Op0, Op1, Op2, Op3, Op4; |
6592 | switch (ConstraintID) { |
6593 | default: |
6594 | llvm_unreachable("Unexpected asm memory constraint" ); |
6595 | case InlineAsm::ConstraintCode::o: // offsetable ?? |
6596 | case InlineAsm::ConstraintCode::v: // not offsetable ?? |
6597 | case InlineAsm::ConstraintCode::m: // memory |
6598 | case InlineAsm::ConstraintCode::X: |
6599 | case InlineAsm::ConstraintCode::p: // address |
6600 | if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4)) |
6601 | return true; |
6602 | break; |
6603 | } |
6604 | |
6605 | OutOps.push_back(x: Op0); |
6606 | OutOps.push_back(x: Op1); |
6607 | OutOps.push_back(x: Op2); |
6608 | OutOps.push_back(x: Op3); |
6609 | OutOps.push_back(x: Op4); |
6610 | return false; |
6611 | } |
6612 | |
6613 | X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM) |
6614 | : SelectionDAGISelPass( |
6615 | std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {} |
6616 | |
6617 | /// This pass converts a legalized DAG into a X86-specific DAG, |
6618 | /// ready for instruction scheduling. |
6619 | FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, |
6620 | CodeGenOptLevel OptLevel) { |
6621 | return new X86DAGToDAGISelLegacy(TM, OptLevel); |
6622 | } |
6623 | |