1 | //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines a DAG pattern matching instruction selector for X86, |
10 | // converting from a legalized dag to a X86 dag. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "X86ISelDAGToDAG.h" |
15 | #include "X86.h" |
16 | #include "X86MachineFunctionInfo.h" |
17 | #include "X86RegisterInfo.h" |
18 | #include "X86Subtarget.h" |
19 | #include "X86TargetMachine.h" |
20 | #include "llvm/ADT/Statistic.h" |
21 | #include "llvm/CodeGen/MachineModuleInfo.h" |
22 | #include "llvm/CodeGen/SelectionDAGISel.h" |
23 | #include "llvm/Config/llvm-config.h" |
24 | #include "llvm/IR/ConstantRange.h" |
25 | #include "llvm/IR/Function.h" |
26 | #include "llvm/IR/Instructions.h" |
27 | #include "llvm/IR/Intrinsics.h" |
28 | #include "llvm/IR/IntrinsicsX86.h" |
29 | #include "llvm/IR/Module.h" |
30 | #include "llvm/IR/Type.h" |
31 | #include "llvm/Support/Debug.h" |
32 | #include "llvm/Support/ErrorHandling.h" |
33 | #include "llvm/Support/KnownBits.h" |
34 | #include "llvm/Support/MathExtras.h" |
35 | #include <cstdint> |
36 | |
37 | using namespace llvm; |
38 | |
39 | #define DEBUG_TYPE "x86-isel" |
40 | #define PASS_NAME "X86 DAG->DAG Instruction Selection" |
41 | |
42 | STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor" ); |
43 | |
44 | static cl::opt<bool> AndImmShrink("x86-and-imm-shrink" , cl::init(Val: true), |
45 | cl::desc("Enable setting constant bits to reduce size of mask immediates" ), |
46 | cl::Hidden); |
47 | |
48 | static cl::opt<bool> EnablePromoteAnyextLoad( |
49 | "x86-promote-anyext-load" , cl::init(Val: true), |
50 | cl::desc("Enable promoting aligned anyext load to wider load" ), cl::Hidden); |
51 | |
52 | extern cl::opt<bool> IndirectBranchTracking; |
53 | |
54 | //===----------------------------------------------------------------------===// |
55 | // Pattern Matcher Implementation |
56 | //===----------------------------------------------------------------------===// |
57 | |
58 | namespace { |
59 | /// This corresponds to X86AddressMode, but uses SDValue's instead of register |
60 | /// numbers for the leaves of the matched tree. |
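/// The address being matched has the form
///   Segment : [Base_Reg|Base_FrameIndex + Scale * IndexReg + Disp],
/// where Disp may also carry a symbolic piece (GV, CP, ES, MCSym, JT or
/// BlockAddr) qualified by SymbolFlags.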
61 | struct X86ISelAddressMode { |
62 | enum { |
63 | RegBase, |
64 | FrameIndexBase |
65 | } BaseType = RegBase; |
66 | |
67 | // This is really a union, discriminated by BaseType! |
68 | SDValue Base_Reg; |
69 | int Base_FrameIndex = 0; |
70 | |
71 | unsigned Scale = 1; |
72 | SDValue IndexReg; |
73 | int32_t Disp = 0; |
74 | SDValue Segment; |
75 | const GlobalValue *GV = nullptr; |
76 | const Constant *CP = nullptr; |
77 | const BlockAddress *BlockAddr = nullptr; |
78 | const char *ES = nullptr; |
79 | MCSymbol *MCSym = nullptr; |
80 | int JT = -1; |
81 | Align Alignment; // CP alignment. |
82 | unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* |
83 | bool NegateIndex = false; |
84 | |
85 | X86ISelAddressMode() = default; |
86 | |
87 | bool hasSymbolicDisplacement() const { |
88 | return GV != nullptr || CP != nullptr || ES != nullptr || |
89 | MCSym != nullptr || JT != -1 || BlockAddr != nullptr; |
90 | } |
91 | |
92 | bool hasBaseOrIndexReg() const { |
93 | return BaseType == FrameIndexBase || |
94 | IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; |
95 | } |
96 | |
97 | /// Return true if this addressing mode is already RIP-relative. |
98 | bool isRIPRelative() const { |
99 | if (BaseType != RegBase) return false; |
100 | if (RegisterSDNode *RegNode = |
101 | dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode())) |
102 | return RegNode->getReg() == X86::RIP; |
103 | return false; |
104 | } |
105 | |
106 | void setBaseReg(SDValue Reg) { |
107 | BaseType = RegBase; |
108 | Base_Reg = Reg; |
109 | } |
110 | |
111 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
112 | void dump(SelectionDAG *DAG = nullptr) { |
113 | dbgs() << "X86ISelAddressMode " << this << '\n'; |
114 | dbgs() << "Base_Reg " ; |
115 | if (Base_Reg.getNode()) |
116 | Base_Reg.getNode()->dump(DAG); |
117 | else |
118 | dbgs() << "nul\n" ; |
119 | if (BaseType == FrameIndexBase) |
120 | dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; |
121 | dbgs() << " Scale " << Scale << '\n' |
122 | << "IndexReg " ; |
123 | if (NegateIndex) |
124 | dbgs() << "negate " ; |
125 | if (IndexReg.getNode()) |
126 | IndexReg.getNode()->dump(DAG); |
127 | else |
128 | dbgs() << "nul\n" ; |
129 | dbgs() << " Disp " << Disp << '\n' |
130 | << "GV " ; |
131 | if (GV) |
132 | GV->dump(); |
133 | else |
134 | dbgs() << "nul" ; |
135 | dbgs() << " CP " ; |
136 | if (CP) |
137 | CP->dump(); |
138 | else |
139 | dbgs() << "nul" ; |
140 | dbgs() << '\n' |
141 | << "ES " ; |
142 | if (ES) |
143 | dbgs() << ES; |
144 | else |
145 | dbgs() << "nul" ; |
146 | dbgs() << " MCSym " ; |
147 | if (MCSym) |
148 | dbgs() << MCSym; |
149 | else |
150 | dbgs() << "nul" ; |
151 | dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; |
152 | } |
153 | #endif |
154 | }; |
155 | } |
156 | |
157 | namespace { |
158 | //===--------------------------------------------------------------------===// |
159 | /// ISel - X86-specific code to select X86 machine instructions for |
160 | /// SelectionDAG operations. |
161 | /// |
162 | class X86DAGToDAGISel final : public SelectionDAGISel { |
163 | /// Keep a pointer to the X86Subtarget around so that we can |
164 | /// make the right decision when generating code for different targets. |
165 | const X86Subtarget *Subtarget; |
166 | |
167 | /// If true, selector should try to optimize for minimum code size. |
168 | bool OptForMinSize; |
169 | |
170 | /// Disable direct TLS access through segment registers. |
171 | bool IndirectTlsSegRefs; |
172 | |
173 | public: |
174 | X86DAGToDAGISel() = delete; |
175 | |
176 | explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel) |
177 | : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), |
178 | OptForMinSize(false), IndirectTlsSegRefs(false) {} |
179 | |
180 | bool runOnMachineFunction(MachineFunction &MF) override { |
181 | // Reset the subtarget each time through. |
182 | Subtarget = &MF.getSubtarget<X86Subtarget>(); |
183 | IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( |
184 | Kind: "indirect-tls-seg-refs" ); |
185 | |
186 | // OptFor[Min]Size are used in pattern predicates that isel is matching. |
187 | OptForMinSize = MF.getFunction().hasMinSize(); |
188 | assert((!OptForMinSize || MF.getFunction().hasOptSize()) && |
189 | "OptForMinSize implies OptForSize" ); |
190 | return SelectionDAGISel::runOnMachineFunction(mf&: MF); |
191 | } |
192 | |
193 | void emitFunctionEntryCode() override; |
194 | |
195 | bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; |
196 | |
197 | void PreprocessISelDAG() override; |
198 | void PostprocessISelDAG() override; |
199 | |
200 | // Include the pieces autogenerated from the target description. |
201 | #include "X86GenDAGISel.inc" |
202 | |
203 | private: |
204 | void Select(SDNode *N) override; |
205 | |
206 | bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); |
207 | bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
208 | bool AllowSegmentRegForX32 = false); |
209 | bool matchWrapper(SDValue N, X86ISelAddressMode &AM); |
210 | bool matchAddress(SDValue N, X86ISelAddressMode &AM); |
211 | bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); |
212 | bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); |
213 | SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM, |
214 | unsigned Depth); |
215 | bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
216 | unsigned Depth); |
217 | bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
218 | unsigned Depth); |
219 | bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); |
220 | bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
221 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
222 | SDValue &Segment); |
223 | bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, |
224 | SDValue ScaleOp, SDValue &Base, SDValue &Scale, |
225 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
226 | bool selectMOV64Imm32(SDValue N, SDValue &Imm); |
227 | bool selectLEAAddr(SDValue N, SDValue &Base, |
228 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
229 | SDValue &Segment); |
230 | bool selectLEA64_32Addr(SDValue N, SDValue &Base, |
231 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
232 | SDValue &Segment); |
233 | bool selectTLSADDRAddr(SDValue N, SDValue &Base, |
234 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
235 | SDValue &Segment); |
236 | bool selectRelocImm(SDValue N, SDValue &Op); |
237 | |
238 | bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
239 | SDValue &Base, SDValue &Scale, |
240 | SDValue &Index, SDValue &Disp, |
241 | SDValue &Segment); |
242 | |
243 | // Convenience method where P is also root. |
244 | bool tryFoldLoad(SDNode *P, SDValue N, |
245 | SDValue &Base, SDValue &Scale, |
246 | SDValue &Index, SDValue &Disp, |
247 | SDValue &Segment) { |
248 | return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment); |
249 | } |
250 | |
251 | bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
252 | SDValue &Base, SDValue &Scale, |
253 | SDValue &Index, SDValue &Disp, |
254 | SDValue &Segment); |
255 | |
256 | bool isProfitableToFormMaskedOp(SDNode *N) const; |
257 | |
258 | /// Implement addressing mode selection for inline asm expressions. |
259 | bool SelectInlineAsmMemoryOperand(const SDValue &Op, |
260 | InlineAsm::ConstraintCode ConstraintID, |
261 | std::vector<SDValue> &OutOps) override; |
262 | |
263 | void emitSpecialCodeForMain(); |
264 | |
265 | inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, |
266 | MVT VT, SDValue &Base, SDValue &Scale, |
267 | SDValue &Index, SDValue &Disp, |
268 | SDValue &Segment) { |
269 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
270 | Base = CurDAG->getTargetFrameIndex( |
271 | FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout())); |
272 | else if (AM.Base_Reg.getNode()) |
273 | Base = AM.Base_Reg; |
274 | else |
275 | Base = CurDAG->getRegister(Reg: 0, VT); |
276 | |
277 | Scale = getI8Imm(Imm: AM.Scale, DL); |
278 | |
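// Pick the APX new-data-destination (NDD) form of an opcode when the
// subtarget supports it.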
279 | #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC) |
280 | // Negate the index if needed. |
281 | if (AM.NegateIndex) { |
282 | unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r) |
283 | : GET_ND_IF_ENABLED(X86::NEG32r); |
284 | SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32, |
285 | Ops: AM.IndexReg), 0); |
286 | AM.IndexReg = Neg; |
287 | } |
288 | |
289 | if (AM.IndexReg.getNode()) |
290 | Index = AM.IndexReg; |
291 | else |
292 | Index = CurDAG->getRegister(Reg: 0, VT); |
293 | |
294 | // These are 32-bit even in 64-bit mode since RIP-relative offset |
295 | // is 32-bit. |
296 | if (AM.GV) |
297 | Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(), |
298 | VT: MVT::i32, offset: AM.Disp, |
299 | TargetFlags: AM.SymbolFlags); |
300 | else if (AM.CP) |
301 | Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment, |
302 | Offset: AM.Disp, TargetFlags: AM.SymbolFlags); |
303 | else if (AM.ES) { |
304 | assert(!AM.Disp && "Non-zero displacement is ignored with ES." ); |
305 | Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
306 | } else if (AM.MCSym) { |
307 | assert(!AM.Disp && "Non-zero displacement is ignored with MCSym." ); |
308 | assert(AM.SymbolFlags == 0 && "oo" ); |
309 | Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32); |
310 | } else if (AM.JT != -1) { |
311 | assert(!AM.Disp && "Non-zero displacement is ignored with JT." ); |
312 | Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
313 | } else if (AM.BlockAddr) |
314 | Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp, |
315 | TargetFlags: AM.SymbolFlags); |
316 | else |
317 | Disp = CurDAG->getTargetConstant(Val: AM.Disp, DL, VT: MVT::i32); |
318 | |
319 | if (AM.Segment.getNode()) |
320 | Segment = AM.Segment; |
321 | else |
322 | Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
323 | } |
324 | |
325 | // Utility function to determine whether we should avoid selecting |
326 | // immediate forms of instructions for better code size or not. |
327 | // At a high level, we'd like to avoid such instructions when |
328 | // we have similar constants used within the same basic block |
329 | // that can be kept in a register. |
330 | // |
331 | bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { |
332 | uint32_t UseCount = 0; |
333 | |
334 | // Do not want to hoist if we're not optimizing for size. |
335 | // TODO: We'd like to remove this restriction. |
336 | // See the comment in X86InstrInfo.td for more info. |
337 | if (!CurDAG->shouldOptForSize()) |
338 | return false; |
339 | |
340 | // Walk all the users of the immediate. |
341 | for (const SDNode *User : N->uses()) { |
342 | if (UseCount >= 2) |
343 | break; |
344 | |
345 | // This user is already selected. Count it as a legitimate use and |
346 | // move on. |
347 | if (User->isMachineOpcode()) { |
348 | UseCount++; |
349 | continue; |
350 | } |
351 | |
352 | // We want to count stores of immediates as real uses. |
353 | if (User->getOpcode() == ISD::STORE && |
354 | User->getOperand(Num: 1).getNode() == N) { |
355 | UseCount++; |
356 | continue; |
357 | } |
358 | |
359 | // We don't currently match users that have > 2 operands (except |
360 | // for stores, which are handled above) |
// Those instructions won't match in ISel for now and would be counted
// incorrectly.
363 | // This may change in the future as we add additional instruction |
364 | // types. |
365 | if (User->getNumOperands() != 2) |
366 | continue; |
367 | |
368 | // If this is a sign-extended 8-bit integer immediate used in an ALU |
369 | // instruction, there is probably an opcode encoding to save space. |
370 | auto *C = dyn_cast<ConstantSDNode>(Val: N); |
371 | if (C && isInt<8>(x: C->getSExtValue())) |
372 | continue; |
373 | |
374 | // Immediates that are used for offsets as part of stack |
375 | // manipulation should be left alone. These are typically |
376 | // used to indicate SP offsets for argument passing and |
377 | // will get pulled into stores/pushes (implicitly). |
378 | if (User->getOpcode() == X86ISD::ADD || |
379 | User->getOpcode() == ISD::ADD || |
380 | User->getOpcode() == X86ISD::SUB || |
381 | User->getOpcode() == ISD::SUB) { |
382 | |
383 | // Find the other operand of the add/sub. |
384 | SDValue OtherOp = User->getOperand(Num: 0); |
385 | if (OtherOp.getNode() == N) |
386 | OtherOp = User->getOperand(Num: 1); |
387 | |
388 | // Don't count if the other operand is SP. |
389 | RegisterSDNode *RegNode; |
390 | if (OtherOp->getOpcode() == ISD::CopyFromReg && |
391 | (RegNode = dyn_cast_or_null<RegisterSDNode>( |
392 | Val: OtherOp->getOperand(Num: 1).getNode()))) |
393 | if ((RegNode->getReg() == X86::ESP) || |
394 | (RegNode->getReg() == X86::RSP)) |
395 | continue; |
396 | } |
397 | |
398 | // ... otherwise, count this and move on. |
399 | UseCount++; |
400 | } |
401 | |
402 | // If we have more than 1 use, then recommend for hoisting. |
403 | return (UseCount > 1); |
404 | } |
405 | |
406 | /// Return a target constant with the specified value of type i8. |
407 | inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { |
408 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
409 | } |
410 | |
411 | /// Return a target constant with the specified value, of type i32. |
412 | inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { |
413 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32); |
414 | } |
415 | |
416 | /// Return a target constant with the specified value, of type i64. |
417 | inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { |
418 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64); |
419 | } |
420 | |
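/// Convert the subvector start index of an EXTRACT_SUBVECTOR node into the
/// 128/256-bit lane immediate expected by the VEXTRACT* instructions.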
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                    const SDLoc &DL) {
423 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
424 | uint64_t Index = N->getConstantOperandVal(Num: 1); |
425 | MVT VecVT = N->getOperand(Num: 0).getSimpleValueType(); |
426 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
427 | } |
428 | |
429 | SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, |
430 | const SDLoc &DL) { |
431 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
432 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
433 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
434 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
435 | } |
436 | |
437 | SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth, |
438 | const SDLoc &DL) { |
439 | assert(VecWidth == 128 && "Unexpected vector width" ); |
440 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
441 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
442 | uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth; |
443 | assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index" ); |
444 | // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub) |
445 | // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub) |
446 | return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL); |
447 | } |
448 | |
449 | SDValue getSBBZero(SDNode *N) { |
450 | SDLoc dl(N); |
451 | MVT VT = N->getSimpleValueType(ResNo: 0); |
452 | |
453 | // Create zero. |
454 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
455 | SDValue Zero = SDValue( |
456 | CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: std::nullopt), 0); |
457 | if (VT == MVT::i64) { |
458 | Zero = SDValue( |
459 | CurDAG->getMachineNode( |
460 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
461 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero, |
462 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)), |
463 | 0); |
464 | } |
465 | |
466 | // Copy flags to the EFLAGS register and glue it to next node. |
467 | unsigned Opcode = N->getOpcode(); |
468 | assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && |
469 | "Unexpected opcode for SBB materialization" ); |
470 | unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; |
471 | SDValue EFLAGS = |
472 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
473 | N: N->getOperand(Num: FlagOpIndex), Glue: SDValue()); |
474 | |
475 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
476 | // 32-bit version. |
477 | unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; |
478 | MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
479 | VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32); |
480 | return SDValue( |
481 | CurDAG->getMachineNode(Opcode: Opc, dl, VTs, |
482 | Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}), |
483 | 0); |
484 | } |
485 | |
486 | // Helper to detect unneeded and instructions on shift amounts. Called |
487 | // from PatFrags in tablegen. |
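// For example, with a 32-bit shift Width is 5, so (and X, 31) is redundant:
// the mask keeps every bit the shift amount actually uses. Bits of X that
// are already known to be zero count as covered by the mask.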
488 | bool isUnneededShiftMask(SDNode *N, unsigned Width) const { |
489 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode" ); |
490 | const APInt &Val = N->getConstantOperandAPInt(Num: 1); |
491 | |
492 | if (Val.countr_one() >= Width) |
493 | return true; |
494 | |
495 | APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero; |
496 | return Mask.countr_one() >= Width; |
497 | } |
498 | |
499 | /// Return an SDNode that returns the value of the global base register. |
500 | /// Output instructions required to initialize the global base register, |
501 | /// if necessary. |
502 | SDNode *getGlobalBaseReg(); |
503 | |
504 | /// Return a reference to the TargetMachine, casted to the target-specific |
505 | /// type. |
506 | const X86TargetMachine &getTargetMachine() const { |
507 | return static_cast<const X86TargetMachine &>(TM); |
508 | } |
509 | |
510 | /// Return a reference to the TargetInstrInfo, casted to the target-specific |
511 | /// type. |
512 | const X86InstrInfo *getInstrInfo() const { |
513 | return Subtarget->getInstrInfo(); |
514 | } |
515 | |
516 | /// Return a condition code of the given SDNode |
517 | X86::CondCode getCondFromNode(SDNode *N) const; |
518 | |
519 | /// Address-mode matching performs shift-of-and to and-of-shift |
520 | /// reassociation in order to expose more scaled addressing |
521 | /// opportunities. |
522 | bool ComplexPatternFuncMutatesDAG() const override { |
523 | return true; |
524 | } |
525 | |
526 | bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; |
527 | |
528 | // Indicates we should prefer to use a non-temporal load for this load. |
529 | bool useNonTemporalLoad(LoadSDNode *N) const { |
530 | if (!N->isNonTemporal()) |
531 | return false; |
532 | |
533 | unsigned StoreSize = N->getMemoryVT().getStoreSize(); |
534 | |
535 | if (N->getAlign().value() < StoreSize) |
536 | return false; |
537 | |
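// Non-temporal loads only exist as (V)MOVNTDQA: 16 bytes needs SSE4.1,
// 32 bytes needs AVX2, 64 bytes needs AVX-512. There is no non-temporal
// load for scalar sizes.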
538 | switch (StoreSize) { |
539 | default: llvm_unreachable("Unsupported store size" ); |
540 | case 4: |
541 | case 8: |
542 | return false; |
543 | case 16: |
544 | return Subtarget->hasSSE41(); |
545 | case 32: |
546 | return Subtarget->hasAVX2(); |
547 | case 64: |
548 | return Subtarget->hasAVX512(); |
549 | } |
550 | } |
551 | |
552 | bool foldLoadStoreIntoMemOperand(SDNode *Node); |
553 | MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); |
554 | bool matchBitExtract(SDNode *Node); |
555 | bool shrinkAndImmediate(SDNode *N); |
556 | bool isMaskZeroExtended(SDNode *N) const; |
557 | bool tryShiftAmountMod(SDNode *N); |
558 | bool tryShrinkShlLogicImm(SDNode *N); |
559 | bool tryVPTERNLOG(SDNode *N); |
560 | bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB, |
561 | SDNode *ParentC, SDValue A, SDValue B, SDValue C, |
562 | uint8_t Imm); |
563 | bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); |
564 | bool tryMatchBitSelect(SDNode *N); |
565 | |
566 | MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
567 | const SDLoc &dl, MVT VT, SDNode *Node); |
568 | MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
569 | const SDLoc &dl, MVT VT, SDNode *Node, |
570 | SDValue &InGlue); |
571 | |
572 | bool tryOptimizeRem8Extend(SDNode *N); |
573 | |
574 | bool onlyUsesZeroFlag(SDValue Flags) const; |
575 | bool hasNoSignFlagUses(SDValue Flags) const; |
576 | bool hasNoCarryFlagUses(SDValue Flags) const; |
577 | }; |
578 | |
579 | class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy { |
580 | public: |
581 | static char ID; |
582 | explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm, |
583 | CodeGenOptLevel OptLevel) |
584 | : SelectionDAGISelLegacy( |
585 | ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {} |
586 | }; |
587 | } |
588 | |
589 | char X86DAGToDAGISelLegacy::ID = 0; |
590 | |
591 | INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) |
592 | |
593 | // Returns true if this masked compare can be implemented legally with this |
594 | // type. |
595 | static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { |
596 | unsigned Opcode = N->getOpcode(); |
597 | if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || |
598 | Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || |
599 | Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { |
600 | // We can get 256-bit 8 element types here without VLX being enabled. When |
601 | // this happens we will use 512-bit operations and the mask will not be |
602 | // zero extended. |
603 | EVT OpVT = N->getOperand(Num: 0).getValueType(); |
604 | // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the |
605 | // second operand. |
606 | if (Opcode == X86ISD::STRICT_CMPM) |
607 | OpVT = N->getOperand(Num: 1).getValueType(); |
608 | if (OpVT.is256BitVector() || OpVT.is128BitVector()) |
609 | return Subtarget->hasVLX(); |
610 | |
611 | return true; |
612 | } |
613 | // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. |
614 | if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || |
615 | Opcode == X86ISD::FSETCCM_SAE) |
616 | return true; |
617 | |
618 | return false; |
619 | } |
620 | |
621 | // Returns true if we can assume the writer of the mask has zero extended it |
622 | // for us. |
623 | bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { |
624 | // If this is an AND, check if we have a compare on either side. As long as |
625 | // one side guarantees the mask is zero extended, the AND will preserve those |
626 | // zeros. |
627 | if (N->getOpcode() == ISD::AND) |
628 | return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) || |
629 | isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget); |
630 | |
631 | return isLegalMaskCompare(N, Subtarget); |
632 | } |
633 | |
634 | bool |
635 | X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { |
636 | if (OptLevel == CodeGenOptLevel::None) |
637 | return false; |
638 | |
639 | if (!N.hasOneUse()) |
640 | return false; |
641 | |
642 | if (N.getOpcode() != ISD::LOAD) |
643 | return true; |
644 | |
645 | // Don't fold non-temporal loads if we have an instruction for them. |
646 | if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N))) |
647 | return false; |
648 | |
649 | // If N is a load, do additional profitability checks. |
650 | if (U == Root) { |
651 | switch (U->getOpcode()) { |
652 | default: break; |
653 | case X86ISD::ADD: |
654 | case X86ISD::ADC: |
655 | case X86ISD::SUB: |
656 | case X86ISD::SBB: |
657 | case X86ISD::AND: |
658 | case X86ISD::XOR: |
659 | case X86ISD::OR: |
660 | case ISD::ADD: |
661 | case ISD::UADDO_CARRY: |
662 | case ISD::AND: |
663 | case ISD::OR: |
664 | case ISD::XOR: { |
665 | SDValue Op1 = U->getOperand(Num: 1); |
666 | |
// If the other operand is an 8-bit immediate we should fold the immediate
668 | // instead. This reduces code size. |
669 | // e.g. |
670 | // movl 4(%esp), %eax |
671 | // addl $4, %eax |
672 | // vs. |
673 | // movl $4, %eax |
674 | // addl 4(%esp), %eax |
675 | // The former is 2 bytes shorter. In case where the increment is 1, then |
676 | // the saving can be 4 bytes (by using incl %eax). |
677 | if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) { |
678 | if (Imm->getAPIntValue().isSignedIntN(N: 8)) |
679 | return false; |
680 | |
681 | // If this is a 64-bit AND with an immediate that fits in 32-bits, |
682 | // prefer using the smaller and over folding the load. This is needed to |
683 | // make sure immediates created by shrinkAndImmediate are always folded. |
684 | // Ideally we would narrow the load during DAG combine and get the |
685 | // best of both worlds. |
686 | if (U->getOpcode() == ISD::AND && |
687 | Imm->getAPIntValue().getBitWidth() == 64 && |
688 | Imm->getAPIntValue().isIntN(N: 32)) |
689 | return false; |
690 | |
691 | // If this really a zext_inreg that can be represented with a movzx |
692 | // instruction, prefer that. |
693 | // TODO: We could shrink the load and fold if it is non-volatile. |
694 | if (U->getOpcode() == ISD::AND && |
695 | (Imm->getAPIntValue() == UINT8_MAX || |
696 | Imm->getAPIntValue() == UINT16_MAX || |
697 | Imm->getAPIntValue() == UINT32_MAX)) |
698 | return false; |
699 | |
// ADD/SUB can negate the immediate and use the opposite operation to fit
// 128 into a sign-extended 8-bit immediate.
702 | if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && |
703 | (-Imm->getAPIntValue()).isSignedIntN(N: 8)) |
704 | return false; |
705 | |
706 | if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && |
707 | (-Imm->getAPIntValue()).isSignedIntN(N: 8) && |
708 | hasNoCarryFlagUses(Flags: SDValue(U, 1))) |
709 | return false; |
710 | } |
711 | |
712 | // If the other operand is a TLS address, we should fold it instead. |
713 | // This produces |
714 | // movl %gs:0, %eax |
715 | // leal i@NTPOFF(%eax), %eax |
716 | // instead of |
717 | // movl $i@NTPOFF, %eax |
718 | // addl %gs:0, %eax |
719 | // if the block also has an access to a second TLS address this will save |
720 | // a load. |
721 | // FIXME: This is probably also true for non-TLS addresses. |
722 | if (Op1.getOpcode() == X86ISD::Wrapper) { |
723 | SDValue Val = Op1.getOperand(i: 0); |
724 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
725 | return false; |
726 | } |
727 | |
728 | // Don't fold load if this matches the BTS/BTR/BTC patterns. |
729 | // BTS: (or X, (shl 1, n)) |
730 | // BTR: (and X, (rotl -2, n)) |
731 | // BTC: (xor X, (shl 1, n)) |
732 | if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { |
733 | if (U->getOperand(Num: 0).getOpcode() == ISD::SHL && |
734 | isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0))) |
735 | return false; |
736 | |
737 | if (U->getOperand(Num: 1).getOpcode() == ISD::SHL && |
738 | isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0))) |
739 | return false; |
740 | } |
741 | if (U->getOpcode() == ISD::AND) { |
742 | SDValue U0 = U->getOperand(Num: 0); |
743 | SDValue U1 = U->getOperand(Num: 1); |
744 | if (U0.getOpcode() == ISD::ROTL) { |
745 | auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0)); |
746 | if (C && C->getSExtValue() == -2) |
747 | return false; |
748 | } |
749 | |
750 | if (U1.getOpcode() == ISD::ROTL) { |
751 | auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0)); |
752 | if (C && C->getSExtValue() == -2) |
753 | return false; |
754 | } |
755 | } |
756 | |
757 | break; |
758 | } |
759 | case ISD::SHL: |
760 | case ISD::SRA: |
761 | case ISD::SRL: |
762 | // Don't fold a load into a shift by immediate. The BMI2 instructions |
763 | // support folding a load, but not an immediate. The legacy instructions |
764 | // support folding an immediate, but can't fold a load. Folding an |
765 | // immediate is preferable to folding a load. |
766 | if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1))) |
767 | return false; |
768 | |
769 | break; |
770 | } |
771 | } |
772 | |
// Prevent folding a load if this can be implemented with an insert_subreg or
774 | // a move that implicitly zeroes. |
775 | if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && |
776 | isNullConstant(V: Root->getOperand(Num: 2)) && |
777 | (Root->getOperand(Num: 0).isUndef() || |
778 | ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode()))) |
779 | return false; |
780 | |
781 | return true; |
782 | } |
783 | |
784 | // Indicates it is profitable to form an AVX512 masked operation. Returning |
785 | // false will favor a masked register-register masked move or vblendm and the |
786 | // operation will be selected separately. |
787 | bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { |
788 | assert( |
789 | (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && |
790 | "Unexpected opcode!" ); |
791 | |
792 | // If the operation has additional users, the operation will be duplicated. |
793 | // Check the use count to prevent that. |
794 | // FIXME: Are there cheap opcodes we might want to duplicate? |
795 | return N->getOperand(Num: 1).hasOneUse(); |
796 | } |
797 | |
798 | /// Replace the original chain operand of the call with |
799 | /// load's chain operand and move load below the call's chain operand. |
800 | static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, |
801 | SDValue Call, SDValue OrigChain) { |
802 | SmallVector<SDValue, 8> Ops; |
803 | SDValue Chain = OrigChain.getOperand(i: 0); |
804 | if (Chain.getNode() == Load.getNode()) |
805 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
806 | else { |
807 | assert(Chain.getOpcode() == ISD::TokenFactor && |
808 | "Unexpected chain operand" ); |
809 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) |
810 | if (Chain.getOperand(i).getNode() == Load.getNode()) |
811 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
812 | else |
813 | Ops.push_back(Elt: Chain.getOperand(i)); |
814 | SDValue NewChain = |
815 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops); |
816 | Ops.clear(); |
817 | Ops.push_back(Elt: NewChain); |
818 | } |
819 | Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end()); |
820 | CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops); |
821 | CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0), |
822 | Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2)); |
823 | |
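// Finally, make the call's chain input the load's output chain so the load
// ends up immediately before the call.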
824 | Ops.clear(); |
825 | Ops.push_back(Elt: SDValue(Load.getNode(), 1)); |
826 | Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end()); |
827 | CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops); |
828 | } |
829 | |
830 | /// Return true if call address is a load and it can be |
831 | /// moved below CALLSEQ_START and the chains leading up to the call. |
832 | /// Return the CALLSEQ_START by reference as a second output. |
833 | /// In the case of a tail call, there isn't a callseq node between the call |
834 | /// chain and the load. |
835 | static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { |
836 | // The transformation is somewhat dangerous if the call's chain was glued to |
837 | // the call. After MoveBelowOrigChain the load is moved between the call and |
838 | // the chain, this can create a cycle if the load is not folded. So it is |
839 | // *really* important that we are sure the load will be folded. |
840 | if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) |
841 | return false; |
842 | auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode()); |
843 | if (!LD || |
844 | !LD->isSimple() || |
845 | LD->getAddressingMode() != ISD::UNINDEXED || |
846 | LD->getExtensionType() != ISD::NON_EXTLOAD) |
847 | return false; |
848 | |
849 | // Now let's find the callseq_start. |
850 | while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { |
851 | if (!Chain.hasOneUse()) |
852 | return false; |
853 | Chain = Chain.getOperand(i: 0); |
854 | } |
855 | |
856 | if (!Chain.getNumOperands()) |
857 | return false; |
858 | // Since we are not checking for AA here, conservatively abort if the chain |
859 | // writes to memory. It's not safe to move the callee (a load) across a store. |
860 | if (isa<MemSDNode>(Val: Chain.getNode()) && |
861 | cast<MemSDNode>(Val: Chain.getNode())->writeMem()) |
862 | return false; |
863 | if (Chain.getOperand(i: 0).getNode() == Callee.getNode()) |
864 | return true; |
865 | if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor && |
866 | Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) && |
867 | Callee.getValue(R: 1).hasOneUse()) |
868 | return true; |
869 | return false; |
870 | } |
871 | |
872 | static bool isEndbrImm64(uint64_t Imm) { |
873 | // There may be some other prefix bytes between 0xF3 and 0x0F1EFA. |
// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
875 | if ((Imm & 0x00FFFFFF) != 0x0F1EFA) |
876 | return false; |
877 | |
878 | uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, |
879 | 0x65, 0x66, 0x67, 0xf0, 0xf2}; |
int i = 24; // The low 24 bits (0x0F1EFA) have already been matched.
881 | while (i < 64) { |
882 | uint8_t Byte = (Imm >> i) & 0xFF; |
883 | if (Byte == 0xF3) |
884 | return true; |
885 | if (!llvm::is_contained(Range&: OptionalPrefixBytes, Element: Byte)) |
886 | return false; |
887 | i += 8; |
888 | } |
889 | |
890 | return false; |
891 | } |
892 | |
893 | static bool needBWI(MVT VT) { |
894 | return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8); |
895 | } |
896 | |
897 | void X86DAGToDAGISel::PreprocessISelDAG() { |
898 | bool MadeChange = false; |
899 | for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), |
900 | E = CurDAG->allnodes_end(); I != E; ) { |
901 | SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. |
902 | |
903 | // This is for CET enhancement. |
904 | // |
905 | // ENDBR32 and ENDBR64 have specific opcodes: |
906 | // ENDBR32: F3 0F 1E FB |
907 | // ENDBR64: F3 0F 1E FA |
// We do not want attackers to find unintended ENDBR32/64 opcode matches in
// the binary.
// Here's an example:
// If the compiler had to generate asm for the following code:
// a = 0xF30F1EFA
// it could, for example, generate:
// mov 0xF30F1EFA, dword ptr[a]
// In such a case, the binary would include a gadget that starts with a fake
// ENDBR64 opcode. Therefore, we split such generation into multiple
// operations so that the pattern does not show up in the binary.
918 | if (N->getOpcode() == ISD::Constant) { |
919 | MVT VT = N->getSimpleValueType(ResNo: 0); |
920 | int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue(); |
921 | int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; |
922 | if (Imm == EndbrImm || isEndbrImm64(Imm)) { |
923 | // Check that the cf-protection-branch is enabled. |
924 | Metadata *CFProtectionBranch = |
925 | MF->getFunction().getParent()->getModuleFlag( |
926 | Key: "cf-protection-branch" ); |
927 | if (CFProtectionBranch || IndirectBranchTracking) { |
928 | SDLoc dl(N); |
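// Materialize the complemented value as an opaque constant and re-invert it
// at run time, so the raw ENDBR byte pattern never appears as an immediate
// in the emitted code.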
929 | SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true); |
930 | Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT); |
931 | --I; |
932 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement); |
933 | ++I; |
934 | MadeChange = true; |
935 | continue; |
936 | } |
937 | } |
938 | } |
939 | |
940 | // If this is a target specific AND node with no flag usages, turn it back |
941 | // into ISD::AND to enable test instruction matching. |
942 | if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) { |
943 | SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
944 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
945 | --I; |
946 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
947 | ++I; |
948 | MadeChange = true; |
949 | continue; |
950 | } |
951 | |
952 | // Convert vector increment or decrement to sub/add with an all-ones |
953 | // constant: |
954 | // add X, <1, 1...> --> sub X, <-1, -1...> |
955 | // sub X, <1, 1...> --> add X, <-1, -1...> |
956 | // The all-ones vector constant can be materialized using a pcmpeq |
957 | // instruction that is commonly recognized as an idiom (has no register |
958 | // dependency), so that's better/smaller than loading a splat 1 constant. |
959 | // |
960 | // But don't do this if it would inhibit a potentially profitable load |
961 | // folding opportunity for the other operand. That only occurs with the |
962 | // intersection of: |
963 | // (1) The other operand (op0) is load foldable. |
964 | // (2) The op is an add (otherwise, we are *creating* an add and can still |
965 | // load fold the other op). |
966 | // (3) The target has AVX (otherwise, we have a destructive add and can't |
967 | // load fold the other op without killing the constant op). |
968 | // (4) The constant 1 vector has multiple uses (so it is profitable to load |
969 | // into a register anyway). |
970 | auto mayPreventLoadFold = [&]() { |
971 | return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) && |
972 | N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && |
973 | !N->getOperand(Num: 1).hasOneUse(); |
974 | }; |
975 | if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && |
976 | N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) { |
977 | APInt SplatVal; |
978 | if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) && |
979 | SplatVal.isOne()) { |
980 | SDLoc DL(N); |
981 | |
982 | MVT VT = N->getSimpleValueType(ResNo: 0); |
983 | unsigned NumElts = VT.getSizeInBits() / 32; |
984 | SDValue AllOnes = |
985 | CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts)); |
986 | AllOnes = CurDAG->getBitcast(VT, V: AllOnes); |
987 | |
988 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
989 | SDValue Res = |
990 | CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes); |
991 | --I; |
992 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
993 | ++I; |
994 | MadeChange = true; |
995 | continue; |
996 | } |
997 | } |
998 | |
999 | switch (N->getOpcode()) { |
1000 | case X86ISD::VBROADCAST: { |
1001 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1002 | // Emulate v32i16/v64i8 broadcast without BWI. |
1003 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1004 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1005 | SDLoc dl(N); |
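// Broadcast at half the vector width, then insert the narrow result into
// both halves of the wide vector.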
1006 | SDValue NarrowBCast = |
1007 | CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0)); |
1008 | SDValue Res = |
1009 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1010 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1011 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1012 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1013 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1014 | |
1015 | --I; |
1016 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1017 | ++I; |
1018 | MadeChange = true; |
1019 | continue; |
1020 | } |
1021 | |
1022 | break; |
1023 | } |
1024 | case X86ISD::VBROADCAST_LOAD: { |
1025 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1026 | // Emulate v32i16/v64i8 broadcast without BWI. |
1027 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1028 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1029 | auto *MemNode = cast<MemSDNode>(Val: N); |
1030 | SDLoc dl(N); |
1031 | SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other); |
1032 | SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; |
1033 | SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( |
1034 | Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(), |
1035 | MMO: MemNode->getMemOperand()); |
1036 | SDValue Res = |
1037 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1038 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1039 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1040 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1041 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1042 | |
1043 | --I; |
1044 | SDValue To[] = {Res, NarrowBCast.getValue(R: 1)}; |
1045 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1046 | ++I; |
1047 | MadeChange = true; |
1048 | continue; |
1049 | } |
1050 | |
1051 | break; |
1052 | } |
1053 | case ISD::LOAD: { |
1054 | // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM |
1055 | // load, then just extract the lower subvector and avoid the second load. |
1056 | auto *Ld = cast<LoadSDNode>(Val: N); |
1057 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1058 | if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() || |
1059 | !(VT.is128BitVector() || VT.is256BitVector())) |
1060 | break; |
1061 | |
1062 | MVT MaxVT = VT; |
1063 | SDNode *MaxLd = nullptr; |
1064 | SDValue Ptr = Ld->getBasePtr(); |
1065 | SDValue Chain = Ld->getChain(); |
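// Find the widest other simple load of the same pointer and chain; its low
// bits cover this load, so we can extract a subvector instead of loading
// again.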
1066 | for (SDNode *User : Ptr->uses()) { |
1067 | auto *UserLd = dyn_cast<LoadSDNode>(Val: User); |
1068 | MVT UserVT = User->getSimpleValueType(ResNo: 0); |
1069 | if (User != N && UserLd && ISD::isNormalLoad(N: User) && |
1070 | UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain && |
1071 | !User->hasAnyUseOfValue(Value: 1) && |
1072 | (UserVT.is256BitVector() || UserVT.is512BitVector()) && |
1073 | UserVT.getSizeInBits() > VT.getSizeInBits() && |
1074 | (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) { |
1075 | MaxLd = User; |
1076 | MaxVT = UserVT; |
1077 | } |
1078 | } |
1079 | if (MaxLd) { |
1080 | SDLoc dl(N); |
1081 | unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits(); |
1082 | MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts); |
SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
                                  N1: SDValue(MaxLd, 0),
                                  N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl));
1086 | SDValue Res = CurDAG->getBitcast(VT, V: Extract); |
1087 | |
1088 | --I; |
1089 | SDValue To[] = {Res, SDValue(MaxLd, 1)}; |
1090 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1091 | ++I; |
1092 | MadeChange = true; |
1093 | continue; |
1094 | } |
1095 | break; |
1096 | } |
1097 | case ISD::VSELECT: { |
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1099 | EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType(); |
1100 | if (EleVT == MVT::i1) |
1101 | break; |
1102 | |
1103 | assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!" ); |
1104 | assert(N->getValueType(0).getVectorElementType() != MVT::i16 && |
1105 | "We can't replace VSELECT with BLENDV in vXi16!" ); |
1106 | SDValue R; |
1107 | if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) == |
1108 | EleVT.getSizeInBits()) { |
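// The condition is a sign-splatted all-ones/all-zeros mask, so the select is
// a bitwise select: VPTERNLOG imm 0xCA computes A ? B : C per bit.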
1109 | R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1110 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2), |
1111 | N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8)); |
1112 | } else { |
1113 | R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1114 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), |
1115 | N3: N->getOperand(Num: 2)); |
1116 | } |
1117 | --I; |
1118 | CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode()); |
1119 | ++I; |
1120 | MadeChange = true; |
1121 | continue; |
1122 | } |
1123 | case ISD::FP_ROUND: |
1124 | case ISD::STRICT_FP_ROUND: |
1125 | case ISD::FP_TO_SINT: |
1126 | case ISD::FP_TO_UINT: |
1127 | case ISD::STRICT_FP_TO_SINT: |
1128 | case ISD::STRICT_FP_TO_UINT: { |
1129 | // Replace vector fp_to_s/uint with their X86 specific equivalent so we |
1130 | // don't need 2 sets of patterns. |
1131 | if (!N->getSimpleValueType(ResNo: 0).isVector()) |
1132 | break; |
1133 | |
1134 | unsigned NewOpc; |
1135 | switch (N->getOpcode()) { |
1136 | default: llvm_unreachable("Unexpected opcode!" ); |
1137 | case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; |
1138 | case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; |
1139 | case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; |
1140 | case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; |
1141 | case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; |
1142 | case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; |
1143 | } |
1144 | SDValue Res; |
1145 | if (N->isStrictFPOpcode()) |
1146 | Res = |
1147 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1148 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)}); |
1149 | else |
1150 | Res = |
1151 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1152 | Operand: N->getOperand(Num: 0)); |
1153 | --I; |
1154 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1155 | ++I; |
1156 | MadeChange = true; |
1157 | continue; |
1158 | } |
1159 | case ISD::SHL: |
1160 | case ISD::SRA: |
1161 | case ISD::SRL: { |
1162 | // Replace vector shifts with their X86 specific equivalent so we don't |
1163 | // need 2 sets of patterns. |
1164 | if (!N->getValueType(ResNo: 0).isVector()) |
1165 | break; |
1166 | |
1167 | unsigned NewOpc; |
1168 | switch (N->getOpcode()) { |
1169 | default: llvm_unreachable("Unexpected opcode!" ); |
1170 | case ISD::SHL: NewOpc = X86ISD::VSHLV; break; |
1171 | case ISD::SRA: NewOpc = X86ISD::VSRAV; break; |
1172 | case ISD::SRL: NewOpc = X86ISD::VSRLV; break; |
1173 | } |
1174 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1175 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
1176 | --I; |
1177 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1178 | ++I; |
1179 | MadeChange = true; |
1180 | continue; |
1181 | } |
1182 | case ISD::ANY_EXTEND: |
1183 | case ISD::ANY_EXTEND_VECTOR_INREG: { |
1184 | // Replace vector any extend with the zero extend equivalents so we don't |
1185 | // need 2 sets of patterns. Ignore vXi1 extensions. |
1186 | if (!N->getValueType(ResNo: 0).isVector()) |
1187 | break; |
1188 | |
1189 | unsigned NewOpc; |
1190 | if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) { |
1191 | assert(N->getOpcode() == ISD::ANY_EXTEND && |
1192 | "Unexpected opcode for mask vector!" ); |
1193 | NewOpc = ISD::SIGN_EXTEND; |
1194 | } else { |
1195 | NewOpc = N->getOpcode() == ISD::ANY_EXTEND |
1196 | ? ISD::ZERO_EXTEND |
1197 | : ISD::ZERO_EXTEND_VECTOR_INREG; |
1198 | } |
1199 | |
1200 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1201 | Operand: N->getOperand(Num: 0)); |
1202 | --I; |
1203 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1204 | ++I; |
1205 | MadeChange = true; |
1206 | continue; |
1207 | } |
1208 | case ISD::FCEIL: |
1209 | case ISD::STRICT_FCEIL: |
1210 | case ISD::FFLOOR: |
1211 | case ISD::STRICT_FFLOOR: |
1212 | case ISD::FTRUNC: |
1213 | case ISD::STRICT_FTRUNC: |
1214 | case ISD::FROUNDEVEN: |
1215 | case ISD::STRICT_FROUNDEVEN: |
1216 | case ISD::FNEARBYINT: |
1217 | case ISD::STRICT_FNEARBYINT: |
1218 | case ISD::FRINT: |
1219 | case ISD::STRICT_FRINT: { |
1220 | // Replace fp rounding with their X86 specific equivalent so we don't |
1221 | // need 2 sets of patterns. |
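// VRNDSCALE immediate (low 4 bits): bits 1:0 give the rounding mode
// (0 = nearest even, 1 = floor, 2 = ceil, 3 = trunc), bit 2 selects
// MXCSR.RC instead, and bit 3 suppresses the precision (inexact) exception.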
1222 | unsigned Imm; |
1223 | switch (N->getOpcode()) { |
1224 | default: llvm_unreachable("Unexpected opcode!" ); |
1225 | case ISD::STRICT_FCEIL: |
1226 | case ISD::FCEIL: Imm = 0xA; break; |
1227 | case ISD::STRICT_FFLOOR: |
1228 | case ISD::FFLOOR: Imm = 0x9; break; |
1229 | case ISD::STRICT_FTRUNC: |
1230 | case ISD::FTRUNC: Imm = 0xB; break; |
1231 | case ISD::STRICT_FROUNDEVEN: |
1232 | case ISD::FROUNDEVEN: Imm = 0x8; break; |
1233 | case ISD::STRICT_FNEARBYINT: |
1234 | case ISD::FNEARBYINT: Imm = 0xC; break; |
1235 | case ISD::STRICT_FRINT: |
1236 | case ISD::FRINT: Imm = 0x4; break; |
1237 | } |
1238 | SDLoc dl(N); |
1239 | bool IsStrict = N->isStrictFPOpcode(); |
1240 | SDValue Res; |
1241 | if (IsStrict) |
1242 | Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl, |
1243 | ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1244 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), |
1245 | CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)}); |
1246 | else |
1247 | Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0), |
1248 | N1: N->getOperand(Num: 0), |
1249 | N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
1250 | --I; |
1251 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1252 | ++I; |
1253 | MadeChange = true; |
1254 | continue; |
1255 | } |
1256 | case X86ISD::FANDN: |
1257 | case X86ISD::FAND: |
1258 | case X86ISD::FOR: |
1259 | case X86ISD::FXOR: { |
1260 | // Widen scalar fp logic ops to vector to reduce isel patterns. |
1261 | // FIXME: Can we do this during lowering/combine. |
1262 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1263 | if (VT.isVector() || VT == MVT::f128) |
1264 | break; |
1265 | |
1266 | MVT VecVT = VT == MVT::f64 ? MVT::v2f64 |
1267 | : VT == MVT::f32 ? MVT::v4f32 |
1268 | : MVT::v8f16; |
1269 | |
1270 | SDLoc dl(N); |
1271 | SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1272 | Operand: N->getOperand(Num: 0)); |
1273 | SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1274 | Operand: N->getOperand(Num: 1)); |
1275 | |
1276 | SDValue Res; |
1277 | if (Subtarget->hasSSE2()) { |
1278 | EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); |
1279 | Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0); |
1280 | Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1); |
1281 | unsigned Opc; |
1282 | switch (N->getOpcode()) { |
1283 | default: llvm_unreachable("Unexpected opcode!" ); |
1284 | case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; |
1285 | case X86ISD::FAND: Opc = ISD::AND; break; |
1286 | case X86ISD::FOR: Opc = ISD::OR; break; |
1287 | case X86ISD::FXOR: Opc = ISD::XOR; break; |
1288 | } |
1289 | Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1); |
1290 | Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res); |
1291 | } else { |
1292 | Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1); |
1293 | } |
1294 | Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res, |
1295 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1296 | --I; |
1297 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1298 | ++I; |
1299 | MadeChange = true; |
1300 | continue; |
1301 | } |
1302 | } |
1303 | |
1304 | if (OptLevel != CodeGenOptLevel::None && |
1305 | // Only do this when the target can fold the load into the call or |
1306 | // jmp. |
1307 | !Subtarget->useIndirectThunkCalls() && |
1308 | ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || |
1309 | (N->getOpcode() == X86ISD::TC_RETURN && |
1310 | (Subtarget->is64Bit() || |
1311 | !getTargetMachine().isPositionIndependent())))) { |
1312 | /// Also try moving call address load from outside callseq_start to just |
1313 | /// before the call to allow it to be folded. |
1314 | /// |
1315 | /// [Load chain] |
1316 | /// ^ |
1317 | /// | |
1318 | /// [Load] |
1319 | /// ^ ^ |
1320 | /// | | |
1321 | /// / \-- |
1322 | /// / | |
1323 | ///[CALLSEQ_START] | |
1324 | /// ^ | |
1325 | /// | | |
1326 | /// [LOAD/C2Reg] | |
1327 | /// | | |
1328 | /// \ / |
1329 | /// \ / |
1330 | /// [CALL] |
1331 | bool HasCallSeq = N->getOpcode() == X86ISD::CALL; |
1332 | SDValue Chain = N->getOperand(Num: 0); |
1333 | SDValue Load = N->getOperand(Num: 1); |
1334 | if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq)) |
1335 | continue; |
1336 | moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain); |
1337 | ++NumLoadMoved; |
1338 | MadeChange = true; |
1339 | continue; |
1340 | } |
1341 | |
// Lower fpround and fpextend nodes that target the FP stack to a store and
// load through the stack. This is a gross hack. We would like to simply mark
1344 | // these as being illegal, but when we do that, legalize produces these when |
1345 | // it expands calls, then expands these in the same legalize pass. We would |
1346 | // like dag combine to be able to hack on these between the call expansion |
1347 | // and the node legalization. As such this pass basically does "really |
1348 | // late" legalization of these inline with the X86 isel pass. |
1349 | // FIXME: This should only happen when not compiled with -O0. |
1350 | switch (N->getOpcode()) { |
1351 | default: continue; |
1352 | case ISD::FP_ROUND: |
1353 | case ISD::FP_EXTEND: |
1354 | { |
1355 | MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType(); |
1356 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1357 | |
1358 | // If any of the sources are vectors, no fp stack involved. |
1359 | if (SrcVT.isVector() || DstVT.isVector()) |
1360 | continue; |
1361 | |
1362 | // If the source and destination are SSE registers, then this is a legal |
1363 | // conversion that should not be lowered. |
1364 | const X86TargetLowering *X86Lowering = |
1365 | static_cast<const X86TargetLowering *>(TLI); |
1366 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1367 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1368 | if (SrcIsSSE && DstIsSSE) |
1369 | continue; |
1370 | |
1371 | if (!SrcIsSSE && !DstIsSSE) { |
1372 | // If this is an FPStack extension, it is a noop. |
1373 | if (N->getOpcode() == ISD::FP_EXTEND) |
1374 | continue; |
1375 | // If this is a value-preserving FPStack truncation, it is a noop. |
1376 | if (N->getConstantOperandVal(Num: 1)) |
1377 | continue; |
1378 | } |
1379 | |
1380 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1381 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1382 | // operations. Based on this, decide what we want to do. |
1383 | MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; |
1384 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1385 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1386 | MachinePointerInfo MPI = |
1387 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1388 | SDLoc dl(N); |
1389 | |
1390 | // FIXME: optimize the case where the src/dest is a load or store? |
1391 | |
1392 | SDValue Store = CurDAG->getTruncStore( |
1393 | Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT); |
1394 | SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store, |
1395 | Ptr: MemTmp, PtrInfo: MPI, MemVT); |
1396 | |
1397 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
// extload we created. This will cause general havoc on the dag because
1399 | // anything below the conversion could be folded into other existing nodes. |
1400 | // To avoid invalidating 'I', back it up to the convert node. |
1401 | --I; |
1402 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result); |
1403 | break; |
1404 | } |
1405 | |
// The sequence of events for lowering STRICT_FP versions of these nodes
// requires dealing with the chain differently, as there is already a
// preexisting chain.
1408 | case ISD::STRICT_FP_ROUND: |
1409 | case ISD::STRICT_FP_EXTEND: |
1410 | { |
1411 | MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType(); |
1412 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1413 | |
1414 | // If any of the sources are vectors, no fp stack involved. |
1415 | if (SrcVT.isVector() || DstVT.isVector()) |
1416 | continue; |
1417 | |
1418 | // If the source and destination are SSE registers, then this is a legal |
1419 | // conversion that should not be lowered. |
1420 | const X86TargetLowering *X86Lowering = |
1421 | static_cast<const X86TargetLowering *>(TLI); |
1422 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1423 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1424 | if (SrcIsSSE && DstIsSSE) |
1425 | continue; |
1426 | |
1427 | if (!SrcIsSSE && !DstIsSSE) { |
1428 | // If this is an FPStack extension, it is a noop. |
1429 | if (N->getOpcode() == ISD::STRICT_FP_EXTEND) |
1430 | continue; |
1431 | // If this is a value-preserving FPStack truncation, it is a noop. |
1432 | if (N->getConstantOperandVal(Num: 2)) |
1433 | continue; |
1434 | } |
1435 | |
1436 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1437 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1438 | // operations. Based on this, decide what we want to do. |
1439 | MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; |
1440 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1441 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1442 | MachinePointerInfo MPI = |
1443 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1444 | SDLoc dl(N); |
1445 | |
1446 | // FIXME: optimize the case where the src/dest is a load or store? |
1447 | |
// Since the operation is StrictFP, use the preexisting chain.
1449 | SDValue Store, Result; |
1450 | if (!SrcIsSSE) { |
1451 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Other); |
1452 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp}; |
1453 | Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT, |
1454 | PtrInfo: MPI, /*Align*/ Alignment: std::nullopt, |
1455 | Flags: MachineMemOperand::MOStore); |
1456 | if (N->getFlags().hasNoFPExcept()) { |
1457 | SDNodeFlags Flags = Store->getFlags(); |
1458 | Flags.setNoFPExcept(true); |
1459 | Store->setFlags(Flags); |
1460 | } |
1461 | } else { |
1462 | assert(SrcVT == MemVT && "Unexpected VT!" ); |
1463 | Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp, |
1464 | PtrInfo: MPI); |
1465 | } |
1466 | |
1467 | if (!DstIsSSE) { |
1468 | SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other); |
1469 | SDValue Ops[] = {Store, MemTmp}; |
1470 | Result = CurDAG->getMemIntrinsicNode( |
1471 | Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI, |
1472 | /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad); |
1473 | if (N->getFlags().hasNoFPExcept()) { |
1474 | SDNodeFlags Flags = Result->getFlags(); |
1475 | Flags.setNoFPExcept(true); |
1476 | Result->setFlags(Flags); |
1477 | } |
1478 | } else { |
1479 | assert(DstVT == MemVT && "Unexpected VT!" ); |
1480 | Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI); |
1481 | } |
1482 | |
1483 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
// extload we created. This will cause general havoc on the dag because
1485 | // anything below the conversion could be folded into other existing nodes. |
1486 | // To avoid invalidating 'I', back it up to the convert node. |
1487 | --I; |
1488 | CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode()); |
1489 | break; |
1490 | } |
1491 | } |
1492 | |
1493 | |
1494 | // Now that we did that, the node is dead. Increment the iterator to the |
1495 | // next node to process, then delete N. |
1496 | ++I; |
1497 | MadeChange = true; |
1498 | } |
1499 | |
1500 | // Remove any dead nodes that may have been left behind. |
1501 | if (MadeChange) |
1502 | CurDAG->RemoveDeadNodes(); |
1503 | } |
1504 | |
1505 | // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. |
1506 | bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { |
1507 | unsigned Opc = N->getMachineOpcode(); |
1508 | if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && |
1509 | Opc != X86::MOVSX64rr8) |
1510 | return false; |
1511 | |
1512 | SDValue N0 = N->getOperand(Num: 0); |
1513 | |
// We need to be extracting the low byte (sub_8bit) of an extend.
1515 | if (!N0.isMachineOpcode() || |
1516 | N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || |
1517 | N0.getConstantOperandVal(i: 1) != X86::sub_8bit) |
1518 | return false; |
1519 | |
1520 | // We're looking for either a movsx or movzx to match the original opcode. |
1521 | unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX |
1522 | : X86::MOVSX32rr8_NOREX; |
1523 | SDValue N00 = N0.getOperand(i: 0); |
1524 | if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) |
1525 | return false; |
1526 | |
1527 | if (Opc == X86::MOVSX64rr8) { |
// If we had a sign extend from 8 to 64 bits, we still need to go from 32
1529 | // to 64. |
1530 | MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N), |
1531 | VT: MVT::i64, Op1: N00); |
1532 | ReplaceUses(F: N, T: Extend); |
1533 | } else { |
1534 | // Ok we can drop this extend and just use the original extend. |
1535 | ReplaceUses(F: N, T: N00.getNode()); |
1536 | } |
1537 | |
1538 | return true; |
1539 | } |
1540 | |
1541 | void X86DAGToDAGISel::PostprocessISelDAG() { |
1542 | // Skip peepholes at -O0. |
1543 | if (TM.getOptLevel() == CodeGenOptLevel::None) |
1544 | return; |
1545 | |
1546 | SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); |
1547 | |
1548 | bool MadeChange = false; |
1549 | while (Position != CurDAG->allnodes_begin()) { |
1550 | SDNode *N = &*--Position; |
1551 | // Skip dead nodes and any non-machine opcodes. |
1552 | if (N->use_empty() || !N->isMachineOpcode()) |
1553 | continue; |
1554 | |
1555 | if (tryOptimizeRem8Extend(N)) { |
1556 | MadeChange = true; |
1557 | continue; |
1558 | } |
1559 | |
1560 | unsigned Opc = N->getMachineOpcode(); |
1561 | switch (Opc) { |
1562 | default: |
1563 | continue; |
// ANDrr/rm + TESTrr -> TESTrr/TESTmr
1565 | case X86::TEST8rr: |
1566 | case X86::TEST16rr: |
1567 | case X86::TEST32rr: |
1568 | case X86::TEST64rr: |
1569 | // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr |
1570 | case X86::CTEST8rr: |
1571 | case X86::CTEST16rr: |
1572 | case X86::CTEST32rr: |
1573 | case X86::CTEST64rr: { |
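// Only fold when the (C)TEST compares the AND result with itself and that
// value has exactly two uses (this node's two operands), so the AND becomes
// dead once the test is rewritten.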
1574 | auto &Op0 = N->getOperand(Num: 0); |
1575 | if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) || |
1576 | !Op0.isMachineOpcode()) |
1577 | continue; |
1578 | SDValue And = N->getOperand(Num: 0); |
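// CASE_ND expands to case labels for both the legacy form of an opcode and
// its APX NDD (new data destination) variant.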
1579 | #define CASE_ND(OP) \ |
1580 | case X86::OP: \ |
1581 | case X86::OP##_ND: |
1582 | switch (And.getMachineOpcode()) { |
1583 | default: |
1584 | continue; |
1585 | CASE_ND(AND8rr) |
1586 | CASE_ND(AND16rr) |
1587 | CASE_ND(AND32rr) |
1588 | CASE_ND(AND64rr) { |
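// The EFLAGS result of the AND must be unused; otherwise the AND cannot be
// removed and there is nothing to gain.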
1589 | if (And->hasAnyUseOfValue(Value: 1)) |
1590 | continue; |
1591 | SmallVector<SDValue> Ops(N->op_values()); |
1592 | Ops[0] = And.getOperand(i: 0); |
1593 | Ops[1] = And.getOperand(i: 1); |
1594 | MachineSDNode *Test = |
1595 | CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops); |
1596 | ReplaceUses(F: N, T: Test); |
1597 | MadeChange = true; |
1598 | continue; |
1599 | } |
1600 | CASE_ND(AND8rm) |
1601 | CASE_ND(AND16rm) |
1602 | CASE_ND(AND32rm) |
1603 | CASE_ND(AND64rm) { |
1604 | if (And->hasAnyUseOfValue(Value: 1)) |
1605 | continue; |
1606 | unsigned NewOpc; |
1607 | bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc); |
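// Map each AND*rm form to the corresponding TEST*mr, or to CTEST*mr when the
// user is a conditional test.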
1608 | #define FROM_TO(A, B) \ |
1609 | CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \ |
1610 | break; |
1611 | switch (And.getMachineOpcode()) { |
1612 | FROM_TO(AND8rm, TEST8mr); |
1613 | FROM_TO(AND16rm, TEST16mr); |
1614 | FROM_TO(AND32rm, TEST32mr); |
1615 | FROM_TO(AND64rm, TEST64mr); |
1616 | } |
1617 | #undef FROM_TO |
1618 | #undef CASE_ND |
1619 | // Need to swap the memory and register operand. |
1620 | SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2), |
1621 | And.getOperand(i: 3), And.getOperand(i: 4), |
1622 | And.getOperand(i: 5), And.getOperand(i: 0)}; |
1623 | // CC, Cflags. |
1624 | if (IsCTESTCC) { |
1625 | Ops.push_back(Elt: N->getOperand(Num: 2)); |
1626 | Ops.push_back(Elt: N->getOperand(Num: 3)); |
1627 | } |
1628 | // Chain of memory load |
1629 | Ops.push_back(Elt: And.getOperand(i: 6)); |
1630 | // Glue |
1631 | if (IsCTESTCC) |
1632 | Ops.push_back(Elt: N->getOperand(Num: 4)); |
1633 | |
1634 | MachineSDNode *Test = CurDAG->getMachineNode( |
1635 | Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops); |
1636 | CurDAG->setNodeMemRefs( |
1637 | N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands()); |
1638 | ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1)); |
1639 | ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0)); |
1640 | MadeChange = true; |
1641 | continue; |
1642 | } |
1643 | } |
1644 | } |
1645 | // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is |
1646 | // used. We're doing this late so we can prefer to fold the AND into masked |
1647 | // comparisons. Doing that can be better for the live range of the mask |
1648 | // register. |
1649 | case X86::KORTESTBrr: |
1650 | case X86::KORTESTWrr: |
1651 | case X86::KORTESTDrr: |
1652 | case X86::KORTESTQrr: { |
1653 | SDValue Op0 = N->getOperand(Num: 0); |
1654 | if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) || |
1655 | !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0))) |
1656 | continue; |
1657 | #define CASE(A) \ |
1658 | case X86::A: \ |
1659 | break; |
1660 | switch (Op0.getMachineOpcode()) { |
1661 | default: |
1662 | continue; |
1663 | CASE(KANDBrr) |
1664 | CASE(KANDWrr) |
1665 | CASE(KANDDrr) |
1666 | CASE(KANDQrr) |
1667 | } |
1668 | unsigned NewOpc; |
1669 | #define FROM_TO(A, B) \ |
1670 | case X86::A: \ |
1671 | NewOpc = X86::B; \ |
1672 | break; |
1673 | switch (Opc) { |
1674 | FROM_TO(KORTESTBrr, KTESTBrr) |
1675 | FROM_TO(KORTESTWrr, KTESTWrr) |
1676 | FROM_TO(KORTESTDrr, KTESTDrr) |
1677 | FROM_TO(KORTESTQrr, KTESTQrr) |
1678 | } |
1679 | // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other |
1680 | // KAND instructions and KTEST use the same ISA feature. |
1681 | if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI()) |
1682 | continue; |
1683 | #undef FROM_TO |
1684 | MachineSDNode *KTest = CurDAG->getMachineNode( |
1685 | Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1)); |
1686 | ReplaceUses(F: N, T: KTest); |
1687 | MadeChange = true; |
1688 | continue; |
1689 | } |
// Attempt to remove vector moves that were inserted to zero upper bits.
1691 | case TargetOpcode::SUBREG_TO_REG: { |
1692 | unsigned SubRegIdx = N->getConstantOperandVal(Num: 2); |
1693 | if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) |
1694 | continue; |
1695 | |
1696 | SDValue Move = N->getOperand(Num: 1); |
1697 | if (!Move.isMachineOpcode()) |
1698 | continue; |
1699 | |
// Make sure it's one of the move opcodes we recognize.
1701 | switch (Move.getMachineOpcode()) { |
1702 | default: |
1703 | continue; |
1704 | CASE(VMOVAPDrr) CASE(VMOVUPDrr) |
1705 | CASE(VMOVAPSrr) CASE(VMOVUPSrr) |
1706 | CASE(VMOVDQArr) CASE(VMOVDQUrr) |
1707 | CASE(VMOVAPDYrr) CASE(VMOVUPDYrr) |
1708 | CASE(VMOVAPSYrr) CASE(VMOVUPSYrr) |
1709 | CASE(VMOVDQAYrr) CASE(VMOVDQUYrr) |
1710 | CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr) |
1711 | CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr) |
1712 | CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr) |
1713 | CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr) |
1714 | CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr) |
1715 | CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr) |
1716 | CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr) |
1717 | CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr) |
1718 | } |
1719 | #undef CASE |
1720 | |
1721 | SDValue In = Move.getOperand(i: 0); |
1722 | if (!In.isMachineOpcode() || |
1723 | In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) |
1724 | continue; |
1725 | |
1726 | // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers |
1727 | // the SHA instructions which use a legacy encoding. |
1728 | uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags; |
1729 | if ((TSFlags & X86II::EncodingMask) != X86II::VEX && |
1730 | (TSFlags & X86II::EncodingMask) != X86II::EVEX && |
1731 | (TSFlags & X86II::EncodingMask) != X86II::XOP) |
1732 | continue; |
1733 | |
// The producing instruction is another vector instruction, so we can drop
// the move.
1736 | CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2)); |
1737 | MadeChange = true; |
1738 | } |
1739 | } |
1740 | } |
1741 | |
1742 | if (MadeChange) |
1743 | CurDAG->RemoveDeadNodes(); |
1744 | } |
1745 | |
1746 | |
1747 | /// Emit any code that needs to be executed only in the main function. |
1748 | void X86DAGToDAGISel::emitSpecialCodeForMain() { |
1749 | if (Subtarget->isTargetCygMing()) { |
1750 | TargetLowering::ArgListTy Args; |
1751 | auto &DL = CurDAG->getDataLayout(); |
1752 | |
1753 | TargetLowering::CallLoweringInfo CLI(*CurDAG); |
1754 | CLI.setChain(CurDAG->getRoot()) |
1755 | .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()), |
1756 | Target: CurDAG->getExternalSymbol(Sym: "__main" , VT: TLI->getPointerTy(DL)), |
1757 | ArgsList: std::move(Args)); |
1758 | const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); |
1759 | std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); |
1760 | CurDAG->setRoot(Result.second); |
1761 | } |
1762 | } |
1763 | |
1764 | void X86DAGToDAGISel::emitFunctionEntryCode() { |
1765 | // If this is main, emit special code for main. |
1766 | const Function &F = MF->getFunction(); |
1767 | if (F.hasExternalLinkage() && F.getName() == "main" ) |
1768 | emitSpecialCodeForMain(); |
1769 | } |
1770 | |
1771 | static bool isDispSafeForFrameIndex(int64_t Val) { |
1772 | // On 64-bit platforms, we can run into an issue where a frame index |
1773 | // includes a displacement that, when added to the explicit displacement, |
1774 | // will overflow the displacement field. Assuming that the frame index |
1775 | // displacement fits into a 31-bit integer (which is only slightly more |
1776 | // aggressive than the current fundamental assumption that it fits into |
1777 | // a 32-bit integer), a 31-bit disp should always be safe. |
1778 | return isInt<31>(x: Val); |
1779 | } |
1780 | |
1781 | bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, |
1782 | X86ISelAddressMode &AM) { |
1783 | // We may have already matched a displacement and the caller just added the |
1784 | // symbolic displacement. So we still need to do the checks even if Offset |
1785 | // is zero. |
1786 | |
1787 | int64_t Val = AM.Disp + Offset; |
1788 | |
1789 | // Cannot combine ExternalSymbol displacements with integer offsets. |
1790 | if (Val != 0 && (AM.ES || AM.MCSym)) |
1791 | return true; |
1792 | |
1793 | CodeModel::Model M = TM.getCodeModel(); |
1794 | if (Subtarget->is64Bit()) { |
1795 | if (Val != 0 && |
1796 | !X86::isOffsetSuitableForCodeModel(Offset: Val, M, |
1797 | hasSymbolicDisplacement: AM.hasSymbolicDisplacement())) |
1798 | return true; |
1799 | // In addition to the checks required for a register base, check that |
1800 | // we do not try to use an unsafe Disp with a frame index. |
1801 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && |
1802 | !isDispSafeForFrameIndex(Val)) |
1803 | return true; |
1804 | // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to |
1805 | // 64 bits. Instructions with 32-bit register addresses perform this zero |
1806 | // extension for us and we can safely ignore the high bits of Offset. |
1807 | // Instructions with only a 32-bit immediate address do not, though: they |
// sign extend instead. This means that only the low 2GB of the address space
// is directly addressable; we need indirect addressing for the high 2GB of
// address space.
1811 | // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the |
1812 | // implicit zero extension of instructions would cover up any problem. |
1813 | // However, we have asserts elsewhere that get triggered if we do, so keep |
1814 | // the checks for now. |
1815 | // TODO: We would actually be able to accept these, as well as the same |
1816 | // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand |
1817 | // to get an address size override to be emitted. However, this |
1818 | // pseudo-register is not part of any register class and therefore causes |
1819 | // MIR verification to fail. |
1820 | if (Subtarget->isTarget64BitILP32() && !isUInt<31>(x: Val) && |
1821 | !AM.hasBaseOrIndexReg()) |
1822 | return true; |
1823 | } |
1824 | AM.Disp = Val; |
1825 | return false; |
1826 | } |
1827 | |
1828 | bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
1829 | bool AllowSegmentRegForX32) { |
1830 | SDValue Address = N->getOperand(Num: 1); |
1831 | |
1832 | // load gs:0 -> GS segment register. |
1833 | // load fs:0 -> FS segment register. |
1834 | // |
1835 | // This optimization is generally valid because the GNU TLS model defines that |
1836 | // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode |
1837 | // with 32-bit registers, as we get in ILP32 mode, those registers are first |
// zero-extended to 64 bits and then added to the base address, which gives
1839 | // unwanted results when the register holds a negative value. |
1840 | // For more information see http://people.redhat.com/drepper/tls.pdf |
1841 | if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr && |
1842 | !IndirectTlsSegRefs && |
1843 | (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || |
1844 | Subtarget->isTargetFuchsia())) { |
1845 | if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) |
1846 | return true; |
1847 | switch (N->getPointerInfo().getAddrSpace()) { |
1848 | case X86AS::GS: |
1849 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
1850 | return false; |
1851 | case X86AS::FS: |
1852 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
1853 | return false; |
1854 | // Address space X86AS::SS is not handled here, because it is not used to |
1855 | // address TLS areas. |
1856 | } |
1857 | } |
1858 | |
1859 | return true; |
1860 | } |
1861 | |
1862 | /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing |
1863 | /// mode. These wrap things that will resolve down into a symbol reference. |
1864 | /// If no match is possible, this returns true, otherwise it returns false. |
1865 | bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { |
1866 | // If the addressing mode already has a symbol as the displacement, we can |
1867 | // never match another symbol. |
1868 | if (AM.hasSymbolicDisplacement()) |
1869 | return true; |
1870 | |
1871 | bool IsRIPRelTLS = false; |
1872 | bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; |
1873 | if (IsRIPRel) { |
1874 | SDValue Val = N.getOperand(i: 0); |
1875 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
1876 | IsRIPRelTLS = true; |
1877 | } |
1878 | |
1879 | // We can't use an addressing mode in the 64-bit large code model. |
1880 | // Global TLS addressing is an exception. In the medium code model, |
// we can use such a mode when RIP wrappers are present.
1882 | // That signifies access to globals that are known to be "near", |
1883 | // such as the GOT itself. |
1884 | CodeModel::Model M = TM.getCodeModel(); |
1885 | if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS) |
1886 | return true; |
1887 | |
1888 | // Base and index reg must be 0 in order to use %rip as base. |
1889 | if (IsRIPRel && AM.hasBaseOrIndexReg()) |
1890 | return true; |
1891 | |
1892 | // Make a local copy in case we can't do this fold. |
1893 | X86ISelAddressMode Backup = AM; |
1894 | |
1895 | int64_t Offset = 0; |
1896 | SDValue N0 = N.getOperand(i: 0); |
1897 | if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) { |
1898 | AM.GV = G->getGlobal(); |
1899 | AM.SymbolFlags = G->getTargetFlags(); |
1900 | Offset = G->getOffset(); |
1901 | } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) { |
1902 | AM.CP = CP->getConstVal(); |
1903 | AM.Alignment = CP->getAlign(); |
1904 | AM.SymbolFlags = CP->getTargetFlags(); |
1905 | Offset = CP->getOffset(); |
1906 | } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) { |
1907 | AM.ES = S->getSymbol(); |
1908 | AM.SymbolFlags = S->getTargetFlags(); |
1909 | } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) { |
1910 | AM.MCSym = S->getMCSymbol(); |
1911 | } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) { |
1912 | AM.JT = J->getIndex(); |
1913 | AM.SymbolFlags = J->getTargetFlags(); |
1914 | } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) { |
1915 | AM.BlockAddr = BA->getBlockAddress(); |
1916 | AM.SymbolFlags = BA->getTargetFlags(); |
1917 | Offset = BA->getOffset(); |
1918 | } else |
1919 | llvm_unreachable("Unhandled symbol reference node." ); |
1920 | |
1921 | // Can't use an addressing mode with large globals. |
1922 | if (Subtarget->is64Bit() && !IsRIPRel && AM.GV && |
1923 | TM.isLargeGlobalValue(GV: AM.GV)) { |
1924 | AM = Backup; |
1925 | return true; |
1926 | } |
1927 | |
1928 | if (foldOffsetIntoAddress(Offset, AM)) { |
1929 | AM = Backup; |
1930 | return true; |
1931 | } |
1932 | |
1933 | if (IsRIPRel) |
1934 | AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64)); |
1935 | |
1936 | // Commit the changes now that we know this fold is safe. |
1937 | return false; |
1938 | } |
1939 | |
1940 | /// Add the specified node to the specified addressing mode, returning true if |
1941 | /// it cannot be done. This just pattern matches for the addressing mode. |
1942 | bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { |
1943 | if (matchAddressRecursively(N, AM, Depth: 0)) |
1944 | return true; |
1945 | |
1946 | // Post-processing: Make a second attempt to fold a load, if we now know |
1947 | // that there will not be any other register. This is only performed for |
1948 | // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded |
1949 | // any foldable load the first time. |
1950 | if (Subtarget->isTarget64BitILP32() && |
1951 | AM.BaseType == X86ISelAddressMode::RegBase && |
1952 | AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { |
1953 | SDValue Save_Base_Reg = AM.Base_Reg; |
1954 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) { |
1955 | AM.Base_Reg = SDValue(); |
1956 | if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true)) |
1957 | AM.Base_Reg = Save_Base_Reg; |
1958 | } |
1959 | } |
1960 | |
1961 | // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has |
1962 | // a smaller encoding and avoids a scaled-index. |
1963 | if (AM.Scale == 2 && |
1964 | AM.BaseType == X86ISelAddressMode::RegBase && |
1965 | AM.Base_Reg.getNode() == nullptr) { |
1966 | AM.Base_Reg = AM.IndexReg; |
1967 | AM.Scale = 1; |
1968 | } |
1969 | |
1970 | // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, |
1971 | // because it has a smaller encoding. |
1972 | if (TM.getCodeModel() != CodeModel::Large && |
1973 | (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() && |
1974 | AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && |
1975 | AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && |
1976 | AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) { |
1977 | AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64); |
1978 | } |
1979 | |
1980 | return false; |
1981 | } |
1982 | |
1983 | bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, |
1984 | unsigned Depth) { |
1985 | // Add an artificial use to this node so that we can keep track of |
1986 | // it if it gets CSE'd with a different node. |
1987 | HandleSDNode Handle(N); |
1988 | |
1989 | X86ISelAddressMode Backup = AM; |
1990 | if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) && |
1991 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1)) |
1992 | return false; |
1993 | AM = Backup; |
1994 | |
1995 | // Try again after commutating the operands. |
1996 | if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
1997 | Depth: Depth + 1) && |
1998 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1)) |
1999 | return false; |
2000 | AM = Backup; |
2001 | |
2002 | // If we couldn't fold both operands into the address at the same time, |
2003 | // see if we can just put each operand into a register and fold at least |
2004 | // the add. |
2005 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2006 | !AM.Base_Reg.getNode() && |
2007 | !AM.IndexReg.getNode()) { |
2008 | N = Handle.getValue(); |
2009 | AM.Base_Reg = N.getOperand(i: 0); |
2010 | AM.IndexReg = N.getOperand(i: 1); |
2011 | AM.Scale = 1; |
2012 | return false; |
2013 | } |
2014 | N = Handle.getValue(); |
2015 | return true; |
2016 | } |
2017 | |
2018 | // Insert a node into the DAG at least before the Pos node's position. This |
2019 | // will reposition the node as needed, and will assign it a node ID that is <= |
2020 | // the Pos node's ID. Note that this does *not* preserve the uniqueness of node |
2021 | // IDs! The selection DAG must no longer depend on their uniqueness when this |
2022 | // is used. |
2023 | static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { |
2024 | if (N->getNodeId() == -1 || |
2025 | (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) > |
2026 | SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) { |
2027 | DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode()); |
2028 | // Mark Node as invalid for pruning as after this it may be a successor to a |
// selected node but otherwise be in the same position as Pos.
2030 | // Conservatively mark it with the same -abs(Id) to assure node id |
2031 | // invariant is preserved. |
2032 | N->setNodeId(Pos->getNodeId()); |
2033 | SelectionDAGISel::InvalidateNodeId(N: N.getNode()); |
2034 | } |
2035 | } |
2036 | |
2037 | // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if |
// safe. This allows us to convert the shift and AND into an h-register
2039 | // extract and a scaled index. Returns false if the simplification is |
2040 | // performed. |
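// For example, with C1 == 2: (x >> 6) & 0x3fc becomes ((x >> 8) & 0xff) << 2,
// i.e. a high-byte extract used as an index with scale 4.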
2041 | static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, |
2042 | uint64_t Mask, |
2043 | SDValue Shift, SDValue X, |
2044 | X86ISelAddressMode &AM) { |
2045 | if (Shift.getOpcode() != ISD::SRL || |
2046 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2047 | !Shift.hasOneUse()) |
2048 | return true; |
2049 | |
2050 | int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1); |
2051 | if (ScaleLog <= 0 || ScaleLog >= 4 || |
2052 | Mask != (0xffu << ScaleLog)) |
2053 | return true; |
2054 | |
2055 | MVT XVT = X.getSimpleValueType(); |
2056 | MVT VT = N.getSimpleValueType(); |
2057 | SDLoc DL(N); |
2058 | SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8); |
2059 | SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT); |
2060 | SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight); |
2061 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask); |
2062 | SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT); |
2063 | SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8); |
2064 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount); |
2065 | |
2066 | // Insert the new nodes into the topological ordering. We must do this in |
2067 | // a valid topological ordering as nothing is going to go back and re-sort |
2068 | // these nodes. We continually insert before 'N' in sequence as this is |
2069 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2070 | // hierarchy left to express. |
2071 | insertDAGNode(DAG, Pos: N, N: Eight); |
2072 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2073 | insertDAGNode(DAG, Pos: N, N: Srl); |
2074 | insertDAGNode(DAG, Pos: N, N: And); |
2075 | insertDAGNode(DAG, Pos: N, N: Ext); |
2076 | insertDAGNode(DAG, Pos: N, N: ShlCount); |
2077 | insertDAGNode(DAG, Pos: N, N: Shl); |
2078 | DAG.ReplaceAllUsesWith(From: N, To: Shl); |
2079 | DAG.RemoveDeadNode(N: N.getNode()); |
2080 | AM.IndexReg = Ext; |
2081 | AM.Scale = (1 << ScaleLog); |
2082 | return false; |
2083 | } |
2084 | |
2085 | // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this |
2086 | // allows us to fold the shift into this addressing mode. Returns false if the |
2087 | // transform succeeded. |
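// For example, (x << 2) & 0x3fc becomes (x & 0xff) << 2, letting the shift
// become an address scale of 4.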
2088 | static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, |
2089 | X86ISelAddressMode &AM) { |
2090 | SDValue Shift = N.getOperand(i: 0); |
2091 | |
2092 | // Use a signed mask so that shifting right will insert sign bits. These |
2093 | // bits will be removed when we shift the result left so it doesn't matter |
2094 | // what we use. This might allow a smaller immediate encoding. |
2095 | int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue(); |
2096 | |
2097 | // If we have an any_extend feeding the AND, look through it to see if there |
2098 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
2099 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
2100 | bool FoundAnyExtend = false; |
2101 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
2102 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
2103 | isUInt<32>(x: Mask)) { |
2104 | FoundAnyExtend = true; |
2105 | Shift = Shift.getOperand(i: 0); |
2106 | } |
2107 | |
2108 | if (Shift.getOpcode() != ISD::SHL || |
2109 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2110 | return true; |
2111 | |
2112 | SDValue X = Shift.getOperand(i: 0); |
2113 | |
2114 | // Not likely to be profitable if either the AND or SHIFT node has more |
2115 | // than one use (unless all uses are for address computation). Besides, |
2116 | // isel mechanism requires their node ids to be reused. |
2117 | if (!N.hasOneUse() || !Shift.hasOneUse()) |
2118 | return true; |
2119 | |
2120 | // Verify that the shift amount is something we can fold. |
2121 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2122 | if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) |
2123 | return true; |
2124 | |
2125 | MVT VT = N.getSimpleValueType(); |
2126 | SDLoc DL(N); |
2127 | if (FoundAnyExtend) { |
2128 | SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X); |
2129 | insertDAGNode(DAG, Pos: N, N: NewX); |
2130 | X = NewX; |
2131 | } |
2132 | |
2133 | SDValue NewMask = DAG.getConstant(Val: Mask >> ShiftAmt, DL, VT); |
2134 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask); |
2135 | SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1)); |
2136 | |
2137 | // Insert the new nodes into the topological ordering. We must do this in |
2138 | // a valid topological ordering as nothing is going to go back and re-sort |
2139 | // these nodes. We continually insert before 'N' in sequence as this is |
2140 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2141 | // hierarchy left to express. |
2142 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2143 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2144 | insertDAGNode(DAG, Pos: N, N: NewShift); |
2145 | DAG.ReplaceAllUsesWith(From: N, To: NewShift); |
2146 | DAG.RemoveDeadNode(N: N.getNode()); |
2147 | |
2148 | AM.Scale = 1 << ShiftAmt; |
2149 | AM.IndexReg = NewAnd; |
2150 | return false; |
2151 | } |
2152 | |
2153 | // Implement some heroics to detect shifts of masked values where the mask can |
2154 | // be replaced by extending the shift and undoing that in the addressing mode |
2155 | // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and |
2156 | // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in |
2157 | // the addressing mode. This results in code such as: |
2158 | // |
2159 | // int f(short *y, int *lookup_table) { |
2160 | // ... |
2161 | // return *y + lookup_table[*y >> 11]; |
2162 | // } |
2163 | // |
2164 | // Turning into: |
2165 | // movzwl (%rdi), %eax |
2166 | // movl %eax, %ecx |
2167 | // shrl $11, %ecx |
2168 | // addl (%rsi,%rcx,4), %eax |
2169 | // |
2170 | // Instead of: |
2171 | // movzwl (%rdi), %eax |
2172 | // movl %eax, %ecx |
2173 | // shrl $9, %ecx |
// andl $124, %ecx
2175 | // addl (%rsi,%rcx), %eax |
2176 | // |
2177 | // Note that this function assumes the mask is provided as a mask *after* the |
2178 | // value is shifted. The input chain may or may not match that, but computing |
2179 | // such a mask is trivial. |
2180 | static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, |
2181 | uint64_t Mask, |
2182 | SDValue Shift, SDValue X, |
2183 | X86ISelAddressMode &AM) { |
2184 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || |
2185 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2186 | return true; |
2187 | |
2188 | // We need to ensure that mask is a continuous run of bits. |
2189 | unsigned MaskIdx, MaskLen; |
2190 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2191 | return true; |
2192 | unsigned MaskLZ = 64 - (MaskIdx + MaskLen); |
2193 | |
2194 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2195 | |
2196 | // The amount of shift we're trying to fit into the addressing mode is taken |
2197 | // from the shifted mask index (number of trailing zeros of the mask). |
2198 | unsigned AMShiftAmt = MaskIdx; |
2199 | |
2200 | // There is nothing we can do here unless the mask is removing some bits. |
2201 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2202 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2203 | |
2204 | // Scale the leading zero count down based on the actual size of the value. |
2205 | // Also scale it down based on the size of the shift. |
2206 | unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; |
2207 | if (MaskLZ < ScaleDown) |
2208 | return true; |
2209 | MaskLZ -= ScaleDown; |
2210 | |
2211 | // The final check is to ensure that any masked out high bits of X are |
2212 | // already known to be zero. Otherwise, the mask has a semantic impact |
2213 | // other than masking out a couple of low bits. Unfortunately, because of |
2214 | // the mask, zero extensions will be removed from operands in some cases. |
2215 | // This code works extra hard to look through extensions because we can |
2216 | // replace them with zero extensions cheaply if necessary. |
2217 | bool ReplacingAnyExtend = false; |
2218 | if (X.getOpcode() == ISD::ANY_EXTEND) { |
2219 | unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - |
2220 | X.getOperand(i: 0).getSimpleValueType().getSizeInBits(); |
2221 | // Assume that we'll replace the any-extend with a zero-extend, and |
2222 | // narrow the search to the extended value. |
2223 | X = X.getOperand(i: 0); |
2224 | MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; |
2225 | ReplacingAnyExtend = true; |
2226 | } |
2227 | APInt MaskedHighBits = |
2228 | APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ); |
2229 | if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits)) |
2230 | return true; |
2231 | |
2232 | // We've identified a pattern that can be transformed into a single shift |
2233 | // and an addressing mode. Make it so. |
2234 | MVT VT = N.getSimpleValueType(); |
2235 | if (ReplacingAnyExtend) { |
2236 | assert(X.getValueType() != VT); |
2237 | // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. |
2238 | SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X); |
2239 | insertDAGNode(DAG, Pos: N, N: NewX); |
2240 | X = NewX; |
2241 | } |
2242 | |
2243 | MVT XVT = X.getSimpleValueType(); |
2244 | SDLoc DL(N); |
2245 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2246 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2247 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT); |
2248 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2249 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2250 | |
2251 | // Insert the new nodes into the topological ordering. We must do this in |
2252 | // a valid topological ordering as nothing is going to go back and re-sort |
2253 | // these nodes. We continually insert before 'N' in sequence as this is |
2254 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2255 | // hierarchy left to express. |
2256 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2257 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2258 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2259 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2260 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2261 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2262 | DAG.RemoveDeadNode(N: N.getNode()); |
2263 | |
2264 | AM.Scale = 1 << AMShiftAmt; |
2265 | AM.IndexReg = NewExt; |
2266 | return false; |
2267 | } |
2268 | |
2269 | // Transform "(X >> SHIFT) & (MASK << C1)" to |
2270 | // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be |
2271 | // matched to a BEXTR later. Returns false if the simplification is performed. |
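// For example, (x >> 4) & (0x1f << 2) becomes ((x >> 6) & 0x1f) << 2; the
// shift-and-mask maps to BEXTR and the final shl to an address scale of 4.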
2272 | static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, |
2273 | uint64_t Mask, |
2274 | SDValue Shift, SDValue X, |
2275 | X86ISelAddressMode &AM, |
2276 | const X86Subtarget &Subtarget) { |
2277 | if (Shift.getOpcode() != ISD::SRL || |
2278 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2279 | !Shift.hasOneUse() || !N.hasOneUse()) |
2280 | return true; |
2281 | |
2282 | // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. |
2283 | if (!Subtarget.hasTBM() && |
2284 | !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) |
2285 | return true; |
2286 | |
2287 | // We need to ensure that mask is a continuous run of bits. |
2288 | unsigned MaskIdx, MaskLen; |
2289 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2290 | return true; |
2291 | |
2292 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2293 | |
2294 | // The amount of shift we're trying to fit into the addressing mode is taken |
2295 | // from the shifted mask index (number of trailing zeros of the mask). |
2296 | unsigned AMShiftAmt = MaskIdx; |
2297 | |
2298 | // There is nothing we can do here unless the mask is removing some bits. |
2299 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2300 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2301 | |
2302 | MVT XVT = X.getSimpleValueType(); |
2303 | MVT VT = N.getSimpleValueType(); |
2304 | SDLoc DL(N); |
2305 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2306 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2307 | SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT); |
2308 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask); |
2309 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT); |
2310 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2311 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2312 | |
2313 | // Insert the new nodes into the topological ordering. We must do this in |
2314 | // a valid topological ordering as nothing is going to go back and re-sort |
2315 | // these nodes. We continually insert before 'N' in sequence as this is |
2316 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2317 | // hierarchy left to express. |
2318 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2319 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2320 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2321 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2322 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2323 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2324 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2325 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2326 | DAG.RemoveDeadNode(N: N.getNode()); |
2327 | |
2328 | AM.Scale = 1 << AMShiftAmt; |
2329 | AM.IndexReg = NewExt; |
2330 | return false; |
2331 | } |
2332 | |
2333 | // Attempt to peek further into a scaled index register, collecting additional |
// extensions / offsets / etc. Returns \p N if we can't peek any further.
2335 | SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N, |
2336 | X86ISelAddressMode &AM, |
2337 | unsigned Depth) { |
2338 | assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched" ); |
2339 | assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) && |
2340 | "Illegal index scale" ); |
2341 | |
2342 | // Limit recursion. |
2343 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2344 | return N; |
2345 | |
2346 | EVT VT = N.getValueType(); |
2347 | unsigned Opc = N.getOpcode(); |
2348 | |
2349 | // index: add(x,c) -> index: x, disp + c |
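// For example, with AM.Scale == 4, add(x, 8) folds 4 * 8 == 32 into the
// displacement and continues matching x as the index.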
2350 | if (CurDAG->isBaseWithConstantOffset(Op: N)) { |
2351 | auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1)); |
2352 | uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale; |
2353 | if (!foldOffsetIntoAddress(Offset, AM)) |
2354 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2355 | } |
2356 | |
2357 | // index: add(x,x) -> index: x, scale * 2 |
2358 | if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) { |
2359 | if (AM.Scale <= 4) { |
2360 | AM.Scale *= 2; |
2361 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2362 | } |
2363 | } |
2364 | |
2365 | // index: shl(x,i) -> index: x, scale * (1 << i) |
2366 | if (Opc == X86ISD::VSHLI) { |
2367 | uint64_t ShiftAmt = N.getConstantOperandVal(i: 1); |
2368 | uint64_t ScaleAmt = 1ULL << ShiftAmt; |
2369 | if ((AM.Scale * ScaleAmt) <= 8) { |
2370 | AM.Scale *= ScaleAmt; |
2371 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2372 | } |
2373 | } |
2374 | |
2375 | // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c) |
2376 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
2377 | if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2378 | SDValue Src = N.getOperand(i: 0); |
2379 | if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() && |
2380 | Src.hasOneUse()) { |
2381 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2382 | SDValue AddSrc = Src.getOperand(i: 0); |
2383 | auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1)); |
2384 | uint64_t Offset = (uint64_t)AddVal->getSExtValue(); |
2385 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
2386 | SDLoc DL(N); |
2387 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2388 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
2389 | SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal); |
2390 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2391 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2392 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2393 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2394 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2395 | return ExtSrc; |
2396 | } |
2397 | } |
2398 | } |
2399 | } |
2400 | |
2401 | // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c) |
2402 | // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c) |
2403 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
2404 | if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2405 | SDValue Src = N.getOperand(i: 0); |
2406 | unsigned SrcOpc = Src.getOpcode(); |
2407 | if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) || |
2408 | CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) && |
2409 | Src.hasOneUse()) { |
2410 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2411 | SDValue AddSrc = Src.getOperand(i: 0); |
2412 | uint64_t Offset = Src.getConstantOperandVal(i: 1); |
2413 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
2414 | SDLoc DL(N); |
2415 | SDValue Res; |
2416 | // If we're also scaling, see if we can use that as well. |
2417 | if (AddSrc.getOpcode() == ISD::SHL && |
2418 | isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) { |
2419 | SDValue ShVal = AddSrc.getOperand(i: 0); |
2420 | uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1); |
2421 | APInt HiBits = |
2422 | APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt); |
2423 | uint64_t ScaleAmt = 1ULL << ShAmt; |
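// Only fold the inner shl into the scale if the combined scale stays <= 8
// and no set bits are shifted out (nuw, or known-zero high bits), so that
// zext(shl(x, c)) == shl(zext(x), c).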
2424 | if ((AM.Scale * ScaleAmt) <= 8 && |
2425 | (AddSrc->getFlags().hasNoUnsignedWrap() || |
2426 | CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) { |
2427 | AM.Scale *= ScaleAmt; |
2428 | SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal); |
2429 | SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal, |
2430 | N2: AddSrc.getOperand(i: 1)); |
2431 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal); |
2432 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift); |
2433 | AddSrc = ExtShift; |
2434 | Res = ExtShVal; |
2435 | } |
2436 | } |
2437 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2438 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
2439 | SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal); |
2440 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2441 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2442 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2443 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2444 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2445 | return Res ? Res : ExtSrc; |
2446 | } |
2447 | } |
2448 | } |
2449 | } |
2450 | |
2451 | // TODO: Handle extensions, shifted masks etc. |
2452 | return N; |
2453 | } |
2454 | |
2455 | bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
2456 | unsigned Depth) { |
2457 | SDLoc dl(N); |
2458 | LLVM_DEBUG({ |
2459 | dbgs() << "MatchAddress: " ; |
2460 | AM.dump(CurDAG); |
2461 | }); |
2462 | // Limit recursion. |
2463 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2464 | return matchAddressBase(N, AM); |
2465 | |
2466 | // If this is already a %rip relative address, we can only merge immediates |
2467 | // into it. Instead of handling this in every case, we handle it here. |
2468 | // RIP relative addressing: %rip + 32-bit displacement! |
2469 | if (AM.isRIPRelative()) { |
2470 | // FIXME: JumpTable and ExternalSymbol address currently don't like |
2471 | // displacements. It isn't very important, but this should be fixed for |
2472 | // consistency. |
2473 | if (!(AM.ES || AM.MCSym) && AM.JT != -1) |
2474 | return true; |
2475 | |
2476 | if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N)) |
2477 | if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM)) |
2478 | return false; |
2479 | return true; |
2480 | } |
2481 | |
2482 | switch (N.getOpcode()) { |
2483 | default: break; |
2484 | case ISD::LOCAL_RECOVER: { |
2485 | if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) |
2486 | if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) { |
2487 | // Use the symbol and don't prefix it. |
2488 | AM.MCSym = ESNode->getMCSymbol(); |
2489 | return false; |
2490 | } |
2491 | break; |
2492 | } |
2493 | case ISD::Constant: { |
2494 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2495 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2496 | return false; |
2497 | break; |
2498 | } |
2499 | |
2500 | case X86ISD::Wrapper: |
2501 | case X86ISD::WrapperRIP: |
2502 | if (!matchWrapper(N, AM)) |
2503 | return false; |
2504 | break; |
2505 | |
2506 | case ISD::LOAD: |
2507 | if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM)) |
2508 | return false; |
2509 | break; |
2510 | |
2511 | case ISD::FrameIndex: |
2512 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2513 | AM.Base_Reg.getNode() == nullptr && |
2514 | (!Subtarget->is64Bit() || isDispSafeForFrameIndex(Val: AM.Disp))) { |
2515 | AM.BaseType = X86ISelAddressMode::FrameIndexBase; |
2516 | AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex(); |
2517 | return false; |
2518 | } |
2519 | break; |
2520 | |
2521 | case ISD::SHL: |
2522 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2523 | break; |
2524 | |
2525 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) { |
2526 | unsigned Val = CN->getZExtValue(); |
2527 | // Note that we handle x<<1 as (,x,2) rather than (x,x) here so |
2528 | // that the base operand remains free for further matching. If |
2529 | // the base doesn't end up getting used, a post-processing step |
2530 | // in MatchAddress turns (,x,2) into (x,x), which is cheaper. |
2531 | if (Val == 1 || Val == 2 || Val == 3) { |
2532 | SDValue ShVal = N.getOperand(i: 0); |
2533 | AM.Scale = 1 << Val; |
2534 | AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1); |
2535 | return false; |
2536 | } |
2537 | } |
2538 | break; |
2539 | |
2540 | case ISD::SRL: { |
2541 | // Scale must not be used already. |
2542 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2543 | |
2544 | // We only handle up to 64-bit values here as those are what matter for |
2545 | // addressing mode optimizations. |
2546 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2547 | "Unexpected value size!" ); |
2548 | |
2549 | SDValue And = N.getOperand(i: 0); |
2550 | if (And.getOpcode() != ISD::AND) break; |
2551 | SDValue X = And.getOperand(i: 0); |
2552 | |
2553 | // The mask used for the transform is expected to be post-shift, but we |
2554 | // found the shift first so just apply the shift to the mask before passing |
2555 | // it down. |
2556 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) || |
2557 | !isa<ConstantSDNode>(Val: And.getOperand(i: 1))) |
2558 | break; |
2559 | uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1); |
2560 | |
2561 | // Try to fold the mask and shift into the scale, and return false if we |
2562 | // succeed. |
2563 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM)) |
2564 | return false; |
2565 | break; |
2566 | } |
2567 | |
2568 | case ISD::SMUL_LOHI: |
2569 | case ISD::UMUL_LOHI: |
2570 | // A mul_lohi where we need the low part can be folded as a plain multiply. |
2571 | if (N.getResNo() != 0) break; |
2572 | [[fallthrough]]; |
2573 | case ISD::MUL: |
2574 | case X86ISD::MUL_IMM: |
2575 | // X*[3,5,9] -> X+X*[2,4,8] |
2576 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2577 | AM.Base_Reg.getNode() == nullptr && |
2578 | AM.IndexReg.getNode() == nullptr) { |
2579 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2580 | if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || |
2581 | CN->getZExtValue() == 9) { |
2582 | AM.Scale = unsigned(CN->getZExtValue())-1; |
2583 | |
2584 | SDValue MulVal = N.getOperand(i: 0); |
2585 | SDValue Reg; |
2586 | |
2587 | // Okay, we know that we have a scale by now. However, if the scaled |
2588 | // value is an add of something and a constant, we can fold the |
2589 | // constant into the disp field here. |
2590 | if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && |
2591 | isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) { |
2592 | Reg = MulVal.getOperand(i: 0); |
2593 | auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1)); |
2594 | uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); |
2595 | if (foldOffsetIntoAddress(Offset: Disp, AM)) |
2596 | Reg = N.getOperand(i: 0); |
2597 | } else { |
2598 | Reg = N.getOperand(i: 0); |
2599 | } |
2600 | |
2601 | AM.IndexReg = AM.Base_Reg = Reg; |
2602 | return false; |
2603 | } |
2604 | } |
2605 | break; |
2606 | |
2607 | case ISD::SUB: { |
// Given A-B, if A can be completely folded into the address, leaving the
// index field unused, use -B as the index. This is a win if A has multiple
// parts that can be folded into the address. Also, this saves a mov if the
// base register has other uses, since it avoids a two-address sub
// instruction; however,
2613 | // it costs an additional mov if the index register has other uses. |
2614 | |
2615 | // Add an artificial use to this node so that we can keep track of |
2616 | // it if it gets CSE'd with a different node. |
2617 | HandleSDNode Handle(N); |
2618 | |
2619 | // Test if the LHS of the sub can be folded. |
2620 | X86ISelAddressMode Backup = AM; |
2621 | if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) { |
2622 | N = Handle.getValue(); |
2623 | AM = Backup; |
2624 | break; |
2625 | } |
2626 | N = Handle.getValue(); |
2627 | // Test if the index field is free for use. |
2628 | if (AM.IndexReg.getNode() || AM.isRIPRelative()) { |
2629 | AM = Backup; |
2630 | break; |
2631 | } |
2632 | |
2633 | int Cost = 0; |
2634 | SDValue RHS = N.getOperand(i: 1); |
2635 | // If the RHS involves a register with multiple uses, this |
2636 | // transformation incurs an extra mov, due to the neg instruction |
2637 | // clobbering its operand. |
2638 | if (!RHS.getNode()->hasOneUse() || |
2639 | RHS.getNode()->getOpcode() == ISD::CopyFromReg || |
2640 | RHS.getNode()->getOpcode() == ISD::TRUNCATE || |
2641 | RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || |
2642 | (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && |
2643 | RHS.getOperand(i: 0).getValueType() == MVT::i32)) |
2644 | ++Cost; |
2645 | // If the base is a register with multiple uses, this |
2646 | // transformation may save a mov. |
2647 | if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && |
2648 | !AM.Base_Reg.getNode()->hasOneUse()) || |
2649 | AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
2650 | --Cost; |
2651 | // If the folded LHS was interesting, this transformation saves |
2652 | // address arithmetic. |
2653 | if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + |
2654 | ((AM.Disp != 0) && (Backup.Disp == 0)) + |
2655 | (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) |
2656 | --Cost; |
2657 | // If it doesn't look like it may be an overall win, don't do it. |
2658 | if (Cost >= 0) { |
2659 | AM = Backup; |
2660 | break; |
2661 | } |
2662 | |
2663 | // Ok, the transformation is legal and appears profitable. Go for it. |
2664 | // Negation will be emitted later to avoid creating dangling nodes if this |
2665 | // was an unprofitable LEA. |
2666 | AM.IndexReg = RHS; |
2667 | AM.NegateIndex = true; |
2668 | AM.Scale = 1; |
2669 | return false; |
2670 | } |
2671 | |
2672 | case ISD::OR: |
2673 | case ISD::XOR: |
2674 | // See if we can treat the OR/XOR node as an ADD node. |
2675 | if (!CurDAG->isADDLike(Op: N)) |
2676 | break; |
2677 | [[fallthrough]]; |
2678 | case ISD::ADD: |
2679 | if (!matchAdd(N, AM, Depth)) |
2680 | return false; |
2681 | break; |
2682 | |
2683 | case ISD::AND: { |
2684 | // Perform some heroic transforms on an and of a constant-count shift |
2685 | // with a constant to enable use of the scaled offset field. |
2686 | |
2687 | // Scale must not be used already. |
2688 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2689 | |
2690 | // We only handle up to 64-bit values here as those are what matter for |
2691 | // addressing mode optimizations. |
2692 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2693 | "Unexpected value size!" ); |
2694 | |
2695 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2696 | break; |
2697 | |
2698 | if (N.getOperand(i: 0).getOpcode() == ISD::SRL) { |
2699 | SDValue Shift = N.getOperand(i: 0); |
2700 | SDValue X = Shift.getOperand(i: 0); |
2701 | |
2702 | uint64_t Mask = N.getConstantOperandVal(i: 1); |
2703 | |
2704 | // Try to fold the mask and shift into an extract and scale. |
2705 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2706 | return false; |
2707 | |
2708 | // Try to fold the mask and shift directly into the scale. |
2709 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2710 | return false; |
2711 | |
2712 | // Try to fold the mask and shift into BEXTR and scale. |
2713 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget)) |
2714 | return false; |
2715 | } |
2716 | |
2717 | // Try to swap the mask and shift to place shifts which can be done as |
2718 | // a scale on the outside of the mask. |
2719 | if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM)) |
2720 | return false; |
2721 | |
2722 | break; |
2723 | } |
2724 | case ISD::ZERO_EXTEND: { |
2725 | // Try to widen a zexted shift left to the same size as its use, so we can |
2726 | // match the shift as a scale factor. |
2727 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2728 | break; |
2729 | |
2730 | SDValue Src = N.getOperand(i: 0); |
2731 | |
2732 | // See if we can match a zext(addlike(x,c)). |
2733 | // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively. |
2734 | if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR) |
2735 | if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1)) |
2736 | if (Index != N) { |
2737 | AM.IndexReg = Index; |
2738 | return false; |
2739 | } |
2740 | |
2741 | // Peek through mask: zext(and(shl(x,c1),c2)) |
2742 | APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits()); |
2743 | if (Src.getOpcode() == ISD::AND && Src.hasOneUse()) |
2744 | if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) { |
2745 | Mask = MaskC->getAPIntValue(); |
2746 | Src = Src.getOperand(i: 0); |
2747 | } |
2748 | |
2749 | if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) { |
2750 | // Give up if the shift is not a valid scale factor [1,2,3]. |
2751 | SDValue ShlSrc = Src.getOperand(i: 0); |
2752 | SDValue ShlAmt = Src.getOperand(i: 1); |
2753 | auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt); |
2754 | if (!ShAmtC) |
2755 | break; |
2756 | unsigned ShAmtV = ShAmtC->getZExtValue(); |
2757 | if (ShAmtV > 3) |
2758 | break; |
2759 | |
2760 | // The narrow shift must only shift out zero bits (it must be 'nuw'). |
2761 | // That makes it safe to widen to the destination type. |
2762 | APInt HighZeros = |
2763 | APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV); |
2764 | if (!Src->getFlags().hasNoUnsignedWrap() && |
2765 | !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask)) |
2766 | break; |
2767 | |
2768 | // zext (shl nuw i8 %x, C1) to i32 |
2769 | // --> shl (zext i8 %x to i32), (zext C1) |
2770 | // zext (and (shl nuw i8 %x, C1), C2) to i32 |
2771 | // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1) |
2772 | MVT SrcVT = ShlSrc.getSimpleValueType(); |
2773 | MVT VT = N.getSimpleValueType(); |
2774 | SDLoc DL(N); |
2775 | |
2776 | SDValue Res = ShlSrc; |
2777 | if (!Mask.isAllOnes()) { |
2778 | Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT); |
2779 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2780 | Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res); |
2781 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2782 | } |
2783 | SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res); |
2784 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext); |
2785 | SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt); |
2786 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl); |
2787 | CurDAG->ReplaceAllUsesWith(From: N, To: NewShl); |
2788 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2789 | |
2790 | // Convert the shift to scale factor. |
2791 | AM.Scale = 1 << ShAmtV; |
2792 |       // Call matchIndexRecursively here; otherwise Zext may be replaced by
2793 |       // other nodes but still be used later when calling a builder method,
2794 |       // leaving a stale reference.
2795 | AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1); |
2796 | return false; |
2797 | } |
2798 | |
2799 | if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) { |
2800 | // Try to fold the mask and shift into an extract and scale. |
2801 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2802 | X: Src.getOperand(i: 0), AM)) |
2803 | return false; |
2804 | |
2805 | // Try to fold the mask and shift directly into the scale. |
2806 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2807 | X: Src.getOperand(i: 0), AM)) |
2808 | return false; |
2809 | |
2810 | // Try to fold the mask and shift into BEXTR and scale. |
2811 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2812 | X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget)) |
2813 | return false; |
2814 | } |
2815 | |
2816 | break; |
2817 | } |
2818 | } |
2819 | |
2820 | return matchAddressBase(N, AM); |
2821 | } |
2822 | |
2823 | /// Helper for MatchAddress. Add the specified node to the |
2824 | /// specified addressing mode without any further recursion. |
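     | /// For example, if the base register is already occupied but the index is
     | /// free, N becomes the index with scale 1; if both base and index are
     | /// already taken, this returns true (failure to match).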
2825 | bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { |
2826 | // Is the base register already occupied? |
2827 | if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { |
2828 | // If so, check to see if the scale index register is set. |
2829 | if (!AM.IndexReg.getNode()) { |
2830 | AM.IndexReg = N; |
2831 | AM.Scale = 1; |
2832 | return false; |
2833 | } |
2834 | |
2835 | // Otherwise, we cannot select it. |
2836 | return true; |
2837 | } |
2838 | |
2839 | // Default, generate it as a register. |
2840 | AM.BaseType = X86ISelAddressMode::RegBase; |
2841 | AM.Base_Reg = N; |
2842 | return false; |
2843 | } |
2844 | |
2845 | bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N, |
2846 | X86ISelAddressMode &AM, |
2847 | unsigned Depth) { |
2848 | SDLoc dl(N); |
2849 | LLVM_DEBUG({ |
2850 | dbgs() << "MatchVectorAddress: " ; |
2851 | AM.dump(CurDAG); |
2852 | }); |
2853 | // Limit recursion. |
2854 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2855 | return matchAddressBase(N, AM); |
2856 | |
2857 | // TODO: Support other operations. |
2858 | switch (N.getOpcode()) { |
2859 | case ISD::Constant: { |
2860 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2861 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2862 | return false; |
2863 | break; |
2864 | } |
2865 | case X86ISD::Wrapper: |
2866 | if (!matchWrapper(N, AM)) |
2867 | return false; |
2868 | break; |
2869 | case ISD::ADD: { |
2870 | // Add an artificial use to this node so that we can keep track of |
2871 | // it if it gets CSE'd with a different node. |
2872 | HandleSDNode Handle(N); |
2873 | |
2874 | X86ISelAddressMode Backup = AM; |
2875 | if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) && |
2876 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2877 | Depth: Depth + 1)) |
2878 | return false; |
2879 | AM = Backup; |
2880 | |
2881 | // Try again after commuting the operands. |
2882 | if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2883 | Depth: Depth + 1) && |
2884 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, |
2885 | Depth: Depth + 1)) |
2886 | return false; |
2887 | AM = Backup; |
2888 | |
2889 | N = Handle.getValue(); |
2890 | break; |
2891 | } |
2892 | } |
2893 | |
2894 | return matchAddressBase(N, AM); |
2895 | } |
2896 | |
2897 | /// Helper for selectVectorAddr. Handles things that can be folded into a |
2898 | /// gather/scatter address. The index register and scale should have already |
2899 | /// been handled. |
2900 | bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { |
2901 | return matchVectorAddressRecursively(N, AM, Depth: 0); |
2902 | } |
2903 | |
2904 | bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, |
2905 | SDValue IndexOp, SDValue ScaleOp, |
2906 | SDValue &Base, SDValue &Scale, |
2907 | SDValue &Index, SDValue &Disp, |
2908 | SDValue &Segment) { |
2909 | X86ISelAddressMode AM; |
2910 | AM.Scale = ScaleOp->getAsZExtVal(); |
2911 | |
2912 | // Attempt to match index patterns, as long as we're not relying on implicit |
2913 | // sign-extension, which is performed BEFORE scale. |
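     |   // For example, a gather with a v4i32 index and a 64-bit base pointer keeps
     |   // IndexOp untouched here: the hardware sign-extends each 32-bit element
     |   // before applying the scale, and rewriting the index could change that.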
2914 | if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits()) |
2915 | AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0); |
2916 | else |
2917 | AM.IndexReg = IndexOp; |
2918 | |
2919 | unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); |
2920 | if (AddrSpace == X86AS::GS) |
2921 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
2922 | if (AddrSpace == X86AS::FS) |
2923 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
2924 | if (AddrSpace == X86AS::SS) |
2925 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
2926 | |
2927 | SDLoc DL(BasePtr); |
2928 | MVT VT = BasePtr.getSimpleValueType(); |
2929 | |
2930 | // Try to match into the base and displacement fields. |
2931 | if (matchVectorAddress(N: BasePtr, AM)) |
2932 | return false; |
2933 | |
2934 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
2935 | return true; |
2936 | } |
2937 | |
2938 | /// Returns true if it is able to pattern match an addressing mode. |
2939 | /// It returns the operands which make up the maximal addressing mode it can |
2940 | /// match by reference. |
2941 | /// |
2942 | /// Parent is the parent node of the addr operand that is being matched. It |
2943 | /// is always a load, store, atomic node, or null. It is only null when |
2944 | /// checking memory operands for inline asm nodes. |
2945 | bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
2946 | SDValue &Scale, SDValue &Index, |
2947 | SDValue &Disp, SDValue &Segment) { |
2948 | X86ISelAddressMode AM; |
2949 | |
2950 | if (Parent && |
2951 | // This list of opcodes are all the nodes that have an "addr:$ptr" operand |
2952 | // that are not a MemSDNode, and thus don't have proper addrspace info. |
2953 | Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme |
2954 | Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores |
2955 | Parent->getOpcode() != X86ISD::TLSCALL && // Fixme |
2956 | Parent->getOpcode() != X86ISD::ENQCMD && // Fixme |
2957 | Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme |
2958 | Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp |
2959 | Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp |
2960 | unsigned AddrSpace = |
2961 | cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace(); |
2962 | if (AddrSpace == X86AS::GS) |
2963 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
2964 | if (AddrSpace == X86AS::FS) |
2965 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
2966 | if (AddrSpace == X86AS::SS) |
2967 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
2968 | } |
2969 | |
2970 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
2971 | SDLoc DL(N); |
2972 | MVT VT = N.getSimpleValueType(); |
2973 | |
2974 | if (matchAddress(N, AM)) |
2975 | return false; |
2976 | |
2977 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
2978 | return true; |
2979 | } |
2980 | |
2981 | bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { |
2982 | // Cannot use 32 bit constants to reference objects in kernel/large code |
2983 | // model. |
2984 | if (TM.getCodeModel() == CodeModel::Kernel || |
2985 | TM.getCodeModel() == CodeModel::Large) |
2986 | return false; |
2987 | |
2988 |   // In static codegen with small code model, we can get the address of a
2989 |   // label into a register with 'movl'.
2990 | if (N->getOpcode() != X86ISD::Wrapper) |
2991 | return false; |
2992 | |
2993 | N = N.getOperand(i: 0); |
2994 | |
2995 | // At least GNU as does not accept 'movl' for TPOFF relocations. |
2996 | // FIXME: We could use 'movl' when we know we are targeting MC. |
2997 | if (N->getOpcode() == ISD::TargetGlobalTLSAddress) |
2998 | return false; |
2999 | |
3000 | Imm = N; |
3001 | // Small/medium code model can reference non-TargetGlobalAddress objects with |
3002 | // 32 bit constants. |
3003 | if (N->getOpcode() != ISD::TargetGlobalAddress) { |
3004 | return TM.getCodeModel() == CodeModel::Small || |
3005 | TM.getCodeModel() == CodeModel::Medium; |
3006 | } |
3007 | |
3008 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal(); |
3009 | if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) |
3010 | return CR->getUnsignedMax().ult(RHS: 1ull << 32); |
3011 | |
3012 | return !TM.isLargeGlobalValue(GV); |
3013 | } |
3014 | |
3015 | bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, |
3016 | SDValue &Scale, SDValue &Index, |
3017 | SDValue &Disp, SDValue &Segment) { |
3018 | // Save the debug loc before calling selectLEAAddr, in case it invalidates N. |
3019 | SDLoc DL(N); |
3020 | |
3021 | if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) |
3022 | return false; |
3023 | |
3024 | auto *RN = dyn_cast<RegisterSDNode>(Val&: Base); |
3025 | if (RN && RN->getReg() == 0) |
3026 | Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3027 | else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Val: Base)) { |
3028 | // Base could already be %rip, particularly in the x32 ABI. |
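     |     // Roughly, the widening below emits
     |     //   %t = IMPLICIT_DEF (i64); %base64 = INSERT_SUBREG %t, %base, sub_32bit
     |     // so the 64-bit LEA can consume the value; the high bits stay undefined,
     |     // which is fine because LEA64_32 only keeps the low 32 bits of the result.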
3029 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3030 | VT: MVT::i64), 0); |
3031 | Base = CurDAG->getTargetInsertSubreg(SRIdx: X86::sub_32bit, DL, VT: MVT::i64, Operand: ImplDef, |
3032 | Subreg: Base); |
3033 | } |
3034 | |
3035 | RN = dyn_cast<RegisterSDNode>(Val&: Index); |
3036 | if (RN && RN->getReg() == 0) |
3037 | Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3038 | else { |
3039 | assert(Index.getValueType() == MVT::i32 && |
3040 | "Expect to be extending 32-bit registers for use in LEA" ); |
3041 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3042 | VT: MVT::i64), 0); |
3043 | Index = CurDAG->getTargetInsertSubreg(SRIdx: X86::sub_32bit, DL, VT: MVT::i64, Operand: ImplDef, |
3044 | Subreg: Index); |
3045 | } |
3046 | |
3047 | return true; |
3048 | } |
3049 | |
3050 | /// Calls SelectAddr and determines if the maximal addressing |
3051 | /// mode it matches can be cost effectively emitted as an LEA instruction. |
3052 | bool X86DAGToDAGISel::selectLEAAddr(SDValue N, |
3053 | SDValue &Base, SDValue &Scale, |
3054 | SDValue &Index, SDValue &Disp, |
3055 | SDValue &Segment) { |
3056 | X86ISelAddressMode AM; |
3057 | |
3058 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
3059 | SDLoc DL(N); |
3060 | MVT VT = N.getSimpleValueType(); |
3061 | |
3062 | // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support |
3063 | // segments. |
3064 | SDValue Copy = AM.Segment; |
3065 | SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32); |
3066 | AM.Segment = T; |
3067 | if (matchAddress(N, AM)) |
3068 | return false; |
3069 | assert (T == AM.Segment); |
3070 | AM.Segment = Copy; |
3071 | |
3072 | unsigned Complexity = 0; |
3073 | if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) |
3074 | Complexity = 1; |
3075 | else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
3076 | Complexity = 4; |
3077 | |
3078 | if (AM.IndexReg.getNode()) |
3079 | Complexity++; |
3080 | |
3081 |   // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or
3082 |   // to use a simple shift.
3083 | if (AM.Scale > 1) |
3084 | Complexity++; |
3085 | |
3086 | // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA |
3087 | // to a LEA. This is determined with some experimentation but is by no means |
3088 | // optimal (especially for code size consideration). LEA is nice because of |
3089 | // its three-address nature. Tweak the cost function again when we can run |
3090 | // convertToThreeAddress() at register allocation time. |
3091 | if (AM.hasSymbolicDisplacement()) { |
3092 | // For X86-64, always use LEA to materialize RIP-relative addresses. |
3093 | if (Subtarget->is64Bit()) |
3094 | Complexity = 4; |
3095 | else |
3096 | Complexity += 2; |
3097 | } |
3098 | |
3099 | // Heuristic: try harder to form an LEA from ADD if the operands set flags. |
3100 | // Unlike ADD, LEA does not affect flags, so we will be less likely to require |
3101 | // duplicating flag-producing instructions later in the pipeline. |
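     |   // For example, if one operand of this add is an X86ISD::SUB whose EFLAGS
     |   // result still feeds a branch, selecting an LEA here keeps those flags
     |   // live across this node, whereas an ADD would clobber them and could force
     |   // the flag-producing instruction to be duplicated.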
3102 | if (N.getOpcode() == ISD::ADD) { |
3103 | auto isMathWithFlags = [](SDValue V) { |
3104 | switch (V.getOpcode()) { |
3105 | case X86ISD::ADD: |
3106 | case X86ISD::SUB: |
3107 | case X86ISD::ADC: |
3108 | case X86ISD::SBB: |
3109 | case X86ISD::SMUL: |
3110 | case X86ISD::UMUL: |
3111 | /* TODO: These opcodes can be added safely, but we may want to justify |
3112 | their inclusion for different reasons (better for reg-alloc). |
3113 | case X86ISD::OR: |
3114 | case X86ISD::XOR: |
3115 | case X86ISD::AND: |
3116 | */ |
3117 | // Value 1 is the flag output of the node - verify it's not dead. |
3118 | return !SDValue(V.getNode(), 1).use_empty(); |
3119 | default: |
3120 | return false; |
3121 | } |
3122 | }; |
3123 | // TODO: We might want to factor in whether there's a load folding |
3124 | // opportunity for the math op that disappears with LEA. |
3125 | if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1))) |
3126 | Complexity++; |
3127 | } |
3128 | |
3129 | if (AM.Disp) |
3130 | Complexity++; |
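     |   // For example, an address like 4(%rdi,%rsi,2) scores base + index +
     |   // scale>1 + disp = 4 and is kept, while a bare (,%rsi,2) scores only 2 and
     |   // is rejected below in favor of an add or shift.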
3131 | |
3132 | // If it isn't worth using an LEA, reject it. |
3133 | if (Complexity <= 2) |
3134 | return false; |
3135 | |
3136 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
3137 | return true; |
3138 | } |
3139 | |
3140 | /// This is only run on TargetGlobalTLSAddress nodes. |
3141 | bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, |
3142 | SDValue &Scale, SDValue &Index, |
3143 | SDValue &Disp, SDValue &Segment) { |
3144 | assert(N.getOpcode() == ISD::TargetGlobalTLSAddress || |
3145 | N.getOpcode() == ISD::TargetExternalSymbol); |
3146 | |
3147 | X86ISelAddressMode AM; |
3148 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) { |
3149 | AM.GV = GA->getGlobal(); |
3150 | AM.Disp += GA->getOffset(); |
3151 | AM.SymbolFlags = GA->getTargetFlags(); |
3152 | } else { |
3153 | auto *SA = cast<ExternalSymbolSDNode>(Val&: N); |
3154 | AM.ES = SA->getSymbol(); |
3155 | AM.SymbolFlags = SA->getTargetFlags(); |
3156 | } |
3157 | |
3158 | if (Subtarget->is32Bit()) { |
3159 | AM.Scale = 1; |
3160 | AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32); |
3161 | } |
3162 | |
3163 | MVT VT = N.getSimpleValueType(); |
3164 | getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment); |
3165 | return true; |
3166 | } |
3167 | |
3168 | bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { |
3169 | // Keep track of the original value type and whether this value was |
3170 | // truncated. If we see a truncation from pointer type to VT that truncates |
3171 | // bits that are known to be zero, we can use a narrow reference. |
3172 | EVT VT = N.getValueType(); |
3173 | bool WasTruncated = false; |
3174 | if (N.getOpcode() == ISD::TRUNCATE) { |
3175 | WasTruncated = true; |
3176 | N = N.getOperand(i: 0); |
3177 | } |
3178 | |
3179 | if (N.getOpcode() != X86ISD::Wrapper) |
3180 | return false; |
3181 | |
3182 | // We can only use non-GlobalValues as immediates if they were not truncated, |
3183 | // as we do not have any range information. If we have a GlobalValue and the |
3184 | // address was not truncated, we can select it as an operand directly. |
3185 | unsigned Opc = N.getOperand(i: 0)->getOpcode(); |
3186 | if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { |
3187 | Op = N.getOperand(i: 0); |
3188 | // We can only select the operand directly if we didn't have to look past a |
3189 | // truncate. |
3190 | return !WasTruncated; |
3191 | } |
3192 | |
3193 | // Check that the global's range fits into VT. |
3194 | auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0)); |
3195 | std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); |
3196 | if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits())) |
3197 | return false; |
3198 | |
3199 | // Okay, we can use a narrow reference. |
3200 | Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT, |
3201 | offset: GA->getOffset(), TargetFlags: GA->getTargetFlags()); |
3202 | return true; |
3203 | } |
3204 | |
3205 | bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
3206 | SDValue &Base, SDValue &Scale, |
3207 | SDValue &Index, SDValue &Disp, |
3208 | SDValue &Segment) { |
3209 | assert(Root && P && "Unknown root/parent nodes" ); |
3210 | if (!ISD::isNON_EXTLoad(N: N.getNode()) || |
3211 | !IsProfitableToFold(N, U: P, Root) || |
3212 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3213 | return false; |
3214 | |
3215 | return selectAddr(Parent: N.getNode(), |
3216 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3217 | } |
3218 | |
3219 | bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
3220 | SDValue &Base, SDValue &Scale, |
3221 | SDValue &Index, SDValue &Disp, |
3222 | SDValue &Segment) { |
3223 | assert(Root && P && "Unknown root/parent nodes" ); |
3224 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || |
3225 | !IsProfitableToFold(N, U: P, Root) || |
3226 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3227 | return false; |
3228 | |
3229 | return selectAddr(Parent: N.getNode(), |
3230 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3231 | } |
3232 | |
3233 | /// Return an SDNode that returns the value of the global base register. |
3234 | /// Output instructions required to initialize the global base register, |
3235 | /// if necessary. |
3236 | SDNode *X86DAGToDAGISel::getGlobalBaseReg() { |
3237 | unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); |
3238 | auto &DL = MF->getDataLayout(); |
3239 | return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode(); |
3240 | } |
3241 | |
3242 | bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { |
3243 | if (N->getOpcode() == ISD::TRUNCATE) |
3244 | N = N->getOperand(Num: 0).getNode(); |
3245 | if (N->getOpcode() != X86ISD::Wrapper) |
3246 | return false; |
3247 | |
3248 | auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0)); |
3249 | if (!GA) |
3250 | return false; |
3251 | |
3252 | auto *GV = GA->getGlobal(); |
3253 | std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange(); |
3254 | if (CR) |
3255 | return CR->getSignedMin().sge(RHS: -1ull << Width) && |
3256 | CR->getSignedMax().slt(RHS: 1ull << Width); |
3257 | // In the kernel code model, globals are in the negative 2GB of the address |
3258 | // space, so globals can be a sign extended 32-bit immediate. |
3259 | // In other code models, small globals are in the low 2GB of the address |
3260 | // space, so sign extending them is equivalent to zero extending them. |
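     |   // For example, with Width == 32, a small-code-model global known to live
     |   // in the low 2GB yields the same value whether sign- or zero-extended
     |   // from 32 bits, so it is accepted here.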
3261 | return Width == 32 && !TM.isLargeGlobalValue(GV); |
3262 | } |
3263 | |
3264 | X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { |
3265 | assert(N->isMachineOpcode() && "Unexpected node" ); |
3266 | unsigned Opc = N->getMachineOpcode(); |
3267 | const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc); |
3268 | int CondNo = X86::getCondSrcNoFromDesc(MCID); |
3269 | if (CondNo < 0) |
3270 | return X86::COND_INVALID; |
3271 | |
3272 | return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo)); |
3273 | } |
3274 | |
3275 | /// Test whether the given X86ISD::CMP node has any users that use a flag |
3276 | /// other than ZF. |
3277 | bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { |
3278 | // Examine each user of the node. |
3279 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3280 | UI != UE; ++UI) { |
3281 | // Only check things that use the flags. |
3282 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3283 | continue; |
3284 | // Only examine CopyToReg uses that copy to EFLAGS. |
3285 | if (UI->getOpcode() != ISD::CopyToReg || |
3286 | cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3287 | return false; |
3288 | // Examine each user of the CopyToReg use. |
3289 | for (SDNode::use_iterator FlagUI = UI->use_begin(), |
3290 | FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { |
3291 | // Only examine the Flag result. |
3292 | if (FlagUI.getUse().getResNo() != 1) continue; |
3293 | // Anything unusual: assume conservatively. |
3294 | if (!FlagUI->isMachineOpcode()) return false; |
3295 | // Examine the condition code of the user. |
3296 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3297 | |
3298 | switch (CC) { |
3299 | // Comparisons which only use the zero flag. |
3300 | case X86::COND_E: case X86::COND_NE: |
3301 | continue; |
3302 | // Anything else: assume conservatively. |
3303 | default: |
3304 | return false; |
3305 | } |
3306 | } |
3307 | } |
3308 | return true; |
3309 | } |
3310 | |
3311 | /// Test whether the given X86ISD::CMP node has any uses which require the SF |
3312 | /// flag to be accurate. |
3313 | bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { |
3314 | // Examine each user of the node. |
3315 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3316 | UI != UE; ++UI) { |
3317 | // Only check things that use the flags. |
3318 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3319 | continue; |
3320 | // Only examine CopyToReg uses that copy to EFLAGS. |
3321 | if (UI->getOpcode() != ISD::CopyToReg || |
3322 | cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3323 | return false; |
3324 | // Examine each user of the CopyToReg use. |
3325 | for (SDNode::use_iterator FlagUI = UI->use_begin(), |
3326 | FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { |
3327 | // Only examine the Flag result. |
3328 | if (FlagUI.getUse().getResNo() != 1) continue; |
3329 | // Anything unusual: assume conservatively. |
3330 | if (!FlagUI->isMachineOpcode()) return false; |
3331 | // Examine the condition code of the user. |
3332 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3333 | |
3334 | switch (CC) { |
3335 | // Comparisons which don't examine the SF flag. |
3336 | case X86::COND_A: case X86::COND_AE: |
3337 | case X86::COND_B: case X86::COND_BE: |
3338 | case X86::COND_E: case X86::COND_NE: |
3339 | case X86::COND_O: case X86::COND_NO: |
3340 | case X86::COND_P: case X86::COND_NP: |
3341 | continue; |
3342 | // Anything else: assume conservatively. |
3343 | default: |
3344 | return false; |
3345 | } |
3346 | } |
3347 | } |
3348 | return true; |
3349 | } |
3350 | |
3351 | static bool mayUseCarryFlag(X86::CondCode CC) { |
3352 | switch (CC) { |
3353 | // Comparisons which don't examine the CF flag. |
3354 | case X86::COND_O: case X86::COND_NO: |
3355 | case X86::COND_E: case X86::COND_NE: |
3356 | case X86::COND_S: case X86::COND_NS: |
3357 | case X86::COND_P: case X86::COND_NP: |
3358 | case X86::COND_L: case X86::COND_GE: |
3359 | case X86::COND_G: case X86::COND_LE: |
3360 | return false; |
3361 | // Anything else: assume conservatively. |
3362 | default: |
3363 | return true; |
3364 | } |
3365 | } |
3366 | |
3367 | /// Test whether the given node which sets flags has any uses which require the |
3368 | /// CF flag to be accurate. |
3369 | bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { |
3370 | // Examine each user of the node. |
3371 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3372 | UI != UE; ++UI) { |
3373 | // Only check things that use the flags. |
3374 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3375 | continue; |
3376 | |
3377 | unsigned UIOpc = UI->getOpcode(); |
3378 | |
3379 | if (UIOpc == ISD::CopyToReg) { |
3380 | // Only examine CopyToReg uses that copy to EFLAGS. |
3381 | if (cast<RegisterSDNode>(Val: UI->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3382 | return false; |
3383 | // Examine each user of the CopyToReg use. |
3384 | for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); |
3385 | FlagUI != FlagUE; ++FlagUI) { |
3386 | // Only examine the Flag result. |
3387 | if (FlagUI.getUse().getResNo() != 1) |
3388 | continue; |
3389 | // Anything unusual: assume conservatively. |
3390 | if (!FlagUI->isMachineOpcode()) |
3391 | return false; |
3392 | // Examine the condition code of the user. |
3393 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3394 | |
3395 | if (mayUseCarryFlag(CC)) |
3396 | return false; |
3397 | } |
3398 | |
3399 | // This CopyToReg is ok. Move on to the next user. |
3400 | continue; |
3401 | } |
3402 | |
3403 | // This might be an unselected node. So look for the pre-isel opcodes that |
3404 | // use flags. |
3405 | unsigned CCOpNo; |
3406 | switch (UIOpc) { |
3407 | default: |
3408 | // Something unusual. Be conservative. |
3409 | return false; |
3410 | case X86ISD::SETCC: CCOpNo = 0; break; |
3411 | case X86ISD::SETCC_CARRY: CCOpNo = 0; break; |
3412 | case X86ISD::CMOV: CCOpNo = 2; break; |
3413 | case X86ISD::BRCOND: CCOpNo = 2; break; |
3414 | } |
3415 | |
3416 | X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(Num: CCOpNo); |
3417 | if (mayUseCarryFlag(CC)) |
3418 | return false; |
3419 | } |
3420 | return true; |
3421 | } |
3422 | |
3423 | /// Check whether or not the chain ending in StoreNode is suitable for doing |
3424 | /// the {load; op; store} to modify transformation. |
3425 | static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, |
3426 | SDValue StoredVal, SelectionDAG *CurDAG, |
3427 | unsigned LoadOpNo, |
3428 | LoadSDNode *&LoadNode, |
3429 | SDValue &InputChain) { |
3430 | // Is the stored value result 0 of the operation? |
3431 | if (StoredVal.getResNo() != 0) return false; |
3432 | |
3433 | // Are there other uses of the operation other than the store? |
3434 | if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false; |
3435 | |
3436 | // Is the store non-extending and non-indexed? |
3437 | if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal()) |
3438 | return false; |
3439 | |
3440 | SDValue Load = StoredVal->getOperand(Num: LoadOpNo); |
3441 | // Is the stored value a non-extending and non-indexed load? |
3442 | if (!ISD::isNormalLoad(N: Load.getNode())) return false; |
3443 | |
3444 | // Return LoadNode by reference. |
3445 | LoadNode = cast<LoadSDNode>(Val&: Load); |
3446 | |
3447 | // Is store the only read of the loaded value? |
3448 | if (!Load.hasOneUse()) |
3449 | return false; |
3450 | |
3451 | // Is the address of the store the same as the load? |
3452 | if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || |
3453 | LoadNode->getOffset() != StoreNode->getOffset()) |
3454 | return false; |
3455 | |
3456 | bool FoundLoad = false; |
3457 | SmallVector<SDValue, 4> ChainOps; |
3458 | SmallVector<const SDNode *, 4> LoopWorklist; |
3459 | SmallPtrSet<const SDNode *, 16> Visited; |
3460 | const unsigned int Max = 1024; |
3461 | |
3462 | // Visualization of Load-Op-Store fusion: |
3463 | // ------------------------- |
3464 | // Legend: |
3465 | // *-lines = Chain operand dependencies. |
3466 | // |-lines = Normal operand dependencies. |
3467 | // Dependencies flow down and right. n-suffix references multiple nodes. |
3468 | // |
3469 | // C Xn C |
3470 | // * * * |
3471 | // * * * |
3472 | // Xn A-LD Yn TF Yn |
3473 | // * * \ | * | |
3474 | // * * \ | * | |
3475 | // * * \ | => A--LD_OP_ST |
3476 | // * * \| \ |
3477 | // TF OP \ |
3478 | // * | \ Zn |
3479 | // * | \ |
3480 | // A-ST Zn |
3481 | // |
3482 | |
3483 | // This merge induced dependences from: #1: Xn -> LD, OP, Zn |
3484 | // #2: Yn -> LD |
3485 | // #3: ST -> Zn |
3486 | |
3487 | // Ensure the transform is safe by checking for the dual |
3488 | // dependencies to make sure we do not induce a loop. |
3489 | |
3490 | // As LD is a predecessor to both OP and ST we can do this by checking: |
3491 | // a). if LD is a predecessor to a member of Xn or Yn. |
3492 | // b). if a Zn is a predecessor to ST. |
3493 | |
3494 | // However, (b) can only occur through being a chain predecessor to |
3495 | // ST, which is the same as Zn being a member or predecessor of Xn, |
3496 | // which is a subset of LD being a predecessor of Xn. So it's |
3497 | // subsumed by check (a). |
3498 | |
3499 | SDValue Chain = StoreNode->getChain(); |
3500 | |
3501 | // Gather X elements in ChainOps. |
3502 | if (Chain == Load.getValue(R: 1)) { |
3503 | FoundLoad = true; |
3504 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3505 | } else if (Chain.getOpcode() == ISD::TokenFactor) { |
3506 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { |
3507 | SDValue Op = Chain.getOperand(i); |
3508 | if (Op == Load.getValue(R: 1)) { |
3509 | FoundLoad = true; |
3510 | // Drop Load, but keep its chain. No cycle check necessary. |
3511 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3512 | continue; |
3513 | } |
3514 | LoopWorklist.push_back(Elt: Op.getNode()); |
3515 | ChainOps.push_back(Elt: Op); |
3516 | } |
3517 | } |
3518 | |
3519 | if (!FoundLoad) |
3520 | return false; |
3521 | |
3522 | // Worklist is currently Xn. Add Yn to worklist. |
3523 | for (SDValue Op : StoredVal->ops()) |
3524 | if (Op.getNode() != LoadNode) |
3525 | LoopWorklist.push_back(Elt: Op.getNode()); |
3526 | |
3527 | // Check (a) if Load is a predecessor to Xn + Yn |
3528 | if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max, |
3529 | TopologicalPrune: true)) |
3530 | return false; |
3531 | |
3532 | InputChain = |
3533 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps); |
3534 | return true; |
3535 | } |
3536 | |
3537 | // Change a chain of {load; op; store} of the same value into a simple op |
3538 | // through memory of that value, if the uses of the modified value and its |
3539 | // address are suitable. |
3540 | // |
3541 | // The tablegen memory-operand pattern is currently not able to match the case
3542 | // where the EFLAGS produced by the original operation are used.
3543 | // |
3544 | // To move this to tablegen, we'll need to improve tablegen to allow flags to |
3545 | // be transferred from a node in the pattern to the result node, probably with |
3546 | // a new keyword. For example, we have this |
3547 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3548 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst), |
3549 | // (implicit EFLAGS)]>; |
3550 | // but maybe need something like this |
3551 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3552 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst), |
3553 | // (transferrable EFLAGS)]>; |
3554 | // |
3555 | // Until then, we manually fold these and instruction select the operation |
3556 | // here. |
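     | //
     | // For example, an i32 chain of (store (X86ISD::ADD (load [p]), 5), [p]) is
     | // selected below as a single ADD32mi to [p], provided the chain and use
     | // checks in isFusableLoadOpStorePattern succeed.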
3557 | bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { |
3558 | auto *StoreNode = cast<StoreSDNode>(Val: Node); |
3559 | SDValue StoredVal = StoreNode->getOperand(Num: 1); |
3560 | unsigned Opc = StoredVal->getOpcode(); |
3561 | |
3562 | // Before we try to select anything, make sure this is memory operand size |
3563 | // and opcode we can handle. Note that this must match the code below that |
3564 | // actually lowers the opcodes. |
3565 | EVT MemVT = StoreNode->getMemoryVT(); |
3566 | if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && |
3567 | MemVT != MVT::i8) |
3568 | return false; |
3569 | |
3570 | bool IsCommutable = false; |
3571 | bool IsNegate = false; |
3572 | switch (Opc) { |
3573 | default: |
3574 | return false; |
3575 | case X86ISD::SUB: |
3576 | IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0)); |
3577 | break; |
3578 | case X86ISD::SBB: |
3579 | break; |
3580 | case X86ISD::ADD: |
3581 | case X86ISD::ADC: |
3582 | case X86ISD::AND: |
3583 | case X86ISD::OR: |
3584 | case X86ISD::XOR: |
3585 | IsCommutable = true; |
3586 | break; |
3587 | } |
3588 | |
3589 | unsigned LoadOpNo = IsNegate ? 1 : 0; |
3590 | LoadSDNode *LoadNode = nullptr; |
3591 | SDValue InputChain; |
3592 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3593 | LoadNode, InputChain)) { |
3594 | if (!IsCommutable) |
3595 | return false; |
3596 | |
3597 | // This operation is commutable, try the other operand. |
3598 | LoadOpNo = 1; |
3599 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3600 | LoadNode, InputChain)) |
3601 | return false; |
3602 | } |
3603 | |
3604 | SDValue Base, Scale, Index, Disp, Segment; |
3605 | if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp, |
3606 | Segment)) |
3607 | return false; |
3608 | |
3609 | auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, |
3610 | unsigned Opc8) { |
3611 | switch (MemVT.getSimpleVT().SimpleTy) { |
3612 | case MVT::i64: |
3613 | return Opc64; |
3614 | case MVT::i32: |
3615 | return Opc32; |
3616 | case MVT::i16: |
3617 | return Opc16; |
3618 | case MVT::i8: |
3619 | return Opc8; |
3620 | default: |
3621 | llvm_unreachable("Invalid size!" ); |
3622 | } |
3623 | }; |
3624 | |
3625 | MachineSDNode *Result; |
3626 | switch (Opc) { |
3627 | case X86ISD::SUB: |
3628 | // Handle negate. |
3629 | if (IsNegate) { |
3630 | unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, |
3631 | X86::NEG8m); |
3632 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3633 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3634 | VT2: MVT::Other, Ops); |
3635 | break; |
3636 | } |
3637 | [[fallthrough]]; |
3638 | case X86ISD::ADD: |
3639 | // Try to match inc/dec. |
3640 | if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { |
3641 | bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1)); |
3642 | bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1)); |
3643 |       // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
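     |       // For example, an i32 ADD of the loaded value and 1 with a dead carry
     |       // flag becomes INC32m; an ADD of -1 (or a SUB of 1) becomes DEC32m.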
3644 | if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3645 | unsigned NewOpc = |
3646 | ((Opc == X86ISD::ADD) == IsOne) |
3647 | ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) |
3648 | : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); |
3649 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3650 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3651 | VT2: MVT::Other, Ops); |
3652 | break; |
3653 | } |
3654 | } |
3655 | [[fallthrough]]; |
3656 | case X86ISD::ADC: |
3657 | case X86ISD::SBB: |
3658 | case X86ISD::AND: |
3659 | case X86ISD::OR: |
3660 | case X86ISD::XOR: { |
3661 | auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { |
3662 | switch (Opc) { |
3663 | case X86ISD::ADD: |
3664 | return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, |
3665 | X86::ADD8mr); |
3666 | case X86ISD::ADC: |
3667 | return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, |
3668 | X86::ADC8mr); |
3669 | case X86ISD::SUB: |
3670 | return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, |
3671 | X86::SUB8mr); |
3672 | case X86ISD::SBB: |
3673 | return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, |
3674 | X86::SBB8mr); |
3675 | case X86ISD::AND: |
3676 | return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, |
3677 | X86::AND8mr); |
3678 | case X86ISD::OR: |
3679 | return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); |
3680 | case X86ISD::XOR: |
3681 | return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, |
3682 | X86::XOR8mr); |
3683 | default: |
3684 | llvm_unreachable("Invalid opcode!" ); |
3685 | } |
3686 | }; |
3687 | auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { |
3688 | switch (Opc) { |
3689 | case X86ISD::ADD: |
3690 | return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, |
3691 | X86::ADD8mi); |
3692 | case X86ISD::ADC: |
3693 | return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, |
3694 | X86::ADC8mi); |
3695 | case X86ISD::SUB: |
3696 | return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, |
3697 | X86::SUB8mi); |
3698 | case X86ISD::SBB: |
3699 | return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, |
3700 | X86::SBB8mi); |
3701 | case X86ISD::AND: |
3702 | return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, |
3703 | X86::AND8mi); |
3704 | case X86ISD::OR: |
3705 | return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, |
3706 | X86::OR8mi); |
3707 | case X86ISD::XOR: |
3708 | return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, |
3709 | X86::XOR8mi); |
3710 | default: |
3711 | llvm_unreachable("Invalid opcode!" ); |
3712 | } |
3713 | }; |
3714 | |
3715 | unsigned NewOpc = SelectRegOpcode(Opc); |
3716 | SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo); |
3717 | |
3718 | // See if the operand is a constant that we can fold into an immediate |
3719 | // operand. |
3720 | if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) { |
3721 | int64_t OperandV = OperandC->getSExtValue(); |
3722 | |
3723 | // Check if we can shrink the operand enough to fit in an immediate (or |
3724 | // fit into a smaller immediate) by negating it and switching the |
3725 | // operation. |
3726 | if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && |
3727 | ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) || |
3728 | (MemVT == MVT::i64 && !isInt<32>(x: OperandV) && |
3729 | isInt<32>(x: -OperandV))) && |
3730 | hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3731 | OperandV = -OperandV; |
3732 | Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; |
3733 | } |
3734 | |
3735 | if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) { |
3736 | Operand = CurDAG->getTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT); |
3737 | NewOpc = SelectImmOpcode(Opc); |
3738 | } |
3739 | } |
3740 | |
3741 | if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { |
3742 | SDValue CopyTo = |
3743 | CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS, |
3744 | N: StoredVal.getOperand(i: 2), Glue: SDValue()); |
3745 | |
3746 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3747 | Segment, Operand, CopyTo, CopyTo.getValue(R: 1)}; |
3748 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3749 | Ops); |
3750 | } else { |
3751 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3752 | Segment, Operand, InputChain}; |
3753 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3754 | Ops); |
3755 | } |
3756 | break; |
3757 | } |
3758 | default: |
3759 | llvm_unreachable("Invalid opcode!" ); |
3760 | } |
3761 | |
3762 | MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), |
3763 | LoadNode->getMemOperand()}; |
3764 | CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps); |
3765 | |
3766 | // Update Load Chain uses as well. |
3767 | ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1)); |
3768 | ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1)); |
3769 | ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0)); |
3770 | CurDAG->RemoveDeadNode(N: Node); |
3771 | return true; |
3772 | } |
3773 | |
3774 | // See if this is an X & Mask that we can match to BEXTR/BZHI. |
3775 | // Where Mask is one of the following patterns: |
3776 | // a) x & (1 << nbits) - 1 |
3777 | // b) x & ~(-1 << nbits) |
3778 | // c) x & (-1 >> (32 - y)) |
3779 | // d) x << (32 - y) >> (32 - y) |
3780 | // e) (1 << nbits) - 1 |
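     | //
     | // For example, with BMI2 an i32 (and %x, (add (shl 1, %n), -1)) is lowered
     | // here to an X86ISD::BZHI of %x by %n; with only BMI it becomes an
     | // X86ISD::BEXTR whose control word is (%n << 8).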
3781 | bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3782 | assert( |
3783 | (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND || |
3784 | Node->getOpcode() == ISD::SRL) && |
3785 | "Should be either an and-mask, or right-shift after clearing high bits." ); |
3786 | |
3787 | // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. |
3788 | if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) |
3789 | return false; |
3790 | |
3791 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
3792 | |
3793 | // Only supported for 32 and 64 bits. |
3794 | if (NVT != MVT::i32 && NVT != MVT::i64) |
3795 | return false; |
3796 | |
3797 | SDValue NBits; |
3798 | bool NegateNBits; |
3799 | |
3800 |   // If we have BMI2's BZHI, we are ok with multi-use patterns.
3801 | // Else, if we only have BMI1's BEXTR, we require one-use. |
3802 |   const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3803 | auto checkUses = [AllowExtraUsesByDefault]( |
3804 | SDValue Op, unsigned NUses, |
3805 |                        std::optional<bool> AllowExtraUses) {
3806 | return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) || |
3807 | Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo()); |
3808 | }; |
3809 | auto checkOneUse = [checkUses](SDValue Op, |
3810 |                                  std::optional<bool> AllowExtraUses =
3811 | std::nullopt) { |
3812 | return checkUses(Op, 1, AllowExtraUses); |
3813 | }; |
3814 | auto checkTwoUse = [checkUses](SDValue Op, |
3815 |                                  std::optional<bool> AllowExtraUses =
3816 | std::nullopt) { |
3817 | return checkUses(Op, 2, AllowExtraUses); |
3818 | }; |
3819 | |
3820 | auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { |
3821 | if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { |
3822 | assert(V.getSimpleValueType() == MVT::i32 && |
3823 | V.getOperand(0).getSimpleValueType() == MVT::i64 && |
3824 | "Expected i64 -> i32 truncation" ); |
3825 | V = V.getOperand(i: 0); |
3826 | } |
3827 | return V; |
3828 | }; |
3829 | |
3830 | // a) x & ((1 << nbits) + (-1)) |
3831 | auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, |
3832 | &NegateNBits](SDValue Mask) -> bool { |
3833 | // Match `add`. Must only have one use! |
3834 | if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) |
3835 | return false; |
3836 | // We should be adding all-ones constant (i.e. subtracting one.) |
3837 | if (!isAllOnesConstant(V: Mask->getOperand(Num: 1))) |
3838 | return false; |
3839 | // Match `1 << nbits`. Might be truncated. Must only have one use! |
3840 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3841 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3842 | return false; |
3843 | if (!isOneConstant(V: M0->getOperand(Num: 0))) |
3844 | return false; |
3845 | NBits = M0->getOperand(Num: 1); |
3846 | NegateNBits = false; |
3847 | return true; |
3848 | }; |
3849 | |
3850 | auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { |
3851 | V = peekThroughOneUseTruncation(V); |
3852 | return CurDAG->MaskedValueIsAllOnes( |
3853 | Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(), |
3854 | loBitsSet: NVT.getSizeInBits())); |
3855 | }; |
3856 | |
3857 | // b) x & ~(-1 << nbits) |
3858 | auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, |
3859 | &NBits, &NegateNBits](SDValue Mask) -> bool { |
3860 | // Match `~()`. Must only have one use! |
3861 | if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) |
3862 | return false; |
3863 | // The -1 only has to be all-ones for the final Node's NVT. |
3864 | if (!isAllOnes(Mask->getOperand(Num: 1))) |
3865 | return false; |
3866 | // Match `-1 << nbits`. Might be truncated. Must only have one use! |
3867 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3868 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3869 | return false; |
3870 | // The -1 only has to be all-ones for the final Node's NVT. |
3871 | if (!isAllOnes(M0->getOperand(Num: 0))) |
3872 | return false; |
3873 | NBits = M0->getOperand(Num: 1); |
3874 | NegateNBits = false; |
3875 | return true; |
3876 | }; |
3877 | |
3878 | // Try to match potentially-truncated shift amount as `(bitwidth - y)`, |
3879 | // or leave the shift amount as-is, but then we'll have to negate it. |
3880 | auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, |
3881 | unsigned Bitwidth) { |
3882 | NBits = ShiftAmt; |
3883 | NegateNBits = true; |
3884 | // Skip over a truncate of the shift amount, if any. |
3885 | if (NBits.getOpcode() == ISD::TRUNCATE) |
3886 | NBits = NBits.getOperand(i: 0); |
3887 | // Try to match the shift amount as (bitwidth - y). It should go away, too. |
3888 | // If it doesn't match, that's fine, we'll just negate it ourselves. |
3889 | if (NBits.getOpcode() != ISD::SUB) |
3890 | return; |
3891 | auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0)); |
3892 | if (!V0 || V0->getZExtValue() != Bitwidth) |
3893 | return; |
3894 | NBits = NBits.getOperand(i: 1); |
3895 | NegateNBits = false; |
3896 | }; |
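     |   // For example, on i32 a shift amount of (sub 32, %y) canonicalizes to
     |   // NBits = %y with NegateNBits = false, while a plain %z stays as
     |   // NBits = %z with NegateNBits = true (32 - %z is materialized later).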
3897 | |
3898 | // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth |
3899 | // or |
3900 | // c) x & (-1 >> (32 - y)) |
3901 | auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, |
3902 | canonicalizeShiftAmt](SDValue Mask) -> bool { |
3903 | // The mask itself may be truncated. |
3904 | Mask = peekThroughOneUseTruncation(Mask); |
3905 | unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); |
3906 | // Match `l>>`. Must only have one use! |
3907 | if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) |
3908 | return false; |
3909 |     // We should be shifting a truly all-ones constant.
3910 | if (!isAllOnesConstant(V: Mask.getOperand(i: 0))) |
3911 | return false; |
3912 | SDValue M1 = Mask.getOperand(i: 1); |
3913 | // The shift amount should not be used externally. |
3914 | if (!checkOneUse(M1)) |
3915 | return false; |
3916 | canonicalizeShiftAmt(M1, Bitwidth); |
3917 | // Pattern c. is non-canonical, and is expanded into pattern d. iff there |
3918 | // is no extra use of the mask. Clearly, there was one since we are here. |
3919 | // But at the same time, if we need to negate the shift amount, |
3920 | // then we don't want the mask to stick around, else it's unprofitable. |
3921 | return !NegateNBits; |
3922 | }; |
3923 | |
3924 | SDValue X; |
3925 | |
3926 | // d) x << z >> z but then we'll have to subtract z from bitwidth |
3927 | // or |
3928 | // d) x << (32 - y) >> (32 - y) |
3929 | auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, |
3930 | AllowExtraUsesByDefault, &NegateNBits, |
3931 | &X](SDNode *Node) -> bool { |
3932 | if (Node->getOpcode() != ISD::SRL) |
3933 | return false; |
3934 | SDValue N0 = Node->getOperand(Num: 0); |
3935 | if (N0->getOpcode() != ISD::SHL) |
3936 | return false; |
3937 | unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); |
3938 | SDValue N1 = Node->getOperand(Num: 1); |
3939 | SDValue N01 = N0->getOperand(Num: 1); |
3940 | // Both of the shifts must be by the exact same value. |
3941 | if (N1 != N01) |
3942 | return false; |
3943 | canonicalizeShiftAmt(N1, Bitwidth); |
3944 | // There should not be any external uses of the inner shift / shift amount. |
3945 | // Note that while we are generally okay with external uses given BMI2, |
3946 | // iff we need to negate the shift amount, we are not okay with extra uses. |
3947 |     const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3948 | if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) |
3949 | return false; |
3950 | X = N0->getOperand(Num: 0); |
3951 | return true; |
3952 | }; |
3953 | |
3954 | auto matchLowBitMask = [matchPatternA, matchPatternB, |
3955 | matchPatternC](SDValue Mask) -> bool { |
3956 | return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); |
3957 | }; |
3958 | |
3959 | if (Node->getOpcode() == ISD::AND) { |
3960 | X = Node->getOperand(Num: 0); |
3961 | SDValue Mask = Node->getOperand(Num: 1); |
3962 | |
3963 | if (matchLowBitMask(Mask)) { |
3964 | // Great. |
3965 | } else { |
3966 | std::swap(a&: X, b&: Mask); |
3967 | if (!matchLowBitMask(Mask)) |
3968 | return false; |
3969 | } |
3970 | } else if (matchLowBitMask(SDValue(Node, 0))) { |
3971 | X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT); |
3972 | } else if (!matchPatternD(Node)) |
3973 | return false; |
3974 | |
3975 | // If we need to negate the shift amount, require BMI2 BZHI support. |
3976 | // It's just too unprofitable for BMI1 BEXTR. |
3977 | if (NegateNBits && !Subtarget->hasBMI2()) |
3978 | return false; |
3979 | |
3980 | SDLoc DL(Node); |
3981 | |
3982 | // Truncate the shift amount. |
3983 | NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits); |
3984 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3985 | |
3986 | // Insert 8-bit NBits into lowest 8 bits of 32-bit register. |
3987 | // All the other bits are undefined, we do not care about them. |
3988 | SDValue ImplDef = SDValue( |
3989 | CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0); |
3990 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef); |
3991 | |
3992 | SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32); |
3993 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal); |
3994 | NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL, |
3995 | VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal), |
3996 | 0); |
3997 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3998 | |
3999 | // We might have matched the amount of high bits to be cleared, |
4000 | // but we want the amount of low bits to be kept, so negate it then. |
4001 | if (NegateNBits) { |
4002 | SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32); |
4003 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC); |
4004 | |
4005 | NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits); |
4006 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4007 | } |
4008 | |
4009 | if (Subtarget->hasBMI2()) { |
4010 |     // Great, just emit the BZHI.
4011 | if (NVT != MVT::i32) { |
4012 | // But have to place the bit count into the wide-enough register first. |
4013 | NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits); |
4014 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4015 | } |
4016 | |
4017 |     SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4018 | ReplaceNode(F: Node, T: Extract.getNode()); |
4019 | SelectCode(N: Extract.getNode()); |
4020 | return true; |
4021 | } |
4022 | |
4023 |   // Else, if we do *NOT* have BMI2, find out whether 'X' is *logically*
4024 |   // shifted (potentially with a one-use trunc in between), whether the
4025 |   // truncation was the only use of the shift, and if so, look past the
4026 |   // one-use truncation.
4027 | { |
4028 | SDValue RealX = peekThroughOneUseTruncation(X); |
4029 | // FIXME: only if the shift is one-use? |
4030 | if (RealX != X && RealX.getOpcode() == ISD::SRL) |
4031 | X = RealX; |
4032 | } |
4033 | |
4034 | MVT XVT = X.getSimpleValueType(); |
4035 | |
4036 | // Else, emitting BEXTR requires one more step. |
4037 | // The 'control' of BEXTR has the pattern of: |
4038 | // [15...8 bit][ 7...0 bit] location |
4039 | // [ bit count][ shift] name |
4040 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
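// For example, to keep the low 5 bits of (x >> 3), the control would be
// (5 << 8) | 3 == 0x0503: bits 15..8 hold the bit count, bits 7..0 the shift.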
4041 | |
4042 | // Shift NBits left by 8 bits, thus producing 'control'. |
4043 | // This leaves the low 8 bits zero.
4044 | SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8); |
4045 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8); |
4046 | SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8); |
4047 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4048 | |
4049 | // If the 'X' is *logically* shifted, we can fold that shift into 'control'. |
4050 | // FIXME: only if the shift is one-use? |
4051 | if (X.getOpcode() == ISD::SRL) { |
4052 | SDValue ShiftAmt = X.getOperand(i: 1); |
4053 | X = X.getOperand(i: 0); |
4054 | |
4055 | assert(ShiftAmt.getValueType() == MVT::i8 && |
4056 | "Expected shift amount to be i8" ); |
4057 | |
4058 | // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! |
4059 | // We could zext to i16 in some form, but we intentionally don't do that. |
4060 | SDValue OrigShiftAmt = ShiftAmt; |
4061 | ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt); |
4062 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt); |
4063 | |
4064 | // And now 'or' these low 8 bits of shift amount into the 'control'. |
4065 | Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt); |
4066 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4067 | } |
4068 | |
4069 | // But have to place the 'control' into the wide-enough register first. |
4070 | if (XVT != MVT::i32) { |
4071 | Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control); |
4072 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4073 | } |
4074 | |
4075 | // And finally, form the BEXTR itself. |
4076 | SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4077 | |
4078 | // 'X' was originally truncated to 'NVT'; re-apply that truncation to the result now.
4079 | if (XVT != NVT) { |
4080 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract); |
4081 | Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract); |
4082 | } |
4083 | |
4084 | ReplaceNode(F: Node, T: Extract.getNode()); |
4085 | SelectCode(N: Extract.getNode()); |
4086 | |
4087 | return true; |
4088 | } |
4089 | |
4090 | // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. |
4091 | MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { |
4092 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
4093 | SDLoc dl(Node); |
4094 | |
4095 | SDValue N0 = Node->getOperand(Num: 0); |
4096 | SDValue N1 = Node->getOperand(Num: 1); |
4097 | |
4098 | // If we have TBM we can use an immediate for the control. If we have BMI |
4099 | // we should only do this if the BEXTR instruction is implemented well. |
4100 | // Otherwise moving the control into a register makes this more costly. |
4101 | // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM |
4102 | // hoisting the move immediate would make it worthwhile with a less optimal |
4103 | // BEXTR? |
4104 | bool PreferBEXTR = |
4105 | Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); |
4106 | if (!PreferBEXTR && !Subtarget->hasBMI2()) |
4107 | return nullptr; |
4108 | |
4109 | // Must have a shift right. |
4110 | if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) |
4111 | return nullptr; |
4112 | |
4113 | // Shift can't have additional users. |
4114 | if (!N0->hasOneUse()) |
4115 | return nullptr; |
4116 | |
4117 | // Only supported for 32 and 64 bits. |
4118 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4119 | return nullptr; |
4120 | |
4121 | // Shift amount and RHS of and must be constant. |
4122 | auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1); |
4123 | auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1)); |
4124 | if (!MaskCst || !ShiftCst) |
4125 | return nullptr; |
4126 | |
4127 | // And RHS must be a mask. |
4128 | uint64_t Mask = MaskCst->getZExtValue(); |
4129 | if (!isMask_64(Value: Mask)) |
4130 | return nullptr; |
4131 | |
4132 | uint64_t Shift = ShiftCst->getZExtValue(); |
4133 | uint64_t MaskSize = llvm::popcount(Value: Mask); |
4134 | |
4135 | // Don't interfere with something that can be handled by extracting AH. |
4136 | // TODO: If we are able to fold a load, BEXTR might still be better than AH. |
4137 | if (Shift == 8 && MaskSize == 8) |
4138 | return nullptr; |
4139 | |
4140 | // Make sure we are only using bits that were in the original value, not |
4141 | // shifted in. |
4142 | if (Shift + MaskSize > NVT.getSizeInBits()) |
4143 | return nullptr; |
4144 | |
4145 | // BZHI, if available, is always fast, unlike BEXTR. But even if we decide |
4146 | // that we can't use BEXTR, it is only worthwhile using BZHI if the mask |
4147 | // does not fit into 32 bits. Load folding is not a sufficient reason. |
4148 | if (!PreferBEXTR && MaskSize <= 32) |
4149 | return nullptr; |
4150 | |
4151 | SDValue Control; |
4152 | unsigned ROpc, MOpc; |
4153 | |
4154 | #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC) |
4155 | if (!PreferBEXTR) { |
4156 | assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then." ); |
4157 | // If we can't make use of BEXTR then we can't fuse shift+mask stages. |
4158 | // Let's perform the mask first, and apply shift later. Note that we need to |
4159 | // widen the mask to account for the fact that we'll apply shift afterwards! |
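// For example, for (x >> 4) & 0xff (Shift == 4, MaskSize == 8) we emit BZHI
// with a control of 12 to keep the low 12 bits, and then shift right by 4,
// which yields the same 8 bits that BEXTR would have extracted.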
4160 | Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT); |
4161 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr) |
4162 | : GET_EGPR_IF_ENABLED(X86::BZHI32rr); |
4163 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm) |
4164 | : GET_EGPR_IF_ENABLED(X86::BZHI32rm); |
4165 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4166 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4167 | } else { |
4168 | // The 'control' of BEXTR has the pattern of: |
4169 | // [15...8 bit][ 7...0 bit] location |
4170 | // [ bit count][ shift] name |
4171 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
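// For example, (x >> 4) & 0xff (Shift == 4, MaskSize == 8) uses the control
// Shift | (MaskSize << 8) == 4 | (8 << 8) == 0x0804.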
4172 | Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT); |
4173 | if (Subtarget->hasTBM()) { |
4174 | ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; |
4175 | MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; |
4176 | } else { |
4177 | assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then." ); |
4178 | // BMI requires the immediate to be placed in a register.
4179 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr) |
4180 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rr); |
4181 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm) |
4182 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rm); |
4183 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4184 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4185 | } |
4186 | } |
4187 | |
4188 | MachineSDNode *NewNode; |
4189 | SDValue Input = N0->getOperand(Num: 0); |
4190 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4191 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4192 | SDValue Ops[] = { |
4193 | Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)}; |
4194 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
4195 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4196 | // Update the chain. |
4197 | ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2)); |
4198 | // Record the mem-refs |
4199 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()}); |
4200 | } else { |
4201 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control); |
4202 | } |
4203 | |
4204 | if (!PreferBEXTR) { |
4205 | // We still need to apply the shift. |
4206 | SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT); |
4207 | unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri) |
4208 | : GET_ND_IF_ENABLED(X86::SHR32ri); |
4209 | NewNode = |
4210 | CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt); |
4211 | } |
4212 | |
4213 | return NewNode; |
4214 | } |
4215 | |
4216 | // Emit a PCMPISTR(I/M) instruction.
4217 | MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, |
4218 | bool MayFoldLoad, const SDLoc &dl, |
4219 | MVT VT, SDNode *Node) { |
4220 | SDValue N0 = Node->getOperand(Num: 0); |
4221 | SDValue N1 = Node->getOperand(Num: 1); |
4222 | SDValue Imm = Node->getOperand(Num: 2); |
4223 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4224 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4225 | |
4226 | // Try to fold a load. No need to check alignment. |
4227 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4228 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4229 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4230 | N1.getOperand(i: 0) }; |
4231 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other); |
4232 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4233 | // Update the chain. |
4234 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2)); |
4235 | // Record the mem-refs |
4236 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
4237 | return CNode; |
4238 | } |
4239 | |
4240 | SDValue Ops[] = { N0, N1, Imm }; |
4241 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32); |
4242 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4243 | return CNode; |
4244 | } |
4245 | |
4246 | // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4247 | // to emit a second instruction after this one. This is needed since we have two |
4248 | // copyToReg nodes glued before this and we need to continue that glue through. |
4249 | MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, |
4250 | bool MayFoldLoad, const SDLoc &dl, |
4251 | MVT VT, SDNode *Node, |
4252 | SDValue &InGlue) { |
4253 | SDValue N0 = Node->getOperand(Num: 0); |
4254 | SDValue N2 = Node->getOperand(Num: 2); |
4255 | SDValue Imm = Node->getOperand(Num: 4); |
4256 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4257 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4258 | |
4259 | // Try to fold a load. No need to check alignment. |
4260 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4261 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4262 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4263 | N2.getOperand(i: 0), InGlue }; |
4264 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); |
4265 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4266 | InGlue = SDValue(CNode, 3); |
4267 | // Update the chain. |
4268 | ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2)); |
4269 | // Record the mem-refs |
4270 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()}); |
4271 | return CNode; |
4272 | } |
4273 | |
4274 | SDValue Ops[] = { N0, N2, Imm, InGlue }; |
4275 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue); |
4276 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4277 | InGlue = SDValue(CNode, 2); |
4278 | return CNode; |
4279 | } |
4280 | |
4281 | bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { |
4282 | EVT VT = N->getValueType(ResNo: 0); |
4283 | |
4284 | // Only handle scalar shifts. |
4285 | if (VT.isVector()) |
4286 | return false; |
4287 | |
4288 | // Narrower shifts only mask to 5 bits in hardware. |
4289 | unsigned Size = VT == MVT::i64 ? 64 : 32; |
4290 | |
4291 | SDValue OrigShiftAmt = N->getOperand(Num: 1); |
4292 | SDValue ShiftAmt = OrigShiftAmt; |
4293 | SDLoc DL(N); |
4294 | |
4295 | // Skip over a truncate of the shift amount. |
4296 | if (ShiftAmt->getOpcode() == ISD::TRUNCATE) |
4297 | ShiftAmt = ShiftAmt->getOperand(Num: 0); |
4298 | |
4299 | // This function is called after X86DAGToDAGISel::matchBitExtract(), |
4300 | // so we are not afraid of messing up a BZHI/BEXTR pattern.
4301 | |
4302 | SDValue NewShiftAmt; |
4303 | if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB || |
4304 | ShiftAmt->getOpcode() == ISD::XOR) { |
4305 | SDValue Add0 = ShiftAmt->getOperand(Num: 0); |
4306 | SDValue Add1 = ShiftAmt->getOperand(Num: 1); |
4307 | auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0); |
4308 | auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1); |
4309 | // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X |
4310 | // to avoid the ADD/SUB/XOR. |
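// E.g. for a 32-bit shift, x << (amt + 32) is equivalent to x << amt because
// the hardware masks the count to the low 5 bits anyway.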
4311 | if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) { |
4312 | NewShiftAmt = Add0; |
4313 | |
4314 | } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() && |
4315 | ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) || |
4316 | (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) { |
4317 | // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X |
4318 | // we can replace it with a NOT. In the XOR case it may save some code |
4319 | // size, in the SUB case it also may save a move. |
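// E.g. for a 32-bit shift, an amount of (31 - amt) is congruent to ~amt
// modulo 32, so a single NOT of the amount replaces the SUB/XOR against 31.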
4320 | assert(Add0C == nullptr || Add1C == nullptr); |
4321 | |
4322 | // We can only do N-X, not X-N |
4323 | if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr) |
4324 | return false; |
4325 | |
4326 | EVT OpVT = ShiftAmt.getValueType(); |
4327 | |
4328 | SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT); |
4329 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT, |
4330 | N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes); |
4331 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes); |
4332 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4333 | // If we are shifting by N-X where N == 0 mod Size, then just shift by |
4334 | // -X to generate a NEG instead of a SUB of a constant. |
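// E.g. for a 64-bit shift, an amount of (64 - amt) is congruent to -amt
// modulo 64, so a NEG of amt avoids materializing 64 and subtracting.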
4335 | } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C && |
4336 | Add0C->getZExtValue() != 0) { |
4337 | EVT SubVT = ShiftAmt.getValueType(); |
4338 | SDValue X; |
4339 | if (Add0C->getZExtValue() % Size == 0) |
4340 | X = Add1; |
4341 | else if (ShiftAmt.hasOneUse() && Size == 64 && |
4342 | Add0C->getZExtValue() % 32 == 0) { |
4343 | // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32). |
4344 | // This is mainly beneficial if we already compute (x+n*32). |
4345 | if (Add1.getOpcode() == ISD::TRUNCATE) { |
4346 | Add1 = Add1.getOperand(i: 0); |
4347 | SubVT = Add1.getValueType(); |
4348 | } |
4349 | if (Add0.getValueType() != SubVT) { |
4350 | Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT); |
4351 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0); |
4352 | } |
4353 | |
4354 | X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0); |
4355 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X); |
4356 | } else |
4357 | return false; |
4358 | // Insert a negate op. |
4359 | // TODO: This isn't guaranteed to replace the sub if there is a logic cone |
4360 | // that uses it that's not a shift. |
4361 | SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT); |
4362 | SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X); |
4363 | NewShiftAmt = Neg; |
4364 | |
4365 | // Insert these operands into a valid topological order so they can |
4366 | // get selected independently. |
4367 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero); |
4368 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg); |
4369 | } else |
4370 | return false; |
4371 | } else |
4372 | return false; |
4373 | |
4374 | if (NewShiftAmt.getValueType() != MVT::i8) { |
4375 | // Need to truncate the shift amount. |
4376 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt); |
4377 | // Add to a correct topological ordering. |
4378 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4379 | } |
4380 | |
4381 | // Insert a new mask to keep the shift amount legal. This should be removed |
4382 | // by isel patterns. |
4383 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt, |
4384 | N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8)); |
4385 | // Place in a correct topological ordering. |
4386 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4387 | |
4388 | SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), |
4389 | Op2: NewShiftAmt); |
4390 | if (UpdatedNode != N) { |
4391 | // If we found an existing node, we should replace ourselves with that node |
4392 | // and wait for it to be selected after its other users. |
4393 | ReplaceNode(F: N, T: UpdatedNode); |
4394 | return true; |
4395 | } |
4396 | |
4397 | // If the original shift amount is now dead, delete it so that we don't run |
4398 | // it through isel. |
4399 | if (OrigShiftAmt.getNode()->use_empty()) |
4400 | CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode()); |
4401 | |
4402 | // Now that we've optimized the shift amount, defer to normal isel to get |
4403 | // load folding and legacy vs BMI2 selection without repeating it here. |
4404 | SelectCode(N); |
4405 | return true; |
4406 | } |
4407 | |
4408 | bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { |
4409 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4410 | unsigned Opcode = N->getOpcode(); |
4411 | SDLoc dl(N); |
4412 | |
4413 | // For operations of the form (x << C1) op C2, check if we can use a smaller |
4414 | // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. |
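// For example, (x << 8) | 0x1f00 becomes (x | 0x1f) << 8: 0x1f fits in a
// sign-extended 8-bit immediate, whereas 0x1f00 needs a 32-bit immediate.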
4415 | SDValue Shift = N->getOperand(Num: 0); |
4416 | SDValue N1 = N->getOperand(Num: 1); |
4417 | |
4418 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
4419 | if (!Cst) |
4420 | return false; |
4421 | |
4422 | int64_t Val = Cst->getSExtValue(); |
4423 | |
4424 | // If we have an any_extend feeding the AND, look through it to see if there |
4425 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
4426 | // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
4427 | bool FoundAnyExtend = false; |
4428 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
4429 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
4430 | isUInt<32>(x: Val)) { |
4431 | FoundAnyExtend = true; |
4432 | Shift = Shift.getOperand(i: 0); |
4433 | } |
4434 | |
4435 | if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) |
4436 | return false; |
4437 | |
4438 | // i8 is unshrinkable, i16 should be promoted to i32. |
4439 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4440 | return false; |
4441 | |
4442 | auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)); |
4443 | if (!ShlCst) |
4444 | return false; |
4445 | |
4446 | uint64_t ShAmt = ShlCst->getZExtValue(); |
4447 | |
4448 | // Make sure that we don't change the operation by removing bits. |
4449 | // This only matters for OR and XOR, AND is unaffected. |
4450 | uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; |
4451 | if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) |
4452 | return false; |
4453 | |
4454 | // Check the minimum bitwidth for the new constant. |
4455 | // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. |
4456 | auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { |
4457 | if (Opcode == ISD::AND) { |
4458 | // AND32ri is the same as AND64ri32 with zext imm. |
4459 | // Try this before sign extended immediates below. |
4460 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4461 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4462 | return true; |
4463 | // Also swap order when the AND can become MOVZX. |
4464 | if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) |
4465 | return true; |
4466 | } |
4467 | ShiftedVal = Val >> ShAmt; |
4468 | if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) || |
4469 | (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal))) |
4470 | return true; |
4471 | if (Opcode != ISD::AND) { |
4472 | // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr |
4473 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4474 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4475 | return true; |
4476 | } |
4477 | return false; |
4478 | }; |
4479 | |
4480 | int64_t ShiftedVal; |
4481 | if (!CanShrinkImmediate(ShiftedVal)) |
4482 | return false; |
4483 | |
4484 | // Ok, we can reorder to get a smaller immediate. |
4485 | |
4486 | // But, it's possible the original immediate allowed an AND to become MOVZX.
4487 | // Doing this late to keep the MaskedValueIsZero call as late as
4488 | // possible.
4489 | if (Opcode == ISD::AND) { |
4490 | // Find the smallest zext this could possibly be. |
4491 | unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); |
4492 | ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U)); |
4493 | |
4494 | // Figure out which bits need to be zero to achieve that mask. |
4495 | APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(), |
4496 | loBitsSet: ZExtWidth); |
4497 | NeededMask &= ~Cst->getAPIntValue(); |
4498 | |
4499 | if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask)) |
4500 | return false; |
4501 | } |
4502 | |
4503 | SDValue X = Shift.getOperand(i: 0); |
4504 | if (FoundAnyExtend) { |
4505 | SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X); |
4506 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX); |
4507 | X = NewX; |
4508 | } |
4509 | |
4510 | SDValue NewCst = CurDAG->getConstant(Val: ShiftedVal, DL: dl, VT: NVT); |
4511 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst); |
4512 | SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst); |
4513 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp); |
4514 | SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp, |
4515 | N2: Shift.getOperand(i: 1)); |
4516 | ReplaceNode(F: N, T: NewSHL.getNode()); |
4517 | SelectCode(N: NewSHL.getNode()); |
4518 | return true; |
4519 | } |
4520 | |
4521 | bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, |
4522 | SDNode *ParentB, SDNode *ParentC, |
4523 | SDValue A, SDValue B, SDValue C, |
4524 | uint8_t Imm) { |
4525 | assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) && |
4526 | C.isOperandOf(ParentC) && "Incorrect parent node" ); |
4527 | |
4528 | auto tryFoldLoadOrBCast = |
4529 | [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, |
4530 | SDValue &Index, SDValue &Disp, SDValue &Segment) { |
4531 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4532 | return true; |
4533 | |
4534 | // Not a load, check for broadcast which may be behind a bitcast. |
4535 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4536 | P = L.getNode(); |
4537 | L = L.getOperand(i: 0); |
4538 | } |
4539 | |
4540 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4541 | return false; |
4542 | |
4543 | // Only 32 and 64 bit broadcasts are supported. |
4544 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4545 | unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); |
4546 | if (Size != 32 && Size != 64) |
4547 | return false; |
4548 | |
4549 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
4550 | }; |
4551 | |
4552 | bool FoldedLoad = false; |
4553 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4554 | if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { |
4555 | FoldedLoad = true; |
4556 | } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, |
4557 | Tmp4)) { |
4558 | FoldedLoad = true; |
4559 | std::swap(a&: A, b&: C); |
4560 | // Swap bits 1/4 and 3/6. |
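// (Bit i of the immediate is the result for the input combination where A,
// B, C supply bits 2, 1, 0 of i, so swapping operands A and C swaps
// immediate bits 0b001<->0b100 and 0b011<->0b110; the rest stay in place.)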
4561 | uint8_t OldImm = Imm; |
4562 | Imm = OldImm & 0xa5; |
4563 | if (OldImm & 0x02) Imm |= 0x10; |
4564 | if (OldImm & 0x10) Imm |= 0x02; |
4565 | if (OldImm & 0x08) Imm |= 0x40; |
4566 | if (OldImm & 0x40) Imm |= 0x08; |
4567 | } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3, |
4568 | Tmp4)) { |
4569 | FoldedLoad = true; |
4570 | std::swap(a&: B, b&: C); |
4571 | // Swap bits 1/2 and 5/6. |
4572 | uint8_t OldImm = Imm; |
4573 | Imm = OldImm & 0x99; |
4574 | if (OldImm & 0x02) Imm |= 0x04; |
4575 | if (OldImm & 0x04) Imm |= 0x02; |
4576 | if (OldImm & 0x20) Imm |= 0x40; |
4577 | if (OldImm & 0x40) Imm |= 0x20; |
4578 | } |
4579 | |
4580 | SDLoc DL(Root); |
4581 | |
4582 | SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
4583 | |
4584 | MVT NVT = Root->getSimpleValueType(ResNo: 0); |
4585 | |
4586 | MachineSDNode *MNode; |
4587 | if (FoldedLoad) { |
4588 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
4589 | |
4590 | unsigned Opc; |
4591 | if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { |
4592 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C); |
4593 | unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); |
4594 | assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!" ); |
4595 | |
4596 | bool UseD = EltSize == 32; |
4597 | if (NVT.is128BitVector()) |
4598 | Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; |
4599 | else if (NVT.is256BitVector()) |
4600 | Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; |
4601 | else if (NVT.is512BitVector()) |
4602 | Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; |
4603 | else |
4604 | llvm_unreachable("Unexpected vector size!" ); |
4605 | } else { |
4606 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4607 | if (NVT.is128BitVector()) |
4608 | Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; |
4609 | else if (NVT.is256BitVector()) |
4610 | Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; |
4611 | else if (NVT.is512BitVector()) |
4612 | Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; |
4613 | else |
4614 | llvm_unreachable("Unexpected vector size!" ); |
4615 | } |
4616 | |
4617 | SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)}; |
4618 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops); |
4619 | |
4620 | // Update the chain. |
4621 | ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1)); |
4622 | // Record the mem-refs |
4623 | CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()}); |
4624 | } else { |
4625 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4626 | unsigned Opc; |
4627 | if (NVT.is128BitVector()) |
4628 | Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; |
4629 | else if (NVT.is256BitVector()) |
4630 | Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; |
4631 | else if (NVT.is512BitVector()) |
4632 | Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; |
4633 | else |
4634 | llvm_unreachable("Unexpected vector size!" ); |
4635 | |
4636 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm}); |
4637 | } |
4638 | |
4639 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0)); |
4640 | CurDAG->RemoveDeadNode(N: Root); |
4641 | return true; |
4642 | } |
4643 | |
4644 | // Try to match two logic ops to a VPTERNLOG. |
4645 | // FIXME: Handle more complex patterns that use an operand more than once? |
4646 | bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { |
4647 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4648 | |
4649 | // Make sure we support VPTERNLOG. |
4650 | if (!NVT.isVector() || !Subtarget->hasAVX512() || |
4651 | NVT.getVectorElementType() == MVT::i1) |
4652 | return false; |
4653 | |
4654 | // We need VLX for 128/256-bit. |
4655 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
4656 | return false; |
4657 | |
4658 | SDValue N0 = N->getOperand(Num: 0); |
4659 | SDValue N1 = N->getOperand(Num: 1); |
4660 | |
4661 | auto getFoldableLogicOp = [](SDValue Op) { |
4662 | // Peek through single use bitcast. |
4663 | if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) |
4664 | Op = Op.getOperand(i: 0); |
4665 | |
4666 | if (!Op.hasOneUse()) |
4667 | return SDValue(); |
4668 | |
4669 | unsigned Opc = Op.getOpcode(); |
4670 | if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || |
4671 | Opc == X86ISD::ANDNP) |
4672 | return Op; |
4673 | |
4674 | return SDValue(); |
4675 | }; |
4676 | |
4677 | SDValue A, FoldableOp; |
4678 | if ((FoldableOp = getFoldableLogicOp(N1))) { |
4679 | A = N0; |
4680 | } else if ((FoldableOp = getFoldableLogicOp(N0))) { |
4681 | A = N1; |
4682 | } else |
4683 | return false; |
4684 | |
4685 | SDValue B = FoldableOp.getOperand(i: 0); |
4686 | SDValue C = FoldableOp.getOperand(i: 1); |
4687 | SDNode *ParentA = N; |
4688 | SDNode *ParentB = FoldableOp.getNode(); |
4689 | SDNode *ParentC = FoldableOp.getNode(); |
4690 | |
4691 | // We can build the appropriate control immediate by performing the logic |
4692 | // operation we're matching using these constants for A, B, and C. |
4693 | uint8_t TernlogMagicA = 0xf0; |
4694 | uint8_t TernlogMagicB = 0xcc; |
4695 | uint8_t TernlogMagicC = 0xaa; |
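// For example, matching A & (B | C) evaluates 0xf0 & (0xcc | 0xaa) == 0xe0,
// which is exactly the VPTERNLOG immediate for that expression.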
4696 | |
4697 | // Some of the inputs may be inverted, peek through them and invert the |
4698 | // magic values accordingly. |
4699 | // TODO: There may be a bitcast before the xor that we should peek through. |
4700 | auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { |
4701 | if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && |
4702 | ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) { |
4703 | Magic = ~Magic; |
4704 | Parent = Op.getNode(); |
4705 | Op = Op.getOperand(i: 0); |
4706 | } |
4707 | }; |
4708 | |
4709 | PeekThroughNot(A, ParentA, TernlogMagicA); |
4710 | PeekThroughNot(B, ParentB, TernlogMagicB); |
4711 | PeekThroughNot(C, ParentC, TernlogMagicC); |
4712 | |
4713 | uint8_t Imm; |
4714 | switch (FoldableOp.getOpcode()) { |
4715 | default: llvm_unreachable("Unexpected opcode!" ); |
4716 | case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; |
4717 | case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; |
4718 | case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; |
4719 | case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; |
4720 | } |
4721 | |
4722 | switch (N->getOpcode()) { |
4723 | default: llvm_unreachable("Unexpected opcode!" ); |
4724 | case X86ISD::ANDNP: |
4725 | if (A == N0) |
4726 | Imm &= ~TernlogMagicA; |
4727 | else |
4728 | Imm = ~(Imm) & TernlogMagicA; |
4729 | break; |
4730 | case ISD::AND: Imm &= TernlogMagicA; break; |
4731 | case ISD::OR: Imm |= TernlogMagicA; break; |
4732 | case ISD::XOR: Imm ^= TernlogMagicA; break; |
4733 | } |
4734 | |
4735 | return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm); |
4736 | } |
4737 | |
4738 | /// If the high bits of an 'and' operand are known zero, try setting the |
4739 | /// high bits of an 'and' constant operand to produce a smaller encoding by |
4740 | /// creating a small, sign-extended negative immediate rather than a large |
4741 | /// positive one. This reverses a transform in SimplifyDemandedBits that |
4742 | /// shrinks mask constants by clearing bits. There is also a possibility that |
4743 | /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that |
4744 | /// case, just replace the 'and'. Return 'true' if the node is replaced. |
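/// For example, if the upper bits of the source are known zero, (x & 0xf0)
/// can instead use the mask -16 (0xfffffff0), which encodes as a
/// sign-extended 8-bit immediate while 0xf0 would need a 32-bit immediate.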
4745 | bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { |
4746 | // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't |
4747 | // have immediate operands. |
4748 | MVT VT = And->getSimpleValueType(ResNo: 0); |
4749 | if (VT != MVT::i32 && VT != MVT::i64) |
4750 | return false; |
4751 | |
4752 | auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1)); |
4753 | if (!And1C) |
4754 | return false; |
4755 | |
4756 | // Bail out if the mask constant is already negative. It can't shrink any more.
4757 | // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel |
4758 | // patterns to use a 32-bit and instead of a 64-bit and by relying on the |
4759 | // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits |
4760 | // are negative too. |
4761 | APInt MaskVal = And1C->getAPIntValue(); |
4762 | unsigned MaskLZ = MaskVal.countl_zero(); |
4763 | if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) |
4764 | return false; |
4765 | |
4766 | // Don't extend into the upper 32 bits of a 64 bit mask. |
4767 | if (VT == MVT::i64 && MaskLZ >= 32) { |
4768 | MaskLZ -= 32; |
4769 | MaskVal = MaskVal.trunc(width: 32); |
4770 | } |
4771 | |
4772 | SDValue And0 = And->getOperand(Num: 0); |
4773 | APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ); |
4774 | APInt NegMaskVal = MaskVal | HighZeros; |
4775 | |
4776 | // If a negative constant would not allow a smaller encoding, there's no need |
4777 | // to continue. Only change the constant when we know it's a win. |
4778 | unsigned MinWidth = NegMaskVal.getSignificantBits(); |
4779 | if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32)) |
4780 | return false; |
4781 | |
4782 | // Extend masks if we truncated above. |
4783 | if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { |
4784 | NegMaskVal = NegMaskVal.zext(width: 64); |
4785 | HighZeros = HighZeros.zext(width: 64); |
4786 | } |
4787 | |
4788 | // The variable operand must be all zeros in the top bits to allow using the |
4789 | // new, negative constant as the mask. |
4790 | if (!CurDAG->MaskedValueIsZero(Op: And0, Mask: HighZeros)) |
4791 | return false; |
4792 | |
4793 | // Check if the mask is -1. In that case, this is an unnecessary instruction |
4794 | // that escaped earlier analysis. |
4795 | if (NegMaskVal.isAllOnes()) { |
4796 | ReplaceNode(F: And, T: And0.getNode()); |
4797 | return true; |
4798 | } |
4799 | |
4800 | // A negative mask allows a smaller encoding. Create a new 'and' node. |
4801 | SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT); |
4802 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask); |
4803 | SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask); |
4804 | ReplaceNode(F: And, T: NewAnd.getNode()); |
4805 | SelectCode(N: NewAnd.getNode()); |
4806 | return true; |
4807 | } |
4808 | |
4809 | static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, |
4810 | bool FoldedBCast, bool Masked) { |
4811 | #define VPTESTM_CASE(VT, SUFFIX) \ |
4812 | case MVT::VT: \ |
4813 | if (Masked) \ |
4814 | return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ |
4815 | return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; |
4816 | |
4817 | |
4818 | #define VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4819 | default: llvm_unreachable("Unexpected VT!"); \ |
4820 | VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ |
4821 | VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ |
4822 | VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ |
4823 | VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ |
4824 | VPTESTM_CASE(v16i32, DZ##SUFFIX) \ |
4825 | VPTESTM_CASE(v8i64, QZ##SUFFIX) |
4826 | |
4827 | #define VPTESTM_FULL_CASES(SUFFIX) \ |
4828 | VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4829 | VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ |
4830 | VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ |
4831 | VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ |
4832 | VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ |
4833 | VPTESTM_CASE(v64i8, BZ##SUFFIX) \ |
4834 | VPTESTM_CASE(v32i16, WZ##SUFFIX) |
4835 | |
4836 | if (FoldedBCast) { |
4837 | switch (TestVT.SimpleTy) { |
4838 | VPTESTM_BROADCAST_CASES(rmb) |
4839 | } |
4840 | } |
4841 | |
4842 | if (FoldedLoad) { |
4843 | switch (TestVT.SimpleTy) { |
4844 | VPTESTM_FULL_CASES(rm) |
4845 | } |
4846 | } |
4847 | |
4848 | switch (TestVT.SimpleTy) { |
4849 | VPTESTM_FULL_CASES(rr) |
4850 | } |
4851 | |
4852 | #undef VPTESTM_FULL_CASES |
4853 | #undef VPTESTM_BROADCAST_CASES |
4854 | #undef VPTESTM_CASE |
4855 | } |
4856 | |
4857 | // Try to create VPTESTM instruction. If InMask is not null, it will be used |
4858 | // to form a masked operation. |
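// VPTESTM sets mask bit i when (Src0[i] & Src1[i]) != 0, so a SETNE compare
// of an AND against zero maps directly to it, and SETEQ maps to VPTESTNM.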
4859 | bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, |
4860 | SDValue InMask) { |
4861 | assert(Subtarget->hasAVX512() && "Expected AVX512!" ); |
4862 | assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && |
4863 | "Unexpected VT!" ); |
4864 | |
4865 | // Look for equal and not equal compares. |
4866 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get(); |
4867 | if (CC != ISD::SETEQ && CC != ISD::SETNE) |
4868 | return false; |
4869 | |
4870 | SDValue SetccOp0 = Setcc.getOperand(i: 0); |
4871 | SDValue SetccOp1 = Setcc.getOperand(i: 1); |
4872 | |
4873 | // Canonicalize the all zero vector to the RHS. |
4874 | if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode())) |
4875 | std::swap(a&: SetccOp0, b&: SetccOp1); |
4876 | |
4877 | // See if we're comparing against zero. |
4878 | if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode())) |
4879 | return false; |
4880 | |
4881 | SDValue N0 = SetccOp0; |
4882 | |
4883 | MVT CmpVT = N0.getSimpleValueType(); |
4884 | MVT CmpSVT = CmpVT.getVectorElementType(); |
4885 | |
4886 | // Start with both operands the same. We'll try to refine this. |
4887 | SDValue Src0 = N0; |
4888 | SDValue Src1 = N0; |
4889 | |
4890 | { |
4891 | // Look through single use bitcasts. |
4892 | SDValue N0Temp = N0; |
4893 | if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) |
4894 | N0Temp = N0.getOperand(i: 0); |
4895 | |
4896 | // Look for single use AND. |
4897 | if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { |
4898 | Src0 = N0Temp.getOperand(i: 0); |
4899 | Src1 = N0Temp.getOperand(i: 1); |
4900 | } |
4901 | } |
4902 | |
4903 | // Without VLX we need to widen the operation. |
4904 | bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); |
4905 | |
4906 | auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, |
4907 | SDValue &Base, SDValue &Scale, SDValue &Index, |
4908 | SDValue &Disp, SDValue &Segment) { |
4909 | // If we need to widen, we can't fold the load. |
4910 | if (!Widen) |
4911 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4912 | return true; |
4913 | |
4914 | // If we didn't fold a load, try to match broadcast. No widening limitation |
4915 | // for this. But only 32 and 64 bit types are supported. |
4916 | if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) |
4917 | return false; |
4918 | |
4919 | // Look through single use bitcasts. |
4920 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4921 | P = L.getNode(); |
4922 | L = L.getOperand(i: 0); |
4923 | } |
4924 | |
4925 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4926 | return false; |
4927 | |
4928 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4929 | if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) |
4930 | return false; |
4931 | |
4932 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
4933 | }; |
4934 | |
4935 | // We can only fold loads if the sources are unique. |
4936 | bool CanFoldLoads = Src0 != Src1; |
4937 | |
4938 | bool FoldedLoad = false; |
4939 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4940 | if (CanFoldLoads) { |
4941 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, |
4942 | Tmp3, Tmp4); |
4943 | if (!FoldedLoad) { |
4944 | // And is commutative. |
4945 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, |
4946 | Tmp2, Tmp3, Tmp4); |
4947 | if (FoldedLoad) |
4948 | std::swap(a&: Src0, b&: Src1); |
4949 | } |
4950 | } |
4951 | |
4952 | bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; |
4953 | |
4954 | bool IsMasked = InMask.getNode() != nullptr; |
4955 | |
4956 | SDLoc dl(Root); |
4957 | |
4958 | MVT ResVT = Setcc.getSimpleValueType(); |
4959 | MVT MaskVT = ResVT; |
4960 | if (Widen) { |
4961 | // Widen the inputs using insert_subreg or copy_to_regclass. |
4962 | unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; |
4963 | unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; |
4964 | unsigned NumElts = CmpVT.getVectorNumElements() * Scale; |
4965 | CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts); |
4966 | MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts); |
4967 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl, |
4968 | VT: CmpVT), 0); |
4969 | Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0); |
4970 | |
4971 | if (!FoldedBCast) |
4972 | Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1); |
4973 | |
4974 | if (IsMasked) { |
4975 | // Widen the mask. |
4976 | unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID(); |
4977 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
4978 | InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
4979 | dl, VT: MaskVT, Op1: InMask, Op2: RC), 0); |
4980 | } |
4981 | } |
4982 | |
4983 | bool IsTestN = CC == ISD::SETEQ; |
4984 | unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast, |
4985 | Masked: IsMasked); |
4986 | |
4987 | MachineSDNode *CNode; |
4988 | if (FoldedLoad) { |
4989 | SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other); |
4990 | |
4991 | if (IsMasked) { |
4992 | SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
4993 | Src1.getOperand(i: 0) }; |
4994 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
4995 | } else { |
4996 | SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
4997 | Src1.getOperand(i: 0) }; |
4998 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
4999 | } |
5000 | |
5001 | // Update the chain. |
5002 | ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1)); |
5003 | // Record the mem-refs |
5004 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()}); |
5005 | } else { |
5006 | if (IsMasked) |
5007 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1); |
5008 | else |
5009 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1); |
5010 | } |
5011 | |
5012 | // If we widened, we need to shrink the mask VT. |
5013 | if (Widen) { |
5014 | unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID(); |
5015 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
5016 | CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
5017 | dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC); |
5018 | } |
5019 | |
5020 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0)); |
5021 | CurDAG->RemoveDeadNode(N: Root); |
5022 | return true; |
5023 | } |
5024 | |
5025 | // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it |
5026 | // into vpternlog. |
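// The immediate 0xCA used below is the bitselect truth table: with the usual
// ternlog magics A = 0xf0, B = 0xcc, C = 0xaa, (A & B) | (~A & C) evaluates
// to (0xf0 & 0xcc) | (0x0f & 0xaa) == 0xc0 | 0x0a == 0xca.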
5027 | bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { |
5028 | assert(N->getOpcode() == ISD::OR && "Unexpected opcode!" ); |
5029 | |
5030 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
5031 | |
5032 | // Make sure we support VPTERNLOG. |
5033 | if (!NVT.isVector() || !Subtarget->hasAVX512()) |
5034 | return false; |
5035 | |
5036 | // We need VLX for 128/256-bit. |
5037 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
5038 | return false; |
5039 | |
5040 | SDValue N0 = N->getOperand(Num: 0); |
5041 | SDValue N1 = N->getOperand(Num: 1); |
5042 | |
5043 | // Canonicalize AND to LHS. |
5044 | if (N1.getOpcode() == ISD::AND) |
5045 | std::swap(a&: N0, b&: N1); |
5046 | |
5047 | if (N0.getOpcode() != ISD::AND || |
5048 | N1.getOpcode() != X86ISD::ANDNP || |
5049 | !N0.hasOneUse() || !N1.hasOneUse()) |
5050 | return false; |
5051 | |
5052 | // ANDN is not commutable, use it to pin down A and C.
5053 | SDValue A = N1.getOperand(i: 0); |
5054 | SDValue C = N1.getOperand(i: 1); |
5055 | |
5056 | // AND is commutable, if one operand matches A, the other operand is B. |
5057 | // Otherwise this isn't a match. |
5058 | SDValue B; |
5059 | if (N0.getOperand(i: 0) == A) |
5060 | B = N0.getOperand(i: 1); |
5061 | else if (N0.getOperand(i: 1) == A) |
5062 | B = N0.getOperand(i: 0); |
5063 | else |
5064 | return false; |
5065 | |
5066 | SDLoc dl(N); |
5067 | SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8); |
5068 | SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm); |
5069 | ReplaceNode(F: N, T: Ternlog.getNode()); |
5070 | |
5071 | return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(), |
5072 | ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA); |
5073 | } |
5074 | |
5075 | void X86DAGToDAGISel::Select(SDNode *Node) { |
5076 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
5077 | unsigned Opcode = Node->getOpcode(); |
5078 | SDLoc dl(Node); |
5079 | |
5080 | if (Node->isMachineOpcode()) { |
5081 | LLVM_DEBUG(dbgs() << "== " ; Node->dump(CurDAG); dbgs() << '\n'); |
5082 | Node->setNodeId(-1); |
5083 | return; // Already selected. |
5084 | } |
5085 | |
5086 | switch (Opcode) { |
5087 | default: break; |
5088 | case ISD::INTRINSIC_W_CHAIN: { |
5089 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5090 | switch (IntNo) { |
5091 | default: break; |
5092 | case Intrinsic::x86_encodekey128: |
5093 | case Intrinsic::x86_encodekey256: { |
5094 | if (!Subtarget->hasKL()) |
5095 | break; |
5096 | |
5097 | unsigned Opcode; |
5098 | switch (IntNo) { |
5099 | default: llvm_unreachable("Impossible intrinsic" ); |
5100 | case Intrinsic::x86_encodekey128: |
5101 | Opcode = X86::ENCODEKEY128; |
5102 | break; |
5103 | case Intrinsic::x86_encodekey256: |
5104 | Opcode = X86::ENCODEKEY256; |
5105 | break; |
5106 | } |
5107 | |
5108 | SDValue Chain = Node->getOperand(Num: 0); |
5109 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3), |
5110 | Glue: SDValue()); |
5111 | if (Opcode == X86::ENCODEKEY256) |
5112 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4), |
5113 | Glue: Chain.getValue(R: 1)); |
5114 | |
5115 | MachineSDNode *Res = CurDAG->getMachineNode( |
5116 | Opcode, dl, VTs: Node->getVTList(), |
5117 | Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)}); |
5118 | ReplaceNode(F: Node, T: Res); |
5119 | return; |
5120 | } |
5121 | case Intrinsic::x86_tileloadd64_internal: |
5122 | case Intrinsic::x86_tileloaddt164_internal: { |
5123 | if (!Subtarget->hasAMXTILE()) |
5124 | break; |
5125 | auto *MFI = |
5126 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5127 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5128 | unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal |
5129 | ? X86::PTILELOADDV |
5130 | : X86::PTILELOADDT1V; |
5131 | // _tile_loadd_internal(row, col, buf, STRIDE) |
5132 | SDValue Base = Node->getOperand(Num: 4); |
5133 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5134 | SDValue Index = Node->getOperand(Num: 5); |
5135 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5136 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5137 | SDValue Chain = Node->getOperand(Num: 0); |
5138 | MachineSDNode *CNode; |
5139 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5140 | Node->getOperand(Num: 3), |
5141 | Base, |
5142 | Scale, |
5143 | Index, |
5144 | Disp, |
5145 | Segment, |
5146 | Chain}; |
5147 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops); |
5148 | ReplaceNode(F: Node, T: CNode); |
5149 | return; |
5150 | } |
5151 | } |
5152 | break; |
5153 | } |
5154 | case ISD::INTRINSIC_VOID: { |
5155 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5156 | switch (IntNo) { |
5157 | default: break; |
5158 | case Intrinsic::x86_sse3_monitor: |
5159 | case Intrinsic::x86_monitorx: |
5160 | case Intrinsic::x86_clzero: { |
5161 | bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64; |
5162 | |
5163 | unsigned Opc = 0; |
5164 | switch (IntNo) { |
5165 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5166 | case Intrinsic::x86_sse3_monitor: |
5167 | if (!Subtarget->hasSSE3()) |
5168 | break; |
5169 | Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; |
5170 | break; |
5171 | case Intrinsic::x86_monitorx: |
5172 | if (!Subtarget->hasMWAITX()) |
5173 | break; |
5174 | Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; |
5175 | break; |
5176 | case Intrinsic::x86_clzero: |
5177 | if (!Subtarget->hasCLZERO()) |
5178 | break; |
5179 | Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; |
5180 | break; |
5181 | } |
5182 | |
5183 | if (Opc) { |
5184 | unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; |
5185 | SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg, |
5186 | N: Node->getOperand(Num: 2), Glue: SDValue()); |
5187 | SDValue InGlue = Chain.getValue(R: 1); |
5188 | |
5189 | if (IntNo == Intrinsic::x86_sse3_monitor || |
5190 | IntNo == Intrinsic::x86_monitorx) { |
5191 | // Copy the other two operands to ECX and EDX. |
5192 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3), |
5193 | Glue: InGlue); |
5194 | InGlue = Chain.getValue(R: 1); |
5195 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4), |
5196 | Glue: InGlue); |
5197 | InGlue = Chain.getValue(R: 1); |
5198 | } |
5199 | |
5200 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, |
5201 | Ops: { Chain, InGlue}); |
5202 | ReplaceNode(F: Node, T: CNode); |
5203 | return; |
5204 | } |
5205 | |
5206 | break; |
5207 | } |
5208 | case Intrinsic::x86_tilestored64_internal: { |
5209 | auto *MFI = |
5210 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5211 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5212 | unsigned Opc = X86::PTILESTOREDV; |
5213 | // _tile_stored_internal(row, col, buf, STRIDE, c) |
5214 | SDValue Base = Node->getOperand(Num: 4); |
5215 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5216 | SDValue Index = Node->getOperand(Num: 5); |
5217 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5218 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5219 | SDValue Chain = Node->getOperand(Num: 0); |
5220 | MachineSDNode *CNode; |
5221 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5222 | Node->getOperand(Num: 3), |
5223 | Base, |
5224 | Scale, |
5225 | Index, |
5226 | Disp, |
5227 | Segment, |
5228 | Node->getOperand(Num: 6), |
5229 | Chain}; |
5230 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5231 | ReplaceNode(F: Node, T: CNode); |
5232 | return; |
5233 | } |
5234 | case Intrinsic::x86_tileloadd64: |
5235 | case Intrinsic::x86_tileloaddt164: |
5236 | case Intrinsic::x86_tilestored64: { |
5237 | if (!Subtarget->hasAMXTILE()) |
5238 | break; |
5239 | auto *MFI = |
5240 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5241 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
5242 | unsigned Opc; |
5243 | switch (IntNo) { |
5244 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5245 | case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; |
5246 | case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; |
5247 | case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; |
5248 | } |
5249 | // FIXME: Match displacement and scale. |
5250 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5251 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5252 | SDValue Base = Node->getOperand(Num: 3); |
5253 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5254 | SDValue Index = Node->getOperand(Num: 4); |
5255 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5256 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5257 | SDValue Chain = Node->getOperand(Num: 0); |
5258 | MachineSDNode *CNode; |
5259 | if (Opc == X86::PTILESTORED) { |
5260 | SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; |
5261 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5262 | } else { |
5263 | SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; |
5264 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5265 | } |
5266 | ReplaceNode(F: Node, T: CNode); |
5267 | return; |
5268 | } |
5269 | } |
5270 | break; |
5271 | } |
5272 | case ISD::BRIND: |
5273 | case X86ISD::NT_BRIND: { |
5274 | if (Subtarget->isTargetNaCl()) |
5275 | // NaCl has its own pass where jmp %r32 instructions are converted to
5276 | // jmp %r64. We leave the instruction alone.
5277 | break; |
5278 | if (Subtarget->isTarget64BitILP32()) { |
5279 | // Converts a 32-bit register to a 64-bit, zero-extended version of |
5280 | // it. This is needed because x86-64 can do many things, but jmp %r32 |
5281 | // ain't one of them. |
5282 | SDValue Target = Node->getOperand(Num: 1); |
5283 | assert(Target.getValueType() == MVT::i32 && "Unexpected VT!" ); |
5284 | SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64); |
5285 | SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other, |
5286 | N1: Node->getOperand(Num: 0), N2: ZextTarget); |
5287 | ReplaceNode(F: Node, T: Brind.getNode()); |
5288 | SelectCode(N: ZextTarget.getNode()); |
5289 | SelectCode(N: Brind.getNode()); |
5290 | return; |
5291 | } |
5292 | break; |
5293 | } |
5294 | case X86ISD::GlobalBaseReg: |
5295 | ReplaceNode(F: Node, T: getGlobalBaseReg()); |
5296 | return; |
5297 | |
5298 | case ISD::BITCAST: |
5299 | // Just drop all 128/256/512-bit bitcasts. |
5300 | if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || |
5301 | NVT == MVT::f128) { |
5302 | ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0)); |
5303 | CurDAG->RemoveDeadNode(N: Node); |
5304 | return; |
5305 | } |
5306 | break; |
5307 | |
5308 | case ISD::SRL: |
5309 | if (matchBitExtract(Node)) |
5310 | return; |
5311 | [[fallthrough]]; |
5312 | case ISD::SRA: |
5313 | case ISD::SHL: |
5314 | if (tryShiftAmountMod(N: Node)) |
5315 | return; |
5316 | break; |
5317 | |
5318 | case X86ISD::VPTERNLOG: { |
5319 | uint8_t Imm = Node->getConstantOperandVal(Num: 3); |
5320 | if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0), |
5321 | B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm)) |
5322 | return; |
5323 | break; |
5324 | } |
5325 | |
5326 | case X86ISD::ANDNP: |
5327 | if (tryVPTERNLOG(N: Node)) |
5328 | return; |
5329 | break; |
5330 | |
5331 | case ISD::AND: |
5332 | if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { |
5333 | // Try to form a masked VPTESTM. Operands can be in either order. |
5334 | SDValue N0 = Node->getOperand(Num: 0); |
5335 | SDValue N1 = Node->getOperand(Num: 1); |
5336 | if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && |
5337 | tryVPTESTM(Root: Node, Setcc: N0, InMask: N1)) |
5338 | return; |
5339 | if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && |
5340 | tryVPTESTM(Root: Node, Setcc: N1, InMask: N0)) |
5341 | return; |
5342 | } |
5343 | |
5344 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { |
5345 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
5346 | CurDAG->RemoveDeadNode(N: Node); |
5347 | return; |
5348 | } |
5349 | if (matchBitExtract(Node)) |
5350 | return; |
5351 | if (AndImmShrink && shrinkAndImmediate(And: Node)) |
5352 | return; |
5353 | |
5354 | [[fallthrough]]; |
5355 | case ISD::OR: |
5356 | case ISD::XOR: |
5357 | if (tryShrinkShlLogicImm(N: Node)) |
5358 | return; |
5359 | if (Opcode == ISD::OR && tryMatchBitSelect(N: Node)) |
5360 | return; |
5361 | if (tryVPTERNLOG(N: Node)) |
5362 | return; |
5363 | |
5364 | [[fallthrough]]; |
5365 | case ISD::ADD: |
5366 | if (Opcode == ISD::ADD && matchBitExtract(Node)) |
5367 | return; |
5368 | [[fallthrough]]; |
5369 | case ISD::SUB: { |
5370 | // Try to avoid folding immediates with multiple uses for optsize. |
5371 | // This code tries to select to register form directly to avoid going |
5372 | // through the isel table which might fold the immediate. We can't change |
5373 | // the patterns on the add/sub/and/or/xor with immediate paterns in the |
5374 | // tablegen files to check immediate use count without making the patterns |
5375 | // unavailable to the fast-isel table. |
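// For example, if the same large immediate is used by several adds under
// optsize, materializing it once in a register and selecting ADD32rr here is
// smaller than repeating a 4-byte imm32 in every ADD32ri.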
5376 | if (!CurDAG->shouldOptForSize()) |
5377 | break; |
5378 | |
5379 | // Only handle i8/i16/i32/i64. |
5380 | if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) |
5381 | break; |
5382 | |
5383 | SDValue N0 = Node->getOperand(Num: 0); |
5384 | SDValue N1 = Node->getOperand(Num: 1); |
5385 | |
5386 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
5387 | if (!Cst) |
5388 | break; |
5389 | |
5390 | int64_t Val = Cst->getSExtValue(); |
5391 | |
// Make sure it's an immediate that is considered foldable.
5393 | // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. |
5394 | if (!isInt<8>(x: Val) && !isInt<32>(x: Val)) |
5395 | break; |
5396 | |
5397 | // If this can match to INC/DEC, let it go. |
5398 | if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) |
5399 | break; |
5400 | |
5401 | // Check if we should avoid folding this immediate. |
5402 | if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode())) |
5403 | break; |
5404 | |
5405 | // We should not fold the immediate. So we need a register form instead. |
5406 | unsigned ROpc, MOpc; |
5407 | switch (NVT.SimpleTy) { |
5408 | default: llvm_unreachable("Unexpected VT!" ); |
5409 | case MVT::i8: |
5410 | switch (Opcode) { |
5411 | default: llvm_unreachable("Unexpected opcode!" ); |
5412 | case ISD::ADD: |
5413 | ROpc = GET_ND_IF_ENABLED(X86::ADD8rr); |
5414 | MOpc = GET_ND_IF_ENABLED(X86::ADD8rm); |
5415 | break; |
5416 | case ISD::SUB: |
5417 | ROpc = GET_ND_IF_ENABLED(X86::SUB8rr); |
5418 | MOpc = GET_ND_IF_ENABLED(X86::SUB8rm); |
5419 | break; |
5420 | case ISD::AND: |
5421 | ROpc = GET_ND_IF_ENABLED(X86::AND8rr); |
5422 | MOpc = GET_ND_IF_ENABLED(X86::AND8rm); |
5423 | break; |
5424 | case ISD::OR: |
5425 | ROpc = GET_ND_IF_ENABLED(X86::OR8rr); |
5426 | MOpc = GET_ND_IF_ENABLED(X86::OR8rm); |
5427 | break; |
5428 | case ISD::XOR: |
5429 | ROpc = GET_ND_IF_ENABLED(X86::XOR8rr); |
5430 | MOpc = GET_ND_IF_ENABLED(X86::XOR8rm); |
5431 | break; |
5432 | } |
5433 | break; |
5434 | case MVT::i16: |
5435 | switch (Opcode) { |
5436 | default: llvm_unreachable("Unexpected opcode!" ); |
5437 | case ISD::ADD: |
5438 | ROpc = GET_ND_IF_ENABLED(X86::ADD16rr); |
5439 | MOpc = GET_ND_IF_ENABLED(X86::ADD16rm); |
5440 | break; |
5441 | case ISD::SUB: |
5442 | ROpc = GET_ND_IF_ENABLED(X86::SUB16rr); |
5443 | MOpc = GET_ND_IF_ENABLED(X86::SUB16rm); |
5444 | break; |
5445 | case ISD::AND: |
5446 | ROpc = GET_ND_IF_ENABLED(X86::AND16rr); |
5447 | MOpc = GET_ND_IF_ENABLED(X86::AND16rm); |
5448 | break; |
5449 | case ISD::OR: |
5450 | ROpc = GET_ND_IF_ENABLED(X86::OR16rr); |
5451 | MOpc = GET_ND_IF_ENABLED(X86::OR16rm); |
5452 | break; |
5453 | case ISD::XOR: |
5454 | ROpc = GET_ND_IF_ENABLED(X86::XOR16rr); |
5455 | MOpc = GET_ND_IF_ENABLED(X86::XOR16rm); |
5456 | break; |
5457 | } |
5458 | break; |
5459 | case MVT::i32: |
5460 | switch (Opcode) { |
5461 | default: llvm_unreachable("Unexpected opcode!" ); |
5462 | case ISD::ADD: |
5463 | ROpc = GET_ND_IF_ENABLED(X86::ADD32rr); |
5464 | MOpc = GET_ND_IF_ENABLED(X86::ADD32rm); |
5465 | break; |
5466 | case ISD::SUB: |
5467 | ROpc = GET_ND_IF_ENABLED(X86::SUB32rr); |
5468 | MOpc = GET_ND_IF_ENABLED(X86::SUB32rm); |
5469 | break; |
5470 | case ISD::AND: |
5471 | ROpc = GET_ND_IF_ENABLED(X86::AND32rr); |
5472 | MOpc = GET_ND_IF_ENABLED(X86::AND32rm); |
5473 | break; |
5474 | case ISD::OR: |
5475 | ROpc = GET_ND_IF_ENABLED(X86::OR32rr); |
5476 | MOpc = GET_ND_IF_ENABLED(X86::OR32rm); |
5477 | break; |
5478 | case ISD::XOR: |
5479 | ROpc = GET_ND_IF_ENABLED(X86::XOR32rr); |
5480 | MOpc = GET_ND_IF_ENABLED(X86::XOR32rm); |
5481 | break; |
5482 | } |
5483 | break; |
5484 | case MVT::i64: |
5485 | switch (Opcode) { |
5486 | default: llvm_unreachable("Unexpected opcode!" ); |
5487 | case ISD::ADD: |
5488 | ROpc = GET_ND_IF_ENABLED(X86::ADD64rr); |
5489 | MOpc = GET_ND_IF_ENABLED(X86::ADD64rm); |
5490 | break; |
5491 | case ISD::SUB: |
5492 | ROpc = GET_ND_IF_ENABLED(X86::SUB64rr); |
5493 | MOpc = GET_ND_IF_ENABLED(X86::SUB64rm); |
5494 | break; |
5495 | case ISD::AND: |
5496 | ROpc = GET_ND_IF_ENABLED(X86::AND64rr); |
5497 | MOpc = GET_ND_IF_ENABLED(X86::AND64rm); |
5498 | break; |
5499 | case ISD::OR: |
5500 | ROpc = GET_ND_IF_ENABLED(X86::OR64rr); |
5501 | MOpc = GET_ND_IF_ENABLED(X86::OR64rm); |
5502 | break; |
5503 | case ISD::XOR: |
5504 | ROpc = GET_ND_IF_ENABLED(X86::XOR64rr); |
5505 | MOpc = GET_ND_IF_ENABLED(X86::XOR64rm); |
5506 | break; |
5507 | } |
5508 | break; |
5509 | } |
5510 | |
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5512 | |
// If this is not a subtract, we can still try to fold a load.
5514 | if (Opcode != ISD::SUB) { |
5515 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5516 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5517 | SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5518 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5519 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5520 | // Update the chain. |
5521 | ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2)); |
5522 | // Record the mem-refs |
5523 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5524 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5525 | CurDAG->RemoveDeadNode(N: Node); |
5526 | return; |
5527 | } |
5528 | } |
5529 | |
5530 | CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1); |
5531 | return; |
5532 | } |
5533 | |
5534 | case X86ISD::SMUL: |
5535 | // i16/i32/i64 are handled with isel patterns. |
5536 | if (NVT != MVT::i8) |
5537 | break; |
5538 | [[fallthrough]]; |
5539 | case X86ISD::UMUL: { |
5540 | SDValue N0 = Node->getOperand(Num: 0); |
5541 | SDValue N1 = Node->getOperand(Num: 1); |
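// MUL/IMUL take a single explicit operand; the other factor is implicit in
// AL/AX/EAX/RAX, so N0 is copied into that register below and the copy is
// glued to the multiply node.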
5542 | |
5543 | unsigned LoReg, ROpc, MOpc; |
5544 | switch (NVT.SimpleTy) { |
5545 | default: llvm_unreachable("Unsupported VT!" ); |
5546 | case MVT::i8: |
5547 | LoReg = X86::AL; |
5548 | ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; |
5549 | MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; |
5550 | break; |
5551 | case MVT::i16: |
5552 | LoReg = X86::AX; |
5553 | ROpc = X86::MUL16r; |
5554 | MOpc = X86::MUL16m; |
5555 | break; |
5556 | case MVT::i32: |
5557 | LoReg = X86::EAX; |
5558 | ROpc = X86::MUL32r; |
5559 | MOpc = X86::MUL32m; |
5560 | break; |
5561 | case MVT::i64: |
5562 | LoReg = X86::RAX; |
5563 | ROpc = X86::MUL64r; |
5564 | MOpc = X86::MUL64m; |
5565 | break; |
5566 | } |
5567 | |
5568 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5569 | bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5570 | // Multiply is commutative. |
5571 | if (!FoldedLoad) { |
5572 | FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5573 | if (FoldedLoad) |
5574 | std::swap(a&: N0, b&: N1); |
5575 | } |
5576 | |
5577 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5578 | N: N0, Glue: SDValue()).getValue(R: 1); |
5579 | |
5580 | MachineSDNode *CNode; |
5581 | if (FoldedLoad) { |
5582 | // i16/i32/i64 use an instruction that produces a low and high result even |
5583 | // though only the low result is used. |
5584 | SDVTList VTs; |
5585 | if (NVT == MVT::i8) |
5586 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5587 | else |
5588 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other); |
5589 | |
5590 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5591 | InGlue }; |
5592 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5593 | |
5594 | // Update the chain. |
5595 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); |
5596 | // Record the mem-refs |
5597 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5598 | } else { |
5599 | // i16/i32/i64 use an instruction that produces a low and high result even |
5600 | // though only the low result is used. |
5601 | SDVTList VTs; |
5602 | if (NVT == MVT::i8) |
5603 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32); |
5604 | else |
5605 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32); |
5606 | |
5607 | CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue}); |
5608 | } |
5609 | |
5610 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5611 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); |
5612 | CurDAG->RemoveDeadNode(N: Node); |
5613 | return; |
5614 | } |
5615 | |
5616 | case ISD::SMUL_LOHI: |
5617 | case ISD::UMUL_LOHI: { |
5618 | SDValue N0 = Node->getOperand(Num: 0); |
5619 | SDValue N1 = Node->getOperand(Num: 1); |
5620 | |
5621 | unsigned Opc, MOpc; |
5622 | unsigned LoReg, HiReg; |
5623 | bool IsSigned = Opcode == ISD::SMUL_LOHI; |
5624 | bool UseMULX = !IsSigned && Subtarget->hasBMI2(); |
5625 | bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); |
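// With BMI2 we can use MULX, which takes one source implicitly in EDX/RDX
// and writes both halves of the product to explicit destinations without
// clobbering EFLAGS. The Hrr/Hrm variants are selected when only the high
// half of the result is live.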
5626 | switch (NVT.SimpleTy) { |
5627 | default: llvm_unreachable("Unsupported VT!" ); |
5628 | case MVT::i32: |
5629 | Opc = UseMULXHi ? X86::MULX32Hrr |
5630 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr) |
5631 | : IsSigned ? X86::IMUL32r |
5632 | : X86::MUL32r; |
5633 | MOpc = UseMULXHi ? X86::MULX32Hrm |
5634 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm) |
5635 | : IsSigned ? X86::IMUL32m |
5636 | : X86::MUL32m; |
5637 | LoReg = UseMULX ? X86::EDX : X86::EAX; |
5638 | HiReg = X86::EDX; |
5639 | break; |
5640 | case MVT::i64: |
5641 | Opc = UseMULXHi ? X86::MULX64Hrr |
5642 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr) |
5643 | : IsSigned ? X86::IMUL64r |
5644 | : X86::MUL64r; |
5645 | MOpc = UseMULXHi ? X86::MULX64Hrm |
5646 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm) |
5647 | : IsSigned ? X86::IMUL64m |
5648 | : X86::MUL64m; |
5649 | LoReg = UseMULX ? X86::RDX : X86::RAX; |
5650 | HiReg = X86::RDX; |
5651 | break; |
5652 | } |
5653 | |
5654 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5655 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5656 | // Multiply is commutative. |
5657 | if (!foldedLoad) { |
5658 | foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5659 | if (foldedLoad) |
5660 | std::swap(a&: N0, b&: N1); |
5661 | } |
5662 | |
5663 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5664 | N: N0, Glue: SDValue()).getValue(R: 1); |
5665 | SDValue ResHi, ResLo; |
5666 | if (foldedLoad) { |
5667 | SDValue Chain; |
5668 | MachineSDNode *CNode = nullptr; |
5669 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5670 | InGlue }; |
5671 | if (UseMULXHi) { |
5672 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
5673 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5674 | ResHi = SDValue(CNode, 0); |
5675 | Chain = SDValue(CNode, 1); |
5676 | } else if (UseMULX) { |
5677 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other); |
5678 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5679 | ResHi = SDValue(CNode, 0); |
5680 | ResLo = SDValue(CNode, 1); |
5681 | Chain = SDValue(CNode, 2); |
5682 | } else { |
5683 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
5684 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5685 | Chain = SDValue(CNode, 0); |
5686 | InGlue = SDValue(CNode, 1); |
5687 | } |
5688 | |
5689 | // Update the chain. |
5690 | ReplaceUses(F: N1.getValue(R: 1), T: Chain); |
5691 | // Record the mem-refs |
5692 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5693 | } else { |
5694 | SDValue Ops[] = { N1, InGlue }; |
5695 | if (UseMULXHi) { |
5696 | SDVTList VTs = CurDAG->getVTList(VT: NVT); |
5697 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5698 | ResHi = SDValue(CNode, 0); |
5699 | } else if (UseMULX) { |
5700 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT); |
5701 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5702 | ResHi = SDValue(CNode, 0); |
5703 | ResLo = SDValue(CNode, 1); |
5704 | } else { |
5705 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue); |
5706 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5707 | InGlue = SDValue(CNode, 0); |
5708 | } |
5709 | } |
5710 | |
5711 | // Copy the low half of the result, if it is needed. |
5712 | if (!SDValue(Node, 0).use_empty()) { |
5713 | if (!ResLo) { |
5714 | assert(LoReg && "Register for low half is not defined!" ); |
5715 | ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5716 | VT: NVT, Glue: InGlue); |
5717 | InGlue = ResLo.getValue(R: 2); |
5718 | } |
5719 | ReplaceUses(F: SDValue(Node, 0), T: ResLo); |
5720 | LLVM_DEBUG(dbgs() << "=> " ; ResLo.getNode()->dump(CurDAG); |
5721 | dbgs() << '\n'); |
5722 | } |
5723 | // Copy the high half of the result, if it is needed. |
5724 | if (!SDValue(Node, 1).use_empty()) { |
5725 | if (!ResHi) { |
5726 | assert(HiReg && "Register for high half is not defined!" ); |
5727 | ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg, |
5728 | VT: NVT, Glue: InGlue); |
5729 | InGlue = ResHi.getValue(R: 2); |
5730 | } |
5731 | ReplaceUses(F: SDValue(Node, 1), T: ResHi); |
5732 | LLVM_DEBUG(dbgs() << "=> " ; ResHi.getNode()->dump(CurDAG); |
5733 | dbgs() << '\n'); |
5734 | } |
5735 | |
5736 | CurDAG->RemoveDeadNode(N: Node); |
5737 | return; |
5738 | } |
5739 | |
5740 | case ISD::SDIVREM: |
5741 | case ISD::UDIVREM: { |
5742 | SDValue N0 = Node->getOperand(Num: 0); |
5743 | SDValue N1 = Node->getOperand(Num: 1); |
5744 | |
5745 | unsigned ROpc, MOpc; |
5746 | bool isSigned = Opcode == ISD::SDIVREM; |
5747 | if (!isSigned) { |
5748 | switch (NVT.SimpleTy) { |
5749 | default: llvm_unreachable("Unsupported VT!" ); |
5750 | case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; |
5751 | case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; |
5752 | case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; |
5753 | case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; |
5754 | } |
5755 | } else { |
5756 | switch (NVT.SimpleTy) { |
5757 | default: llvm_unreachable("Unsupported VT!" ); |
5758 | case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; |
5759 | case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; |
5760 | case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; |
5761 | case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; |
5762 | } |
5763 | } |
5764 | |
5765 | unsigned LoReg, HiReg, ClrReg; |
5766 | unsigned SExtOpcode; |
5767 | switch (NVT.SimpleTy) { |
5768 | default: llvm_unreachable("Unsupported VT!" ); |
5769 | case MVT::i8: |
5770 | LoReg = X86::AL; ClrReg = HiReg = X86::AH; |
5771 | SExtOpcode = 0; // Not used. |
5772 | break; |
5773 | case MVT::i16: |
5774 | LoReg = X86::AX; HiReg = X86::DX; |
5775 | ClrReg = X86::DX; |
5776 | SExtOpcode = X86::CWD; |
5777 | break; |
5778 | case MVT::i32: |
5779 | LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; |
5780 | SExtOpcode = X86::CDQ; |
5781 | break; |
5782 | case MVT::i64: |
5783 | LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; |
5784 | SExtOpcode = X86::CQO; |
5785 | break; |
5786 | } |
5787 | |
5788 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5789 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5790 | bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0); |
5791 | |
5792 | SDValue InGlue; |
5793 | if (NVT == MVT::i8) { |
// Special case for div8: widen the dividend into AX with a zero extension
// (or a sign extension for signed divides) so the upper 8 bits (AH) are set
// up correctly.
5796 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; |
5797 | MachineSDNode *Move; |
5798 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5799 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5800 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 |
5801 | : X86::MOVZX16rm8; |
5802 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops); |
5803 | Chain = SDValue(Move, 1); |
5804 | ReplaceUses(F: N0.getValue(R: 1), T: Chain); |
5805 | // Record the mem-refs |
5806 | CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5807 | } else { |
5808 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 |
5809 | : X86::MOVZX16rr8; |
5810 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0); |
5811 | Chain = CurDAG->getEntryNode(); |
5812 | } |
5813 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0), |
5814 | Glue: SDValue()); |
5815 | InGlue = Chain.getValue(R: 1); |
5816 | } else { |
5817 | InGlue = |
5818 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, |
5819 | Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1); |
5820 | if (isSigned && !signBitIsZero) { |
5821 | // Sign extend the low part into the high part. |
5822 | InGlue = |
5823 | SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0); |
5824 | } else { |
5825 | // Zero out the high part, effectively zero extending the input. |
5826 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
5827 | SDValue ClrNode = SDValue( |
5828 | CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: std::nullopt), 0); |
5829 | switch (NVT.SimpleTy) { |
5830 | case MVT::i16: |
5831 | ClrNode = |
5832 | SDValue(CurDAG->getMachineNode( |
5833 | Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode, |
5834 | Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl, |
5835 | VT: MVT::i32)), |
5836 | 0); |
5837 | break; |
5838 | case MVT::i32: |
5839 | break; |
5840 | case MVT::i64: |
5841 | ClrNode = |
5842 | SDValue(CurDAG->getMachineNode( |
5843 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
5844 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode, |
5845 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, |
5846 | VT: MVT::i32)), |
5847 | 0); |
5848 | break; |
5849 | default: |
5850 | llvm_unreachable("Unexpected division source" ); |
5851 | } |
5852 | |
5853 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg, |
5854 | N: ClrNode, Glue: InGlue).getValue(R: 1); |
5855 | } |
5856 | } |
5857 | |
5858 | if (foldedLoad) { |
5859 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5860 | InGlue }; |
5861 | MachineSDNode *CNode = |
5862 | CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops); |
5863 | InGlue = SDValue(CNode, 1); |
5864 | // Update the chain. |
5865 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0)); |
5866 | // Record the mem-refs |
5867 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5868 | } else { |
5869 | InGlue = |
5870 | SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0); |
5871 | } |
5872 | |
5873 | // Prevent use of AH in a REX instruction by explicitly copying it to |
5874 | // an ABCD_L register. |
5875 | // |
5876 | // The current assumption of the register allocator is that isel |
5877 | // won't generate explicit references to the GR8_ABCD_H registers. If |
5878 | // the allocator and/or the backend get enhanced to be more robust in |
5879 | // that regard, this can be, and should be, removed. |
5880 | if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { |
5881 | SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8); |
5882 | unsigned AHExtOpcode = |
5883 | isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; |
5884 | |
5885 | SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32, |
5886 | VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue); |
5887 | SDValue Result(RNode, 0); |
5888 | InGlue = SDValue(RNode, 1); |
5889 | |
5890 | Result = |
5891 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result); |
5892 | |
5893 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
5894 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
5895 | dbgs() << '\n'); |
5896 | } |
5897 | // Copy the division (low) result, if it is needed. |
5898 | if (!SDValue(Node, 0).use_empty()) { |
5899 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
5900 | Reg: LoReg, VT: NVT, Glue: InGlue); |
5901 | InGlue = Result.getValue(R: 2); |
5902 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
5903 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
5904 | dbgs() << '\n'); |
5905 | } |
5906 | // Copy the remainder (high) result, if it is needed. |
5907 | if (!SDValue(Node, 1).use_empty()) { |
5908 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
5909 | Reg: HiReg, VT: NVT, Glue: InGlue); |
5910 | InGlue = Result.getValue(R: 2); |
5911 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
5912 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
5913 | dbgs() << '\n'); |
5914 | } |
5915 | CurDAG->RemoveDeadNode(N: Node); |
5916 | return; |
5917 | } |
5918 | |
5919 | case X86ISD::FCMP: |
5920 | case X86ISD::STRICT_FCMP: |
5921 | case X86ISD::STRICT_FCMPS: { |
5922 | bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || |
5923 | Node->getOpcode() == X86ISD::STRICT_FCMPS; |
5924 | SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0); |
5925 | SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1); |
5926 | |
5927 | // Save the original VT of the compare. |
5928 | MVT CmpVT = N0.getSimpleValueType(); |
5929 | |
5930 | // Floating point needs special handling if we don't have FCOMI. |
5931 | if (Subtarget->canUseCMOV()) |
5932 | break; |
5933 | |
5934 | bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; |
5935 | |
5936 | unsigned Opc; |
5937 | switch (CmpVT.SimpleTy) { |
5938 | default: llvm_unreachable("Unexpected type!" ); |
5939 | case MVT::f32: |
5940 | Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; |
5941 | break; |
5942 | case MVT::f64: |
5943 | Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; |
5944 | break; |
5945 | case MVT::f80: |
5946 | Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; |
5947 | break; |
5948 | } |
5949 | |
5950 | SDValue Chain = |
5951 | IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode(); |
5952 | SDValue Glue; |
5953 | if (IsStrictCmp) { |
5954 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
5955 | Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0); |
5956 | Glue = Chain.getValue(R: 1); |
5957 | } else { |
5958 | Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0); |
5959 | } |
5960 | |
5961 | // Move FPSW to AX. |
5962 | SDValue FNSTSW = |
5963 | SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0); |
5964 | |
5965 | // Extract upper 8-bits of AX. |
SDValue Extract =
5967 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW); |
5968 | |
5969 | // Move AH into flags. |
5970 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
5971 | assert(Subtarget->canUseLAHFSAHF() && |
5972 | "Target doesn't support SAHF or FCOMI?" ); |
5973 | SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue()); |
5974 | Chain = AH; |
5975 | SDValue SAHF = SDValue( |
5976 | CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0); |
5977 | |
5978 | if (IsStrictCmp) |
5979 | ReplaceUses(F: SDValue(Node, 1), T: Chain); |
5980 | |
5981 | ReplaceUses(F: SDValue(Node, 0), T: SAHF); |
5982 | CurDAG->RemoveDeadNode(N: Node); |
5983 | return; |
5984 | } |
5985 | |
5986 | case X86ISD::CMP: { |
5987 | SDValue N0 = Node->getOperand(Num: 0); |
5988 | SDValue N1 = Node->getOperand(Num: 1); |
5989 | |
5990 | // Optimizations for TEST compares. |
5991 | if (!isNullConstant(V: N1)) |
5992 | break; |
5993 | |
5994 | // Save the original VT of the compare. |
5995 | MVT CmpVT = N0.getSimpleValueType(); |
5996 | |
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5998 | // by a test instruction. The test should be removed later by |
5999 | // analyzeCompare if we are using only the zero flag. |
6000 | // TODO: Should we check the users and use the BEXTR flags directly? |
6001 | if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { |
6002 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) { |
6003 | unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr |
6004 | : X86::TEST32rr; |
6005 | SDValue BEXTR = SDValue(NewNode, 0); |
6006 | NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR); |
6007 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6008 | CurDAG->RemoveDeadNode(N: Node); |
6009 | return; |
6010 | } |
6011 | } |
6012 | |
6013 | // We can peek through truncates, but we need to be careful below. |
6014 | if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) |
6015 | N0 = N0.getOperand(i: 0); |
6016 | |
6017 | // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to |
6018 | // use a smaller encoding. |
6019 | // Look past the truncate if CMP is the only use of it. |
6020 | if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && |
6021 | N0.getValueType() != MVT::i8) { |
6022 | auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
6023 | if (!MaskC) |
6024 | break; |
6025 | |
6026 | // We may have looked through a truncate so mask off any bits that |
6027 | // shouldn't be part of the compare. |
6028 | uint64_t Mask = MaskC->getZExtValue(); |
6029 | Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits()); |
6030 | |
6031 | // Check if we can replace AND+IMM{32,64} with a shift. This is possible |
6032 | // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the |
6033 | // zero flag. |
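// For example, a 64-bit mask such as 0x0000FFFF00000000 (which would
// otherwise need a movabsq) can instead be handled with 'shrq $32' followed
// by a 'testw' on the low 16 bits of the shifted value.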
6034 | if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) && |
6035 | onlyUsesZeroFlag(Flags: SDValue(Node, 0))) { |
6036 | unsigned ShiftOpcode = ISD::DELETED_NODE; |
6037 | unsigned ShiftAmt; |
6038 | unsigned SubRegIdx; |
6039 | MVT SubRegVT; |
6040 | unsigned TestOpcode; |
6041 | unsigned LeadingZeros = llvm::countl_zero(Val: Mask); |
6042 | unsigned TrailingZeros = llvm::countr_zero(Val: Mask); |
6043 | |
6044 | // With leading/trailing zeros, the transform is profitable if we can |
6045 | // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without |
6046 | // incurring any extra register moves. |
6047 | bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse(); |
6048 | if (LeadingZeros == 0 && SavesBytes) { |
6049 | // If the mask covers the most significant bit, then we can replace |
6050 | // TEST+AND with a SHR and check eflags. |
6051 | // This emits a redundant TEST which is subsequently eliminated. |
6052 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6053 | ShiftAmt = TrailingZeros; |
6054 | SubRegIdx = 0; |
6055 | TestOpcode = X86::TEST64rr; |
6056 | } else if (TrailingZeros == 0 && SavesBytes) { |
6057 | // If the mask covers the least significant bit, then we can replace |
6058 | // TEST+AND with a SHL and check eflags. |
6059 | // This emits a redundant TEST which is subsequently eliminated. |
6060 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri); |
6061 | ShiftAmt = LeadingZeros; |
6062 | SubRegIdx = 0; |
6063 | TestOpcode = X86::TEST64rr; |
6064 | } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) { |
6065 | // If the shifted mask extends into the high half and is 8/16/32 bits |
6066 | // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. |
6067 | unsigned PopCount = 64 - LeadingZeros - TrailingZeros; |
6068 | if (PopCount == 8) { |
6069 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6070 | ShiftAmt = TrailingZeros; |
6071 | SubRegIdx = X86::sub_8bit; |
6072 | SubRegVT = MVT::i8; |
6073 | TestOpcode = X86::TEST8rr; |
6074 | } else if (PopCount == 16) { |
6075 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6076 | ShiftAmt = TrailingZeros; |
6077 | SubRegIdx = X86::sub_16bit; |
6078 | SubRegVT = MVT::i16; |
6079 | TestOpcode = X86::TEST16rr; |
6080 | } else if (PopCount == 32) { |
6081 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6082 | ShiftAmt = TrailingZeros; |
6083 | SubRegIdx = X86::sub_32bit; |
6084 | SubRegVT = MVT::i32; |
6085 | TestOpcode = X86::TEST32rr; |
6086 | } |
6087 | } |
6088 | if (ShiftOpcode != ISD::DELETED_NODE) { |
6089 | SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64); |
6090 | SDValue Shift = SDValue( |
6091 | CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32, |
6092 | Op1: N0.getOperand(i: 0), Op2: ShiftC), |
6093 | 0); |
6094 | if (SubRegIdx != 0) { |
6095 | Shift = |
6096 | CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift); |
6097 | } |
6098 | MachineSDNode *Test = |
6099 | CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift); |
6100 | ReplaceNode(F: Node, T: Test); |
6101 | return; |
6102 | } |
6103 | } |
6104 | |
6105 | MVT VT; |
6106 | int SubRegOp; |
6107 | unsigned ROpc, MOpc; |
6108 | |
6109 | // For each of these checks we need to be careful if the sign flag is |
6110 | // being used. It is only safe to use the sign flag in two conditions, |
6111 | // either the sign bit in the shrunken mask is zero or the final test |
6112 | // size is equal to the original compare size. |
6113 | |
6114 | if (isUInt<8>(x: Mask) && |
6115 | (!(Mask & 0x80) || CmpVT == MVT::i8 || |
6116 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6117 | // For example, convert "testl %eax, $8" to "testb %al, $8" |
6118 | VT = MVT::i8; |
6119 | SubRegOp = X86::sub_8bit; |
6120 | ROpc = X86::TEST8ri; |
6121 | MOpc = X86::TEST8mi; |
6122 | } else if (OptForMinSize && isUInt<16>(x: Mask) && |
6123 | (!(Mask & 0x8000) || CmpVT == MVT::i16 || |
6124 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6125 | // For example, "testl %eax, $32776" to "testw %ax, $32776". |
6126 | // NOTE: We only want to form TESTW instructions if optimizing for |
6127 | // min size. Otherwise we only save one byte and possibly get a length |
6128 | // changing prefix penalty in the decoders. |
6129 | VT = MVT::i16; |
6130 | SubRegOp = X86::sub_16bit; |
6131 | ROpc = X86::TEST16ri; |
6132 | MOpc = X86::TEST16mi; |
6133 | } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 && |
6134 | ((!(Mask & 0x80000000) && |
6135 | // Without minsize 16-bit Cmps can get here so we need to |
6136 | // be sure we calculate the correct sign flag if needed. |
6137 | (CmpVT != MVT::i16 || !(Mask & 0x8000))) || |
6138 | CmpVT == MVT::i32 || |
6139 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6140 | // For example, "testq %rax, $268468232" to "testl %eax, $268468232". |
6141 | // NOTE: We only want to run that transform if N0 is 32 or 64 bits. |
// Otherwise, we find ourselves in a position where we have to do
6143 | // promotion. If previous passes did not promote the and, we assume |
6144 | // they had a good reason not to and do not promote here. |
6145 | VT = MVT::i32; |
6146 | SubRegOp = X86::sub_32bit; |
6147 | ROpc = X86::TEST32ri; |
6148 | MOpc = X86::TEST32mi; |
6149 | } else { |
6150 | // No eligible transformation was found. |
6151 | break; |
6152 | } |
6153 | |
6154 | SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT); |
6155 | SDValue Reg = N0.getOperand(i: 0); |
6156 | |
6157 | // Emit a testl or testw. |
6158 | MachineSDNode *NewNode; |
6159 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
6160 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
6161 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) { |
6162 | if (!LoadN->isSimple()) { |
6163 | unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits(); |
6164 | if ((MOpc == X86::TEST8mi && NumVolBits != 8) || |
6165 | (MOpc == X86::TEST16mi && NumVolBits != 16) || |
6166 | (MOpc == X86::TEST32mi && NumVolBits != 32)) |
6167 | break; |
6168 | } |
6169 | } |
6170 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
6171 | Reg.getOperand(i: 0) }; |
6172 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops); |
6173 | // Update the chain. |
6174 | ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1)); |
6175 | // Record the mem-refs |
6176 | CurDAG->setNodeMemRefs(N: NewNode, |
6177 | NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()}); |
6178 | } else { |
6179 | // Extract the subregister if necessary. |
6180 | if (N0.getValueType() != VT) |
6181 | Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg); |
6182 | |
6183 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm); |
6184 | } |
6185 | // Replace CMP with TEST. |
6186 | ReplaceNode(F: Node, T: NewNode); |
6187 | return; |
6188 | } |
6189 | break; |
6190 | } |
6191 | case X86ISD::PCMPISTR: { |
6192 | if (!Subtarget->hasSSE42()) |
6193 | break; |
6194 | |
6195 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6196 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6197 | // We can't fold a load if we are going to make two instructions. |
6198 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6199 | |
6200 | MachineSDNode *CNode; |
6201 | if (NeedMask) { |
6202 | unsigned ROpc = |
6203 | Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri; |
6204 | unsigned MOpc = |
6205 | Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi; |
6206 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node); |
6207 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6208 | } |
6209 | if (NeedIndex || !NeedMask) { |
6210 | unsigned ROpc = |
6211 | Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri; |
6212 | unsigned MOpc = |
6213 | Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi; |
6214 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node); |
6215 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6216 | } |
6217 | |
6218 | // Connect the flag usage to the last instruction created. |
6219 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6220 | CurDAG->RemoveDeadNode(N: Node); |
6221 | return; |
6222 | } |
6223 | case X86ISD::PCMPESTR: { |
6224 | if (!Subtarget->hasSSE42()) |
6225 | break; |
6226 | |
6227 | // Copy the two implicit register inputs. |
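// PCMPESTRI/PCMPESTRM take their explicit string lengths in EAX (length of
// the first source) and EDX (length of the second).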
6228 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX, |
6229 | N: Node->getOperand(Num: 1), |
6230 | Glue: SDValue()).getValue(R: 1); |
6231 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX, |
6232 | N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1); |
6233 | |
6234 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6235 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6236 | // We can't fold a load if we are going to make two instructions. |
6237 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6238 | |
6239 | MachineSDNode *CNode; |
6240 | if (NeedMask) { |
6241 | unsigned ROpc = |
6242 | Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri; |
6243 | unsigned MOpc = |
6244 | Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi; |
6245 | CNode = |
6246 | emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue); |
6247 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6248 | } |
6249 | if (NeedIndex || !NeedMask) { |
6250 | unsigned ROpc = |
6251 | Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri; |
6252 | unsigned MOpc = |
6253 | Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi; |
6254 | CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue); |
6255 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6256 | } |
6257 | // Connect the flag usage to the last instruction created. |
6258 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6259 | CurDAG->RemoveDeadNode(N: Node); |
6260 | return; |
6261 | } |
6262 | |
6263 | case ISD::SETCC: { |
6264 | if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue())) |
6265 | return; |
6266 | |
6267 | break; |
6268 | } |
6269 | |
6270 | case ISD::STORE: |
6271 | if (foldLoadStoreIntoMemOperand(Node)) |
6272 | return; |
6273 | break; |
6274 | |
6275 | case X86ISD::SETCC_CARRY: { |
6276 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6277 | SDValue Result; |
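// SETB_C32r/SETB_C64r are pseudos that expand to 'sbb reg, reg', producing 0
// or all-ones from the carry flag. Without SBB dependency breaking we instead
// go through getSBBZero, which zeroes the input so the sbb doesn't pick up a
// false dependency on the register's previous value.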
6278 | if (Subtarget->hasSBBDepBreaking()) { |
6279 | // We have to do this manually because tblgen will put the eflags copy in |
6280 | // the wrong place if we use an extract_subreg in the pattern. |
6281 | // Copy flags to the EFLAGS register and glue it to next node. |
6282 | SDValue EFLAGS = |
6283 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
6284 | N: Node->getOperand(Num: 1), Glue: SDValue()); |
6285 | |
6286 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
6287 | // 32-bit version. |
6288 | unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; |
6289 | MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
6290 | Result = SDValue( |
6291 | CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)), |
6292 | 0); |
6293 | } else { |
6294 | // The target does not recognize sbb with the same reg operand as a |
6295 | // no-source idiom, so we explicitly zero the input values. |
6296 | Result = getSBBZero(N: Node); |
6297 | } |
6298 | |
6299 | // For less than 32-bits we need to extract from the 32-bit node. |
6300 | if (VT == MVT::i8 || VT == MVT::i16) { |
6301 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6302 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6303 | } |
6304 | |
6305 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6306 | CurDAG->RemoveDeadNode(N: Node); |
6307 | return; |
6308 | } |
6309 | case X86ISD::SBB: { |
6310 | if (isNullConstant(V: Node->getOperand(Num: 0)) && |
6311 | isNullConstant(V: Node->getOperand(Num: 1))) { |
6312 | SDValue Result = getSBBZero(N: Node); |
6313 | |
6314 | // Replace the flag use. |
6315 | ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1)); |
6316 | |
6317 | // Replace the result use. |
6318 | if (!SDValue(Node, 0).use_empty()) { |
6319 | // For less than 32-bits we need to extract from the 32-bit node. |
6320 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6321 | if (VT == MVT::i8 || VT == MVT::i16) { |
6322 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6323 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6324 | } |
6325 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6326 | } |
6327 | |
6328 | CurDAG->RemoveDeadNode(N: Node); |
6329 | return; |
6330 | } |
6331 | break; |
6332 | } |
6333 | case X86ISD::MGATHER: { |
6334 | auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node); |
6335 | SDValue IndexOp = Mgt->getIndex(); |
6336 | SDValue Mask = Mgt->getMask(); |
6337 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6338 | MVT ValueVT = Node->getSimpleValueType(ResNo: 0); |
6339 | MVT MaskVT = Mask.getSimpleValueType(); |
6340 | |
6341 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking in here, based on what a type
// constraint would say, just like table-based isel.
6344 | if (!ValueVT.isVector() || !MaskVT.isVector()) |
6345 | break; |
6346 | |
6347 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6348 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6349 | |
6350 | bool IsFP = ValueSVT.isFloatingPoint(); |
6351 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6352 | |
6353 | unsigned Opc = 0; |
6354 | bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; |
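// AVX-512 gathers take an i1 vector (k-register) mask, while the AVX/AVX2
// forms take a full-width vector mask as an ordinary operand; this also
// changes the operand order used when building the machine node below.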
6355 | if (AVX512Gather) { |
6356 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6357 | Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; |
6358 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6359 | Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; |
6360 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6361 | Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; |
6362 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6363 | Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; |
6364 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6365 | Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; |
6366 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6367 | Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; |
6368 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6369 | Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; |
6370 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6371 | Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; |
6372 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6373 | Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; |
6374 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6375 | Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; |
6376 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6377 | Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; |
6378 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6379 | Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; |
6380 | } else { |
6381 | assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && |
6382 | "Unexpected mask VT!" ); |
6383 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6384 | Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; |
6385 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6386 | Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; |
6387 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6388 | Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; |
6389 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6390 | Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; |
6391 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6392 | Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; |
6393 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6394 | Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; |
6395 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6396 | Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; |
6397 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6398 | Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; |
6399 | } |
6400 | |
6401 | if (!Opc) |
6402 | break; |
6403 | |
6404 | SDValue Base, Scale, Index, Disp, Segment; |
6405 | if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(), |
6406 | Base, Scale, Index, Disp, Segment)) |
6407 | break; |
6408 | |
6409 | SDValue PassThru = Mgt->getPassThru(); |
6410 | SDValue Chain = Mgt->getChain(); |
6411 | // Gather instructions have a mask output not in the ISD node. |
6412 | SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other); |
6413 | |
6414 | MachineSDNode *NewNode; |
6415 | if (AVX512Gather) { |
6416 | SDValue Ops[] = {PassThru, Mask, Base, Scale, |
6417 | Index, Disp, Segment, Chain}; |
6418 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6419 | } else { |
6420 | SDValue Ops[] = {PassThru, Base, Scale, Index, |
6421 | Disp, Segment, Mask, Chain}; |
6422 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6423 | } |
6424 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()}); |
6425 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6426 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2)); |
6427 | CurDAG->RemoveDeadNode(N: Node); |
6428 | return; |
6429 | } |
6430 | case X86ISD::MSCATTER: { |
6431 | auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node); |
6432 | SDValue Value = Sc->getValue(); |
6433 | SDValue IndexOp = Sc->getIndex(); |
6434 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6435 | MVT ValueVT = Value.getSimpleValueType(); |
6436 | |
6437 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking in here, based on what a type
// constraint would say, just like table-based isel.
6440 | if (!ValueVT.isVector()) |
6441 | break; |
6442 | |
6443 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6444 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6445 | |
6446 | bool IsFP = ValueSVT.isFloatingPoint(); |
6447 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6448 | |
6449 | unsigned Opc; |
6450 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6451 | Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; |
6452 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6453 | Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; |
6454 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6455 | Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; |
6456 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6457 | Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; |
6458 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6459 | Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; |
6460 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6461 | Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; |
6462 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6463 | Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; |
6464 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6465 | Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; |
6466 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6467 | Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; |
6468 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6469 | Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; |
6470 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6471 | Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; |
6472 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6473 | Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; |
6474 | else |
6475 | break; |
6476 | |
6477 | SDValue Base, Scale, Index, Disp, Segment; |
6478 | if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(), |
6479 | Base, Scale, Index, Disp, Segment)) |
6480 | break; |
6481 | |
6482 | SDValue Mask = Sc->getMask(); |
6483 | SDValue Chain = Sc->getChain(); |
6484 | // Scatter instructions have a mask output not in the ISD node. |
6485 | SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other); |
6486 | SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; |
6487 | |
6488 | MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6489 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()}); |
6490 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1)); |
6491 | CurDAG->RemoveDeadNode(N: Node); |
6492 | return; |
6493 | } |
6494 | case ISD::PREALLOCATED_SETUP: { |
6495 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6496 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6497 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6498 | SDValue Chain = Node->getOperand(Num: 0); |
6499 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6500 | MachineSDNode *New = CurDAG->getMachineNode( |
6501 | Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain); |
6502 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain |
6503 | CurDAG->RemoveDeadNode(N: Node); |
6504 | return; |
6505 | } |
6506 | case ISD::PREALLOCATED_ARG: { |
6507 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6508 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6509 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6510 | SDValue Chain = Node->getOperand(Num: 0); |
6511 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6512 | SDValue ArgIndex = Node->getOperand(Num: 2); |
6513 | SDValue Ops[3]; |
6514 | Ops[0] = CallIdValue; |
6515 | Ops[1] = ArgIndex; |
6516 | Ops[2] = Chain; |
6517 | MachineSDNode *New = CurDAG->getMachineNode( |
6518 | Opcode: TargetOpcode::PREALLOCATED_ARG, dl, |
6519 | VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()), |
6520 | VT2: MVT::Other), |
6521 | Ops); |
6522 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer |
6523 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain |
6524 | CurDAG->RemoveDeadNode(N: Node); |
6525 | return; |
6526 | } |
6527 | case X86ISD::AESENCWIDE128KL: |
6528 | case X86ISD::AESDECWIDE128KL: |
6529 | case X86ISD::AESENCWIDE256KL: |
6530 | case X86ISD::AESDECWIDE256KL: { |
6531 | if (!Subtarget->hasWIDEKL()) |
6532 | break; |
6533 | |
6534 | unsigned Opcode; |
6535 | switch (Node->getOpcode()) { |
6536 | default: |
6537 | llvm_unreachable("Unexpected opcode!" ); |
6538 | case X86ISD::AESENCWIDE128KL: |
6539 | Opcode = X86::AESENCWIDE128KL; |
6540 | break; |
6541 | case X86ISD::AESDECWIDE128KL: |
6542 | Opcode = X86::AESDECWIDE128KL; |
6543 | break; |
6544 | case X86ISD::AESENCWIDE256KL: |
6545 | Opcode = X86::AESENCWIDE256KL; |
6546 | break; |
6547 | case X86ISD::AESDECWIDE256KL: |
6548 | Opcode = X86::AESDECWIDE256KL; |
6549 | break; |
6550 | } |
6551 | |
6552 | SDValue Chain = Node->getOperand(Num: 0); |
6553 | SDValue Addr = Node->getOperand(Num: 1); |
6554 | |
6555 | SDValue Base, Scale, Index, Disp, Segment; |
6556 | if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment)) |
6557 | break; |
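// The wide Key Locker instructions implicitly operate on XMM0..XMM7, so the
// eight data operands are copied into those registers with glued CopyToReg
// nodes before the machine node is built.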
6558 | |
6559 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 2), |
6560 | Glue: SDValue()); |
6561 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 3), |
6562 | Glue: Chain.getValue(R: 1)); |
6563 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM2, N: Node->getOperand(Num: 4), |
6564 | Glue: Chain.getValue(R: 1)); |
6565 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM3, N: Node->getOperand(Num: 5), |
6566 | Glue: Chain.getValue(R: 1)); |
6567 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM4, N: Node->getOperand(Num: 6), |
6568 | Glue: Chain.getValue(R: 1)); |
6569 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM5, N: Node->getOperand(Num: 7), |
6570 | Glue: Chain.getValue(R: 1)); |
6571 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM6, N: Node->getOperand(Num: 8), |
6572 | Glue: Chain.getValue(R: 1)); |
6573 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM7, N: Node->getOperand(Num: 9), |
6574 | Glue: Chain.getValue(R: 1)); |
6575 | |
6576 | MachineSDNode *Res = CurDAG->getMachineNode( |
6577 | Opcode, dl, VTs: Node->getVTList(), |
6578 | Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)}); |
6579 | CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand()); |
6580 | ReplaceNode(F: Node, T: Res); |
6581 | return; |
6582 | } |
6583 | } |
6584 | |
6585 | SelectCode(N: Node); |
6586 | } |
6587 | |
6588 | bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand( |
6589 | const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, |
6590 | std::vector<SDValue> &OutOps) { |
6591 | SDValue Op0, Op1, Op2, Op3, Op4; |
6592 | switch (ConstraintID) { |
6593 | default: |
6594 | llvm_unreachable("Unexpected asm memory constraint" ); |
6595 | case InlineAsm::ConstraintCode::o: // offsetable ?? |
6596 | case InlineAsm::ConstraintCode::v: // not offsetable ?? |
6597 | case InlineAsm::ConstraintCode::m: // memory |
6598 | case InlineAsm::ConstraintCode::X: |
6599 | case InlineAsm::ConstraintCode::p: // address |
6600 | if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4)) |
6601 | return true; |
6602 | break; |
6603 | } |
6604 | |
6605 | OutOps.push_back(x: Op0); |
6606 | OutOps.push_back(x: Op1); |
6607 | OutOps.push_back(x: Op2); |
6608 | OutOps.push_back(x: Op3); |
6609 | OutOps.push_back(x: Op4); |
6610 | return false; |
6611 | } |
6612 | |
6613 | X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM) |
6614 | : SelectionDAGISelPass( |
6615 | std::make_unique<X86DAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {} |
6616 | |
6617 | /// This pass converts a legalized DAG into a X86-specific DAG, |
6618 | /// ready for instruction scheduling. |
6619 | FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, |
6620 | CodeGenOptLevel OptLevel) { |
6621 | return new X86DAGToDAGISelLegacy(TM, OptLevel); |
6622 | } |
6623 | |