1 | //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines a DAG pattern matching instruction selector for X86, |
10 | // converting from a legalized dag to a X86 dag. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "X86ISelDAGToDAG.h" |
15 | #include "X86.h" |
16 | #include "X86MachineFunctionInfo.h" |
17 | #include "X86Subtarget.h" |
18 | #include "X86TargetMachine.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/CodeGen/MachineModuleInfo.h" |
21 | #include "llvm/CodeGen/SelectionDAGISel.h" |
22 | #include "llvm/Config/llvm-config.h" |
23 | #include "llvm/IR/ConstantRange.h" |
24 | #include "llvm/IR/Function.h" |
25 | #include "llvm/IR/Instructions.h" |
26 | #include "llvm/IR/Intrinsics.h" |
27 | #include "llvm/IR/IntrinsicsX86.h" |
28 | #include "llvm/IR/Module.h" |
29 | #include "llvm/IR/Type.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include "llvm/Support/ErrorHandling.h" |
32 | #include "llvm/Support/KnownBits.h" |
33 | #include "llvm/Support/MathExtras.h" |
34 | #include <cstdint> |
35 | |
36 | using namespace llvm; |
37 | |
38 | #define DEBUG_TYPE "x86-isel" |
39 | #define PASS_NAME "X86 DAG->DAG Instruction Selection" |
40 | |
41 | STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor" ); |
42 | |
43 | static cl::opt<bool> AndImmShrink("x86-and-imm-shrink" , cl::init(Val: true), |
44 | cl::desc("Enable setting constant bits to reduce size of mask immediates" ), |
45 | cl::Hidden); |
46 | |
47 | static cl::opt<bool> EnablePromoteAnyextLoad( |
48 | "x86-promote-anyext-load" , cl::init(Val: true), |
49 | cl::desc("Enable promoting aligned anyext load to wider load" ), cl::Hidden); |
50 | |
51 | extern cl::opt<bool> IndirectBranchTracking; |
52 | |
53 | //===----------------------------------------------------------------------===// |
54 | // Pattern Matcher Implementation |
55 | //===----------------------------------------------------------------------===// |
56 | |
57 | namespace { |
/// This corresponds to X86AddressMode, but uses SDValues instead of register
59 | /// numbers for the leaves of the matched tree. |
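/// The matched form is Base_Reg/Base_FrameIndex + Scale*IndexReg + Disp,
/// optionally with a symbolic displacement (GV/CP/ES/JT/BlockAddr) and a
/// segment. For example, the memory operand of "movl 4(%rdi,%rsi,8), %eax"
/// corresponds to Base_Reg=%rdi, Scale=8, IndexReg=%rsi, Disp=4.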
60 | struct X86ISelAddressMode { |
61 | enum { |
62 | RegBase, |
63 | FrameIndexBase |
64 | } BaseType = RegBase; |
65 | |
66 | // This is really a union, discriminated by BaseType! |
67 | SDValue Base_Reg; |
68 | int Base_FrameIndex = 0; |
69 | |
70 | unsigned Scale = 1; |
71 | SDValue IndexReg; |
72 | int32_t Disp = 0; |
73 | SDValue Segment; |
74 | const GlobalValue *GV = nullptr; |
75 | const Constant *CP = nullptr; |
76 | const BlockAddress *BlockAddr = nullptr; |
77 | const char *ES = nullptr; |
78 | MCSymbol *MCSym = nullptr; |
79 | int JT = -1; |
80 | Align Alignment; // CP alignment. |
81 | unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* |
82 | bool NegateIndex = false; |
83 | |
84 | X86ISelAddressMode() = default; |
85 | |
86 | bool hasSymbolicDisplacement() const { |
87 | return GV != nullptr || CP != nullptr || ES != nullptr || |
88 | MCSym != nullptr || JT != -1 || BlockAddr != nullptr; |
89 | } |
90 | |
91 | bool hasBaseOrIndexReg() const { |
92 | return BaseType == FrameIndexBase || |
93 | IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; |
94 | } |
95 | |
96 | /// Return true if this addressing mode is already RIP-relative. |
97 | bool isRIPRelative() const { |
98 | if (BaseType != RegBase) return false; |
99 | if (RegisterSDNode *RegNode = |
100 | dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode())) |
101 | return RegNode->getReg() == X86::RIP; |
102 | return false; |
103 | } |
104 | |
105 | void setBaseReg(SDValue Reg) { |
106 | BaseType = RegBase; |
107 | Base_Reg = Reg; |
108 | } |
109 | |
110 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
111 | void dump(SelectionDAG *DAG = nullptr) { |
112 | dbgs() << "X86ISelAddressMode " << this << '\n'; |
113 | dbgs() << "Base_Reg " ; |
114 | if (Base_Reg.getNode()) |
115 | Base_Reg.getNode()->dump(DAG); |
116 | else |
117 | dbgs() << "nul\n" ; |
118 | if (BaseType == FrameIndexBase) |
119 | dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; |
120 | dbgs() << " Scale " << Scale << '\n' |
121 | << "IndexReg " ; |
122 | if (NegateIndex) |
123 | dbgs() << "negate " ; |
124 | if (IndexReg.getNode()) |
125 | IndexReg.getNode()->dump(DAG); |
126 | else |
127 | dbgs() << "nul\n" ; |
128 | dbgs() << " Disp " << Disp << '\n' |
129 | << "GV " ; |
130 | if (GV) |
131 | GV->dump(); |
132 | else |
133 | dbgs() << "nul" ; |
134 | dbgs() << " CP " ; |
135 | if (CP) |
136 | CP->dump(); |
137 | else |
138 | dbgs() << "nul" ; |
139 | dbgs() << '\n' |
140 | << "ES " ; |
141 | if (ES) |
142 | dbgs() << ES; |
143 | else |
144 | dbgs() << "nul" ; |
145 | dbgs() << " MCSym " ; |
146 | if (MCSym) |
147 | dbgs() << MCSym; |
148 | else |
149 | dbgs() << "nul" ; |
150 | dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; |
151 | } |
152 | #endif |
153 | }; |
154 | } |
155 | |
156 | namespace { |
157 | //===--------------------------------------------------------------------===// |
158 | /// ISel - X86-specific code to select X86 machine instructions for |
159 | /// SelectionDAG operations. |
160 | /// |
161 | class X86DAGToDAGISel final : public SelectionDAGISel { |
162 | /// Keep a pointer to the X86Subtarget around so that we can |
163 | /// make the right decision when generating code for different targets. |
164 | const X86Subtarget *Subtarget; |
165 | |
166 | /// If true, selector should try to optimize for minimum code size. |
167 | bool OptForMinSize; |
168 | |
169 | /// Disable direct TLS access through segment registers. |
170 | bool IndirectTlsSegRefs; |
171 | |
172 | public: |
173 | X86DAGToDAGISel() = delete; |
174 | |
175 | explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel) |
176 | : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), |
177 | OptForMinSize(false), IndirectTlsSegRefs(false) {} |
178 | |
179 | bool runOnMachineFunction(MachineFunction &MF) override { |
180 | // Reset the subtarget each time through. |
181 | Subtarget = &MF.getSubtarget<X86Subtarget>(); |
182 | IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( |
183 | Kind: "indirect-tls-seg-refs" ); |
184 | |
185 | // OptFor[Min]Size are used in pattern predicates that isel is matching. |
186 | OptForMinSize = MF.getFunction().hasMinSize(); |
187 | return SelectionDAGISel::runOnMachineFunction(mf&: MF); |
188 | } |
189 | |
190 | void emitFunctionEntryCode() override; |
191 | |
192 | bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; |
193 | |
194 | void PreprocessISelDAG() override; |
195 | void PostprocessISelDAG() override; |
196 | |
197 | // Include the pieces autogenerated from the target description. |
198 | #include "X86GenDAGISel.inc" |
199 | |
200 | private: |
201 | void Select(SDNode *N) override; |
202 | |
203 | bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); |
204 | bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
205 | bool AllowSegmentRegForX32 = false); |
206 | bool matchWrapper(SDValue N, X86ISelAddressMode &AM); |
207 | bool matchAddress(SDValue N, X86ISelAddressMode &AM); |
208 | bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); |
209 | bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); |
210 | SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM, |
211 | unsigned Depth); |
212 | bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
213 | unsigned Depth); |
214 | bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
215 | unsigned Depth); |
216 | bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); |
217 | bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
218 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
219 | SDValue &Segment); |
220 | bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, |
221 | SDValue ScaleOp, SDValue &Base, SDValue &Scale, |
222 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
223 | bool selectMOV64Imm32(SDValue N, SDValue &Imm); |
224 | bool selectLEAAddr(SDValue N, SDValue &Base, |
225 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
226 | SDValue &Segment); |
227 | bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale, |
228 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
229 | bool selectTLSADDRAddr(SDValue N, SDValue &Base, |
230 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
231 | SDValue &Segment); |
232 | bool selectRelocImm(SDValue N, SDValue &Op); |
233 | |
234 | bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
235 | SDValue &Base, SDValue &Scale, |
236 | SDValue &Index, SDValue &Disp, |
237 | SDValue &Segment); |
238 | |
239 | // Convenience method where P is also root. |
240 | bool tryFoldLoad(SDNode *P, SDValue N, |
241 | SDValue &Base, SDValue &Scale, |
242 | SDValue &Index, SDValue &Disp, |
243 | SDValue &Segment) { |
244 | return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment); |
245 | } |
246 | |
247 | bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
248 | SDValue &Base, SDValue &Scale, |
249 | SDValue &Index, SDValue &Disp, |
250 | SDValue &Segment); |
251 | |
252 | bool isProfitableToFormMaskedOp(SDNode *N) const; |
253 | |
254 | /// Implement addressing mode selection for inline asm expressions. |
255 | bool SelectInlineAsmMemoryOperand(const SDValue &Op, |
256 | InlineAsm::ConstraintCode ConstraintID, |
257 | std::vector<SDValue> &OutOps) override; |
258 | |
259 | void emitSpecialCodeForMain(); |
260 | |
261 | inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, |
262 | MVT VT, SDValue &Base, SDValue &Scale, |
263 | SDValue &Index, SDValue &Disp, |
264 | SDValue &Segment) { |
265 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
266 | Base = CurDAG->getTargetFrameIndex( |
267 | FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout())); |
268 | else if (AM.Base_Reg.getNode()) |
269 | Base = AM.Base_Reg; |
270 | else |
271 | Base = CurDAG->getRegister(Reg: 0, VT); |
272 | |
273 | Scale = getI8Imm(Imm: AM.Scale, DL); |
274 | |
275 | #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC) |
276 | // Negate the index if needed. |
277 | if (AM.NegateIndex) { |
278 | unsigned NegOpc; |
279 | switch (VT.SimpleTy) { |
280 | default: |
281 | llvm_unreachable("Unsupported VT!" ); |
282 | case MVT::i64: |
283 | NegOpc = GET_ND_IF_ENABLED(X86::NEG64r); |
284 | break; |
285 | case MVT::i32: |
286 | NegOpc = GET_ND_IF_ENABLED(X86::NEG32r); |
287 | break; |
288 | case MVT::i16: |
289 | NegOpc = GET_ND_IF_ENABLED(X86::NEG16r); |
290 | break; |
291 | case MVT::i8: |
292 | NegOpc = GET_ND_IF_ENABLED(X86::NEG8r); |
293 | break; |
294 | } |
295 | SDValue Neg = SDValue(CurDAG->getMachineNode(Opcode: NegOpc, dl: DL, VT1: VT, VT2: MVT::i32, |
296 | Ops: AM.IndexReg), 0); |
297 | AM.IndexReg = Neg; |
298 | } |
299 | |
300 | if (AM.IndexReg.getNode()) |
301 | Index = AM.IndexReg; |
302 | else |
303 | Index = CurDAG->getRegister(Reg: 0, VT); |
304 | |
305 | // These are 32-bit even in 64-bit mode since RIP-relative offset |
306 | // is 32-bit. |
307 | if (AM.GV) |
308 | Disp = CurDAG->getTargetGlobalAddress(GV: AM.GV, DL: SDLoc(), |
309 | VT: MVT::i32, offset: AM.Disp, |
310 | TargetFlags: AM.SymbolFlags); |
311 | else if (AM.CP) |
312 | Disp = CurDAG->getTargetConstantPool(C: AM.CP, VT: MVT::i32, Align: AM.Alignment, |
313 | Offset: AM.Disp, TargetFlags: AM.SymbolFlags); |
314 | else if (AM.ES) { |
315 | assert(!AM.Disp && "Non-zero displacement is ignored with ES." ); |
316 | Disp = CurDAG->getTargetExternalSymbol(Sym: AM.ES, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
317 | } else if (AM.MCSym) { |
318 | assert(!AM.Disp && "Non-zero displacement is ignored with MCSym." ); |
319 | assert(AM.SymbolFlags == 0 && "oo" ); |
320 | Disp = CurDAG->getMCSymbol(Sym: AM.MCSym, VT: MVT::i32); |
321 | } else if (AM.JT != -1) { |
322 | assert(!AM.Disp && "Non-zero displacement is ignored with JT." ); |
323 | Disp = CurDAG->getTargetJumpTable(JTI: AM.JT, VT: MVT::i32, TargetFlags: AM.SymbolFlags); |
324 | } else if (AM.BlockAddr) |
325 | Disp = CurDAG->getTargetBlockAddress(BA: AM.BlockAddr, VT: MVT::i32, Offset: AM.Disp, |
326 | TargetFlags: AM.SymbolFlags); |
327 | else |
328 | Disp = CurDAG->getSignedTargetConstant(Val: AM.Disp, DL, VT: MVT::i32); |
329 | |
330 | if (AM.Segment.getNode()) |
331 | Segment = AM.Segment; |
332 | else |
333 | Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
334 | } |
335 | |
  // Utility function to determine whether this is an AMX SDNode right after
  // lowering but before instruction selection.
338 | bool isAMXSDNode(SDNode *N) const { |
339 | // Check if N is AMX SDNode: |
340 | // 1. check specific opcode since these carry MVT::Untyped instead of |
341 | // x86amx_type; |
342 | // 2. check result type; |
343 | // 3. check operand type; |
344 | switch (N->getOpcode()) { |
345 | default: |
346 | break; |
347 | case X86::PT2RPNTLVWZ0V: |
348 | case X86::PT2RPNTLVWZ0T1V: |
349 | case X86::PT2RPNTLVWZ1V: |
350 | case X86::PT2RPNTLVWZ1T1V: |
351 | case X86::PT2RPNTLVWZ0RSV: |
352 | case X86::PT2RPNTLVWZ0RST1V: |
353 | case X86::PT2RPNTLVWZ1RSV: |
354 | case X86::PT2RPNTLVWZ1RST1V: |
355 | return true; |
356 | } |
357 | for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { |
358 | if (N->getValueType(ResNo: Idx) == MVT::x86amx) |
359 | return true; |
360 | } |
361 | for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) { |
362 | SDValue Op = N->getOperand(Num: Idx); |
363 | if (Op.getValueType() == MVT::x86amx) |
364 | return true; |
365 | } |
366 | return false; |
367 | } |
368 | |
  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size.
371 | // At a high level, we'd like to avoid such instructions when |
372 | // we have similar constants used within the same basic block |
373 | // that can be kept in a register. |
374 | // |
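  // For example, if several instructions in a block use the same 32-bit
  // constant, materializing it once with "movl $imm, %reg" and reusing the
  // register is smaller than repeating the 4-byte immediate in each encoding.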
375 | bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { |
376 | uint32_t UseCount = 0; |
377 | |
378 | // Do not want to hoist if we're not optimizing for size. |
379 | // TODO: We'd like to remove this restriction. |
380 | // See the comment in X86InstrInfo.td for more info. |
381 | if (!CurDAG->shouldOptForSize()) |
382 | return false; |
383 | |
384 | // Walk all the users of the immediate. |
385 | for (const SDNode *User : N->users()) { |
386 | if (UseCount >= 2) |
387 | break; |
388 | |
389 | // This user is already selected. Count it as a legitimate use and |
390 | // move on. |
391 | if (User->isMachineOpcode()) { |
392 | UseCount++; |
393 | continue; |
394 | } |
395 | |
396 | // We want to count stores of immediates as real uses. |
397 | if (User->getOpcode() == ISD::STORE && |
398 | User->getOperand(Num: 1).getNode() == N) { |
399 | UseCount++; |
400 | continue; |
401 | } |
402 | |
403 | // We don't currently match users that have > 2 operands (except |
      // for stores, which are handled above).
      // Those instructions won't match in ISel, for now, and would
406 | // be counted incorrectly. |
407 | // This may change in the future as we add additional instruction |
408 | // types. |
409 | if (User->getNumOperands() != 2) |
410 | continue; |
411 | |
412 | // If this is a sign-extended 8-bit integer immediate used in an ALU |
413 | // instruction, there is probably an opcode encoding to save space. |
414 | auto *C = dyn_cast<ConstantSDNode>(Val: N); |
415 | if (C && isInt<8>(x: C->getSExtValue())) |
416 | continue; |
417 | |
418 | // Immediates that are used for offsets as part of stack |
419 | // manipulation should be left alone. These are typically |
420 | // used to indicate SP offsets for argument passing and |
421 | // will get pulled into stores/pushes (implicitly). |
422 | if (User->getOpcode() == X86ISD::ADD || |
423 | User->getOpcode() == ISD::ADD || |
424 | User->getOpcode() == X86ISD::SUB || |
425 | User->getOpcode() == ISD::SUB) { |
426 | |
427 | // Find the other operand of the add/sub. |
428 | SDValue OtherOp = User->getOperand(Num: 0); |
429 | if (OtherOp.getNode() == N) |
430 | OtherOp = User->getOperand(Num: 1); |
431 | |
432 | // Don't count if the other operand is SP. |
433 | RegisterSDNode *RegNode; |
434 | if (OtherOp->getOpcode() == ISD::CopyFromReg && |
435 | (RegNode = dyn_cast_or_null<RegisterSDNode>( |
436 | Val: OtherOp->getOperand(Num: 1).getNode()))) |
437 | if ((RegNode->getReg() == X86::ESP) || |
438 | (RegNode->getReg() == X86::RSP)) |
439 | continue; |
440 | } |
441 | |
442 | // ... otherwise, count this and move on. |
443 | UseCount++; |
444 | } |
445 | |
446 | // If we have more than 1 use, then recommend for hoisting. |
447 | return (UseCount > 1); |
448 | } |
449 | |
450 | /// Return a target constant with the specified value of type i8. |
451 | inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { |
452 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
453 | } |
454 | |
455 | /// Return a target constant with the specified value, of type i32. |
456 | inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { |
457 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i32); |
458 | } |
459 | |
460 | /// Return a target constant with the specified value, of type i64. |
461 | inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { |
462 | return CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i64); |
463 | } |
464 | |
  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
467 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
468 | uint64_t Index = N->getConstantOperandVal(Num: 1); |
469 | MVT VecVT = N->getOperand(Num: 0).getSimpleValueType(); |
470 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
471 | } |
472 | |
473 | SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, |
474 | const SDLoc &DL) { |
475 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
476 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
477 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
478 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
479 | } |
480 | |
481 | SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth, |
482 | const SDLoc &DL) { |
483 | assert(VecWidth == 128 && "Unexpected vector width" ); |
484 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
485 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
486 | uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth; |
487 | assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index" ); |
488 | // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub) |
489 | // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub) |
490 | return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL); |
491 | } |
492 | |
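  // Materialize the carry flag as 0/-1 in a GPR: zero a register, copy the
  // incoming flags into EFLAGS, then emit "sbb reg, reg" (0 - 0 - CF).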
493 | SDValue getSBBZero(SDNode *N) { |
494 | SDLoc dl(N); |
495 | MVT VT = N->getSimpleValueType(ResNo: 0); |
496 | |
497 | // Create zero. |
498 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
499 | SDValue Zero = |
500 | SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0); |
501 | if (VT == MVT::i64) { |
502 | Zero = SDValue( |
503 | CurDAG->getMachineNode( |
504 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
505 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: Zero, |
506 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, VT: MVT::i32)), |
507 | 0); |
508 | } |
509 | |
510 | // Copy flags to the EFLAGS register and glue it to next node. |
511 | unsigned Opcode = N->getOpcode(); |
512 | assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && |
513 | "Unexpected opcode for SBB materialization" ); |
514 | unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; |
515 | SDValue EFLAGS = |
516 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
517 | N: N->getOperand(Num: FlagOpIndex), Glue: SDValue()); |
518 | |
519 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
520 | // 32-bit version. |
521 | unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; |
522 | MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
523 | VTs = CurDAG->getVTList(VT1: SBBVT, VT2: MVT::i32); |
524 | return SDValue( |
525 | CurDAG->getMachineNode(Opcode: Opc, dl, VTs, |
526 | Ops: {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}), |
527 | 0); |
528 | } |
529 | |
  // Helper to detect unneeded AND instructions on shift amounts. Called
531 | // from PatFrags in tablegen. |
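  // A shift only consumes the low Width bits of its amount (e.g. Width == 5
  // for 32-bit shifts), so an AND whose mask has at least Width trailing
  // ones (possibly after accounting for known-zero bits) is a no-op here.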
532 | bool isUnneededShiftMask(SDNode *N, unsigned Width) const { |
533 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode" ); |
534 | const APInt &Val = N->getConstantOperandAPInt(Num: 1); |
535 | |
536 | if (Val.countr_one() >= Width) |
537 | return true; |
538 | |
539 | APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero; |
540 | return Mask.countr_one() >= Width; |
541 | } |
542 | |
543 | /// Return an SDNode that returns the value of the global base register. |
544 | /// Output instructions required to initialize the global base register, |
545 | /// if necessary. |
546 | SDNode *getGlobalBaseReg(); |
547 | |
548 | /// Return a reference to the TargetMachine, casted to the target-specific |
549 | /// type. |
550 | const X86TargetMachine &getTargetMachine() const { |
551 | return static_cast<const X86TargetMachine &>(TM); |
552 | } |
553 | |
554 | /// Return a reference to the TargetInstrInfo, casted to the target-specific |
555 | /// type. |
556 | const X86InstrInfo *getInstrInfo() const { |
557 | return Subtarget->getInstrInfo(); |
558 | } |
559 | |
560 | /// Return a condition code of the given SDNode |
561 | X86::CondCode getCondFromNode(SDNode *N) const; |
562 | |
563 | /// Address-mode matching performs shift-of-and to and-of-shift |
564 | /// reassociation in order to expose more scaled addressing |
565 | /// opportunities. |
566 | bool ComplexPatternFuncMutatesDAG() const override { |
567 | return true; |
568 | } |
569 | |
570 | bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; |
571 | |
572 | // Indicates we should prefer to use a non-temporal load for this load. |
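  // Non-temporal vector loads (MOVNTDQA and friends) require naturally
  // aligned 16/32/64-byte accesses and SSE4.1/AVX2/AVX512 respectively,
  // hence the alignment and store-size checks below.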
573 | bool useNonTemporalLoad(LoadSDNode *N) const { |
574 | if (!N->isNonTemporal()) |
575 | return false; |
576 | |
577 | unsigned StoreSize = N->getMemoryVT().getStoreSize(); |
578 | |
579 | if (N->getAlign().value() < StoreSize) |
580 | return false; |
581 | |
582 | switch (StoreSize) { |
583 | default: llvm_unreachable("Unsupported store size" ); |
584 | case 4: |
585 | case 8: |
586 | return false; |
587 | case 16: |
588 | return Subtarget->hasSSE41(); |
589 | case 32: |
590 | return Subtarget->hasAVX2(); |
591 | case 64: |
592 | return Subtarget->hasAVX512(); |
593 | } |
594 | } |
595 | |
596 | bool foldLoadStoreIntoMemOperand(SDNode *Node); |
597 | MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); |
598 | bool matchBitExtract(SDNode *Node); |
599 | bool shrinkAndImmediate(SDNode *N); |
600 | bool isMaskZeroExtended(SDNode *N) const; |
601 | bool tryShiftAmountMod(SDNode *N); |
602 | bool tryShrinkShlLogicImm(SDNode *N); |
603 | bool tryVPTERNLOG(SDNode *N); |
604 | bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB, |
605 | SDNode *ParentC, SDValue A, SDValue B, SDValue C, |
606 | uint8_t Imm); |
607 | bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); |
608 | bool tryMatchBitSelect(SDNode *N); |
609 | |
610 | MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
611 | const SDLoc &dl, MVT VT, SDNode *Node); |
612 | MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
613 | const SDLoc &dl, MVT VT, SDNode *Node, |
614 | SDValue &InGlue); |
615 | |
616 | bool tryOptimizeRem8Extend(SDNode *N); |
617 | |
618 | bool onlyUsesZeroFlag(SDValue Flags) const; |
619 | bool hasNoSignFlagUses(SDValue Flags) const; |
620 | bool hasNoCarryFlagUses(SDValue Flags) const; |
621 | }; |
622 | |
623 | class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy { |
624 | public: |
625 | static char ID; |
626 | explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm, |
627 | CodeGenOptLevel OptLevel) |
628 | : SelectionDAGISelLegacy( |
629 | ID, std::make_unique<X86DAGToDAGISel>(args&: tm, args&: OptLevel)) {} |
630 | }; |
631 | } |
632 | |
633 | char X86DAGToDAGISelLegacy::ID = 0; |
634 | |
635 | INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) |
636 | |
637 | // Returns true if this masked compare can be implemented legally with this |
638 | // type. |
639 | static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { |
640 | unsigned Opcode = N->getOpcode(); |
641 | if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || |
642 | Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || |
643 | Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { |
644 | // We can get 256-bit 8 element types here without VLX being enabled. When |
645 | // this happens we will use 512-bit operations and the mask will not be |
646 | // zero extended. |
647 | EVT OpVT = N->getOperand(Num: 0).getValueType(); |
648 | // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the |
649 | // second operand. |
650 | if (Opcode == X86ISD::STRICT_CMPM) |
651 | OpVT = N->getOperand(Num: 1).getValueType(); |
652 | if (OpVT.is256BitVector() || OpVT.is128BitVector()) |
653 | return Subtarget->hasVLX(); |
654 | |
655 | return true; |
656 | } |
657 | // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. |
658 | if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || |
659 | Opcode == X86ISD::FSETCCM_SAE) |
660 | return true; |
661 | |
662 | return false; |
663 | } |
664 | |
665 | // Returns true if we can assume the writer of the mask has zero extended it |
666 | // for us. |
667 | bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { |
668 | // If this is an AND, check if we have a compare on either side. As long as |
669 | // one side guarantees the mask is zero extended, the AND will preserve those |
670 | // zeros. |
671 | if (N->getOpcode() == ISD::AND) |
672 | return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) || |
673 | isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget); |
674 | |
675 | return isLegalMaskCompare(N, Subtarget); |
676 | } |
677 | |
678 | bool |
679 | X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { |
680 | if (OptLevel == CodeGenOptLevel::None) |
681 | return false; |
682 | |
683 | if (!N.hasOneUse()) |
684 | return false; |
685 | |
686 | if (N.getOpcode() != ISD::LOAD) |
687 | return true; |
688 | |
689 | // Don't fold non-temporal loads if we have an instruction for them. |
690 | if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N))) |
691 | return false; |
692 | |
693 | // If N is a load, do additional profitability checks. |
694 | if (U == Root) { |
695 | switch (U->getOpcode()) { |
696 | default: break; |
697 | case X86ISD::ADD: |
698 | case X86ISD::ADC: |
699 | case X86ISD::SUB: |
700 | case X86ISD::SBB: |
701 | case X86ISD::AND: |
702 | case X86ISD::XOR: |
703 | case X86ISD::OR: |
704 | case ISD::ADD: |
705 | case ISD::UADDO_CARRY: |
706 | case ISD::AND: |
707 | case ISD::OR: |
708 | case ISD::XOR: { |
709 | SDValue Op1 = U->getOperand(Num: 1); |
710 | |
      // If the other operand is an 8-bit immediate we should fold the immediate
712 | // instead. This reduces code size. |
713 | // e.g. |
714 | // movl 4(%esp), %eax |
715 | // addl $4, %eax |
716 | // vs. |
717 | // movl $4, %eax |
718 | // addl 4(%esp), %eax |
      // The former is 2 bytes shorter. In the case where the increment is 1,
720 | // the saving can be 4 bytes (by using incl %eax). |
721 | if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) { |
722 | if (Imm->getAPIntValue().isSignedIntN(N: 8)) |
723 | return false; |
724 | |
725 | // If this is a 64-bit AND with an immediate that fits in 32-bits, |
726 | // prefer using the smaller and over folding the load. This is needed to |
727 | // make sure immediates created by shrinkAndImmediate are always folded. |
728 | // Ideally we would narrow the load during DAG combine and get the |
729 | // best of both worlds. |
730 | if (U->getOpcode() == ISD::AND && |
731 | Imm->getAPIntValue().getBitWidth() == 64 && |
732 | Imm->getAPIntValue().isIntN(N: 32)) |
733 | return false; |
734 | |
        // If this is really a zext_inreg that can be represented with a movzx
736 | // instruction, prefer that. |
737 | // TODO: We could shrink the load and fold if it is non-volatile. |
738 | if (U->getOpcode() == ISD::AND && |
739 | (Imm->getAPIntValue() == UINT8_MAX || |
740 | Imm->getAPIntValue() == UINT16_MAX || |
741 | Imm->getAPIntValue() == UINT32_MAX)) |
742 | return false; |
743 | |
        // For ADD/SUB we can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
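        // For example, "addl $128, %eax" (imm32) can instead be encoded as
        // "subl $-128, %eax" (imm8).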
746 | if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && |
747 | (-Imm->getAPIntValue()).isSignedIntN(N: 8)) |
748 | return false; |
749 | |
750 | if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && |
751 | (-Imm->getAPIntValue()).isSignedIntN(N: 8) && |
752 | hasNoCarryFlagUses(Flags: SDValue(U, 1))) |
753 | return false; |
754 | } |
755 | |
756 | // If the other operand is a TLS address, we should fold it instead. |
757 | // This produces |
758 | // movl %gs:0, %eax |
759 | // leal i@NTPOFF(%eax), %eax |
760 | // instead of |
761 | // movl $i@NTPOFF, %eax |
762 | // addl %gs:0, %eax |
763 | // if the block also has an access to a second TLS address this will save |
764 | // a load. |
765 | // FIXME: This is probably also true for non-TLS addresses. |
766 | if (Op1.getOpcode() == X86ISD::Wrapper) { |
767 | SDValue Val = Op1.getOperand(i: 0); |
768 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
769 | return false; |
770 | } |
771 | |
772 | // Don't fold load if this matches the BTS/BTR/BTC patterns. |
773 | // BTS: (or X, (shl 1, n)) |
774 | // BTR: (and X, (rotl -2, n)) |
775 | // BTC: (xor X, (shl 1, n)) |
776 | if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { |
777 | if (U->getOperand(Num: 0).getOpcode() == ISD::SHL && |
778 | isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0))) |
779 | return false; |
780 | |
781 | if (U->getOperand(Num: 1).getOpcode() == ISD::SHL && |
782 | isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0))) |
783 | return false; |
784 | } |
785 | if (U->getOpcode() == ISD::AND) { |
786 | SDValue U0 = U->getOperand(Num: 0); |
787 | SDValue U1 = U->getOperand(Num: 1); |
788 | if (U0.getOpcode() == ISD::ROTL) { |
789 | auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0)); |
790 | if (C && C->getSExtValue() == -2) |
791 | return false; |
792 | } |
793 | |
794 | if (U1.getOpcode() == ISD::ROTL) { |
795 | auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0)); |
796 | if (C && C->getSExtValue() == -2) |
797 | return false; |
798 | } |
799 | } |
800 | |
801 | break; |
802 | } |
803 | case ISD::SHL: |
804 | case ISD::SRA: |
805 | case ISD::SRL: |
806 | // Don't fold a load into a shift by immediate. The BMI2 instructions |
807 | // support folding a load, but not an immediate. The legacy instructions |
808 | // support folding an immediate, but can't fold a load. Folding an |
809 | // immediate is preferable to folding a load. |
810 | if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1))) |
811 | return false; |
812 | |
813 | break; |
814 | } |
815 | } |
816 | |
  // Prevent folding a load if this can be implemented with an insert_subreg or
818 | // a move that implicitly zeroes. |
819 | if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && |
820 | isNullConstant(V: Root->getOperand(Num: 2)) && |
821 | (Root->getOperand(Num: 0).isUndef() || |
822 | ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode()))) |
823 | return false; |
824 | |
825 | return true; |
826 | } |
827 | |
828 | // Indicates it is profitable to form an AVX512 masked operation. Returning |
// false will favor a masked register-register move or vblendm and the
830 | // operation will be selected separately. |
831 | bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { |
832 | assert( |
833 | (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && |
834 | "Unexpected opcode!" ); |
835 | |
836 | // If the operation has additional users, the operation will be duplicated. |
837 | // Check the use count to prevent that. |
838 | // FIXME: Are there cheap opcodes we might want to duplicate? |
839 | return N->getOperand(Num: 1).hasOneUse(); |
840 | } |
841 | |
842 | /// Replace the original chain operand of the call with |
843 | /// load's chain operand and move load below the call's chain operand. |
844 | static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, |
845 | SDValue Call, SDValue OrigChain) { |
846 | SmallVector<SDValue, 8> Ops; |
847 | SDValue Chain = OrigChain.getOperand(i: 0); |
848 | if (Chain.getNode() == Load.getNode()) |
849 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
850 | else { |
851 | assert(Chain.getOpcode() == ISD::TokenFactor && |
852 | "Unexpected chain operand" ); |
853 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) |
854 | if (Chain.getOperand(i).getNode() == Load.getNode()) |
855 | Ops.push_back(Elt: Load.getOperand(i: 0)); |
856 | else |
857 | Ops.push_back(Elt: Chain.getOperand(i)); |
858 | SDValue NewChain = |
859 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Load), VT: MVT::Other, Ops); |
860 | Ops.clear(); |
861 | Ops.push_back(Elt: NewChain); |
862 | } |
863 | Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end()); |
864 | CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops); |
865 | CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0), |
866 | Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2)); |
867 | |
868 | Ops.clear(); |
869 | Ops.push_back(Elt: SDValue(Load.getNode(), 1)); |
870 | Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end()); |
871 | CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops); |
872 | } |
873 | |
874 | /// Return true if call address is a load and it can be |
875 | /// moved below CALLSEQ_START and the chains leading up to the call. |
876 | /// Return the CALLSEQ_START by reference as a second output. |
877 | /// In the case of a tail call, there isn't a callseq node between the call |
878 | /// chain and the load. |
879 | static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { |
880 | // The transformation is somewhat dangerous if the call's chain was glued to |
881 | // the call. After MoveBelowOrigChain the load is moved between the call and |
882 | // the chain, this can create a cycle if the load is not folded. So it is |
883 | // *really* important that we are sure the load will be folded. |
884 | if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) |
885 | return false; |
886 | auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode()); |
887 | if (!LD || |
888 | !LD->isSimple() || |
889 | LD->getAddressingMode() != ISD::UNINDEXED || |
890 | LD->getExtensionType() != ISD::NON_EXTLOAD) |
891 | return false; |
892 | |
893 | // Now let's find the callseq_start. |
894 | while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { |
895 | if (!Chain.hasOneUse()) |
896 | return false; |
897 | Chain = Chain.getOperand(i: 0); |
898 | } |
899 | |
900 | if (!Chain.getNumOperands()) |
901 | return false; |
902 | // Since we are not checking for AA here, conservatively abort if the chain |
903 | // writes to memory. It's not safe to move the callee (a load) across a store. |
904 | if (isa<MemSDNode>(Val: Chain.getNode()) && |
905 | cast<MemSDNode>(Val: Chain.getNode())->writeMem()) |
906 | return false; |
907 | if (Chain.getOperand(i: 0).getNode() == Callee.getNode()) |
908 | return true; |
909 | if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor && |
910 | Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) && |
911 | Callee.getValue(R: 1).hasOneUse()) |
912 | return true; |
913 | return false; |
914 | } |
915 | |
916 | static bool isEndbrImm64(uint64_t Imm) { |
917 | // There may be some other prefix bytes between 0xF3 and 0x0F1EFA. |
  // e.g.: 0xF3660F1EFA, 0xF3670F1EFA
919 | if ((Imm & 0x00FFFFFF) != 0x0F1EFA) |
920 | return false; |
921 | |
922 | uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, |
923 | 0x65, 0x66, 0x67, 0xf0, 0xf2}; |
  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
925 | while (i < 64) { |
926 | uint8_t Byte = (Imm >> i) & 0xFF; |
927 | if (Byte == 0xF3) |
928 | return true; |
929 | if (!llvm::is_contained(Range&: OptionalPrefixBytes, Element: Byte)) |
930 | return false; |
931 | i += 8; |
932 | } |
933 | |
934 | return false; |
935 | } |
936 | |
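// Returns true for 512-bit vector types with byte/word elements, whose
// broadcasts and other element-wise operations require AVX512BW.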
937 | static bool needBWI(MVT VT) { |
938 | return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8); |
939 | } |
940 | |
941 | void X86DAGToDAGISel::PreprocessISelDAG() { |
942 | bool MadeChange = false; |
943 | for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), |
944 | E = CurDAG->allnodes_end(); I != E; ) { |
945 | SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. |
946 | |
947 | // This is for CET enhancement. |
948 | // |
949 | // ENDBR32 and ENDBR64 have specific opcodes: |
950 | // ENDBR32: F3 0F 1E FB |
951 | // ENDBR64: F3 0F 1E FA |
    // We want to ensure that attackers cannot find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
955 | // If the compiler had to generate asm for the following code: |
956 | // a = 0xF30F1EFA |
957 | // it could, for example, generate: |
958 | // mov 0xF30F1EFA, dword ptr[a] |
959 | // In such a case, the binary would include a gadget that starts |
    // with a fake ENDBR64 opcode. Therefore, we split such a constant
    // into multiple operations so that it does not show up in the binary.
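    // Concretely: materialize ~Imm as an opaque constant and re-invert it
    // with a NOT, so the ENDBR byte pattern never appears as an immediate.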
962 | if (N->getOpcode() == ISD::Constant) { |
963 | MVT VT = N->getSimpleValueType(ResNo: 0); |
964 | int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue(); |
965 | int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; |
966 | if (Imm == EndbrImm || isEndbrImm64(Imm)) { |
967 | // Check that the cf-protection-branch is enabled. |
968 | Metadata *CFProtectionBranch = |
969 | MF->getFunction().getParent()->getModuleFlag( |
970 | Key: "cf-protection-branch" ); |
971 | if (CFProtectionBranch || IndirectBranchTracking) { |
972 | SDLoc dl(N); |
973 | SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true); |
974 | Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT); |
975 | --I; |
976 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement); |
977 | ++I; |
978 | MadeChange = true; |
979 | continue; |
980 | } |
981 | } |
982 | } |
983 | |
984 | // If this is a target specific AND node with no flag usages, turn it back |
985 | // into ISD::AND to enable test instruction matching. |
986 | if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) { |
987 | SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
988 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
989 | --I; |
990 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
991 | ++I; |
992 | MadeChange = true; |
993 | continue; |
994 | } |
995 | |
996 | // Convert vector increment or decrement to sub/add with an all-ones |
997 | // constant: |
998 | // add X, <1, 1...> --> sub X, <-1, -1...> |
999 | // sub X, <1, 1...> --> add X, <-1, -1...> |
1000 | // The all-ones vector constant can be materialized using a pcmpeq |
1001 | // instruction that is commonly recognized as an idiom (has no register |
1002 | // dependency), so that's better/smaller than loading a splat 1 constant. |
1003 | // |
1004 | // But don't do this if it would inhibit a potentially profitable load |
1005 | // folding opportunity for the other operand. That only occurs with the |
1006 | // intersection of: |
1007 | // (1) The other operand (op0) is load foldable. |
1008 | // (2) The op is an add (otherwise, we are *creating* an add and can still |
1009 | // load fold the other op). |
1010 | // (3) The target has AVX (otherwise, we have a destructive add and can't |
1011 | // load fold the other op without killing the constant op). |
1012 | // (4) The constant 1 vector has multiple uses (so it is profitable to load |
1013 | // into a register anyway). |
1014 | auto mayPreventLoadFold = [&]() { |
1015 | return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) && |
1016 | N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && |
1017 | !N->getOperand(Num: 1).hasOneUse(); |
1018 | }; |
1019 | if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && |
1020 | N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) { |
1021 | APInt SplatVal; |
1022 | if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) && |
1023 | SplatVal.isOne()) { |
1024 | SDLoc DL(N); |
1025 | |
1026 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1027 | unsigned NumElts = VT.getSizeInBits() / 32; |
1028 | SDValue AllOnes = |
1029 | CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts)); |
1030 | AllOnes = CurDAG->getBitcast(VT, V: AllOnes); |
1031 | |
1032 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
1033 | SDValue Res = |
1034 | CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes); |
1035 | --I; |
1036 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1037 | ++I; |
1038 | MadeChange = true; |
1039 | continue; |
1040 | } |
1041 | } |
1042 | |
1043 | switch (N->getOpcode()) { |
1044 | case X86ISD::VBROADCAST: { |
1045 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1046 | // Emulate v32i16/v64i8 broadcast without BWI. |
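      // Broadcast at half width, then insert the narrow result into both
      // the low and high halves of the wide vector.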
1047 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1048 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1049 | SDLoc dl(N); |
1050 | SDValue NarrowBCast = |
1051 | CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0)); |
1052 | SDValue Res = |
1053 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1054 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1055 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1056 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1057 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1058 | |
1059 | --I; |
1060 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1061 | ++I; |
1062 | MadeChange = true; |
1063 | continue; |
1064 | } |
1065 | |
1066 | break; |
1067 | } |
1068 | case X86ISD::VBROADCAST_LOAD: { |
1069 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1070 | // Emulate v32i16/v64i8 broadcast without BWI. |
1071 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1072 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1073 | auto *MemNode = cast<MemSDNode>(Val: N); |
1074 | SDLoc dl(N); |
1075 | SDVTList VTs = CurDAG->getVTList(VT1: NarrowVT, VT2: MVT::Other); |
1076 | SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; |
1077 | SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( |
1078 | Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(), |
1079 | MMO: MemNode->getMemOperand()); |
1080 | SDValue Res = |
1081 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1082 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1083 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1084 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1085 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1086 | |
1087 | --I; |
1088 | SDValue To[] = {Res, NarrowBCast.getValue(R: 1)}; |
1089 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1090 | ++I; |
1091 | MadeChange = true; |
1092 | continue; |
1093 | } |
1094 | |
1095 | break; |
1096 | } |
1097 | case ISD::LOAD: { |
      // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1099 | // load, then just extract the lower subvector and avoid the second load. |
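      // For example, a 128-bit load from a pointer can reuse the low lanes of
      // an existing 512-bit load of the same pointer via EXTRACT_SUBVECTOR.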
1100 | auto *Ld = cast<LoadSDNode>(Val: N); |
1101 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1102 | if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() || |
1103 | !(VT.is128BitVector() || VT.is256BitVector())) |
1104 | break; |
1105 | |
1106 | MVT MaxVT = VT; |
1107 | SDNode *MaxLd = nullptr; |
1108 | SDValue Ptr = Ld->getBasePtr(); |
1109 | SDValue Chain = Ld->getChain(); |
1110 | for (SDNode *User : Ptr->users()) { |
1111 | auto *UserLd = dyn_cast<LoadSDNode>(Val: User); |
1112 | MVT UserVT = User->getSimpleValueType(ResNo: 0); |
1113 | if (User != N && UserLd && ISD::isNormalLoad(N: User) && |
1114 | UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain && |
1115 | !User->hasAnyUseOfValue(Value: 1) && |
1116 | (UserVT.is256BitVector() || UserVT.is512BitVector()) && |
1117 | UserVT.getSizeInBits() > VT.getSizeInBits() && |
1118 | (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) { |
1119 | MaxLd = User; |
1120 | MaxVT = UserVT; |
1121 | } |
1122 | } |
1123 | if (MaxLd) { |
1124 | SDLoc dl(N); |
1125 | unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits(); |
1126 | MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts); |
        SDValue Extract = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT,
1128 | N1: SDValue(MaxLd, 0), |
1129 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1130 | SDValue Res = CurDAG->getBitcast(VT, V: Extract); |
1131 | |
1132 | --I; |
1133 | SDValue To[] = {Res, SDValue(MaxLd, 1)}; |
1134 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1135 | ++I; |
1136 | MadeChange = true; |
1137 | continue; |
1138 | } |
1139 | break; |
1140 | } |
1141 | case ISD::VSELECT: { |
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1143 | EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType(); |
1144 | if (EleVT == MVT::i1) |
1145 | break; |
1146 | |
1147 | assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!" ); |
1148 | assert(N->getValueType(0).getVectorElementType() != MVT::i16 && |
1149 | "We can't replace VSELECT with BLENDV in vXi16!" ); |
1150 | SDValue R; |
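      // If every condition element is known to be all-ones or all-zeros, a
      // single VPTERNLOG with immediate 0xCA (truth table "A ? B : C")
      // implements the select directly.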
1151 | if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) == |
1152 | EleVT.getSizeInBits()) { |
1153 | R = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1154 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2), |
1155 | N4: CurDAG->getTargetConstant(Val: 0xCA, DL: SDLoc(N), VT: MVT::i8)); |
1156 | } else { |
1157 | R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1158 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), |
1159 | N3: N->getOperand(Num: 2)); |
1160 | } |
1161 | --I; |
1162 | CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode()); |
1163 | ++I; |
1164 | MadeChange = true; |
1165 | continue; |
1166 | } |
1167 | case ISD::FP_ROUND: |
1168 | case ISD::STRICT_FP_ROUND: |
1169 | case ISD::FP_TO_SINT: |
1170 | case ISD::FP_TO_UINT: |
1171 | case ISD::STRICT_FP_TO_SINT: |
1172 | case ISD::STRICT_FP_TO_UINT: { |
1173 | // Replace vector fp_to_s/uint with their X86 specific equivalent so we |
1174 | // don't need 2 sets of patterns. |
1175 | if (!N->getSimpleValueType(ResNo: 0).isVector()) |
1176 | break; |
1177 | |
1178 | unsigned NewOpc; |
1179 | switch (N->getOpcode()) { |
1180 | default: llvm_unreachable("Unexpected opcode!" ); |
1181 | case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; |
1182 | case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; |
1183 | case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; |
1184 | case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; |
1185 | case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; |
1186 | case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; |
1187 | } |
1188 | SDValue Res; |
1189 | if (N->isStrictFPOpcode()) |
1190 | Res = |
1191 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1192 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)}); |
1193 | else |
1194 | Res = |
1195 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1196 | Operand: N->getOperand(Num: 0)); |
1197 | --I; |
1198 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1199 | ++I; |
1200 | MadeChange = true; |
1201 | continue; |
1202 | } |
1203 | case ISD::SHL: |
1204 | case ISD::SRA: |
1205 | case ISD::SRL: { |
1206 | // Replace vector shifts with their X86 specific equivalent so we don't |
1207 | // need 2 sets of patterns. |
1208 | if (!N->getValueType(ResNo: 0).isVector()) |
1209 | break; |
1210 | |
1211 | unsigned NewOpc; |
1212 | switch (N->getOpcode()) { |
1213 | default: llvm_unreachable("Unexpected opcode!" ); |
1214 | case ISD::SHL: NewOpc = X86ISD::VSHLV; break; |
1215 | case ISD::SRA: NewOpc = X86ISD::VSRAV; break; |
1216 | case ISD::SRL: NewOpc = X86ISD::VSRLV; break; |
1217 | } |
1218 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1219 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
1220 | --I; |
1221 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1222 | ++I; |
1223 | MadeChange = true; |
1224 | continue; |
1225 | } |
1226 | case ISD::ANY_EXTEND: |
1227 | case ISD::ANY_EXTEND_VECTOR_INREG: { |
1228 | // Replace vector any extend with the zero extend equivalents so we don't |
1229 | // need 2 sets of patterns. Ignore vXi1 extensions. |
1230 | if (!N->getValueType(ResNo: 0).isVector()) |
1231 | break; |
1232 | |
1233 | unsigned NewOpc; |
1234 | if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) { |
1235 | assert(N->getOpcode() == ISD::ANY_EXTEND && |
1236 | "Unexpected opcode for mask vector!" ); |
1237 | NewOpc = ISD::SIGN_EXTEND; |
1238 | } else { |
1239 | NewOpc = N->getOpcode() == ISD::ANY_EXTEND |
1240 | ? ISD::ZERO_EXTEND |
1241 | : ISD::ZERO_EXTEND_VECTOR_INREG; |
1242 | } |
1243 | |
1244 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1245 | Operand: N->getOperand(Num: 0)); |
1246 | --I; |
1247 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1248 | ++I; |
1249 | MadeChange = true; |
1250 | continue; |
1251 | } |
1252 | case ISD::FCEIL: |
1253 | case ISD::STRICT_FCEIL: |
1254 | case ISD::FFLOOR: |
1255 | case ISD::STRICT_FFLOOR: |
1256 | case ISD::FTRUNC: |
1257 | case ISD::STRICT_FTRUNC: |
1258 | case ISD::FROUNDEVEN: |
1259 | case ISD::STRICT_FROUNDEVEN: |
1260 | case ISD::FNEARBYINT: |
1261 | case ISD::STRICT_FNEARBYINT: |
1262 | case ISD::FRINT: |
1263 | case ISD::STRICT_FRINT: { |
1264 | // Replace fp rounding with their X86 specific equivalent so we don't |
1265 | // need 2 sets of patterns. |
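      // The immediate is the ROUNDPS/VRNDSCALE control byte: bits[1:0] pick
      // the rounding mode (0 = nearest-even, 1 = down, 2 = up, 3 = truncate),
      // bit 2 selects the MXCSR rounding mode instead, and bit 3 suppresses
      // precision (inexact) exceptions.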
1266 | unsigned Imm; |
1267 | switch (N->getOpcode()) { |
1268 | default: llvm_unreachable("Unexpected opcode!" ); |
1269 | case ISD::STRICT_FCEIL: |
1270 | case ISD::FCEIL: Imm = 0xA; break; |
1271 | case ISD::STRICT_FFLOOR: |
1272 | case ISD::FFLOOR: Imm = 0x9; break; |
1273 | case ISD::STRICT_FTRUNC: |
1274 | case ISD::FTRUNC: Imm = 0xB; break; |
1275 | case ISD::STRICT_FROUNDEVEN: |
1276 | case ISD::FROUNDEVEN: Imm = 0x8; break; |
1277 | case ISD::STRICT_FNEARBYINT: |
1278 | case ISD::FNEARBYINT: Imm = 0xC; break; |
1279 | case ISD::STRICT_FRINT: |
1280 | case ISD::FRINT: Imm = 0x4; break; |
1281 | } |
1282 | SDLoc dl(N); |
1283 | bool IsStrict = N->isStrictFPOpcode(); |
1284 | SDValue Res; |
1285 | if (IsStrict) |
1286 | Res = CurDAG->getNode(Opcode: X86ISD::STRICT_VRNDSCALE, DL: dl, |
1287 | ResultTys: {N->getValueType(ResNo: 0), MVT::Other}, |
1288 | Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), |
1289 | CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)}); |
1290 | else |
1291 | Res = CurDAG->getNode(Opcode: X86ISD::VRNDSCALE, DL: dl, VT: N->getValueType(ResNo: 0), |
1292 | N1: N->getOperand(Num: 0), |
1293 | N2: CurDAG->getTargetConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
1294 | --I; |
1295 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1296 | ++I; |
1297 | MadeChange = true; |
1298 | continue; |
1299 | } |
1300 | case X86ISD::FANDN: |
1301 | case X86ISD::FAND: |
1302 | case X86ISD::FOR: |
1303 | case X86ISD::FXOR: { |
1304 | // Widen scalar fp logic ops to vector to reduce isel patterns. |
1305 | // FIXME: Can we do this during lowering/combine. |
1306 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1307 | if (VT.isVector() || VT == MVT::f128) |
1308 | break; |
1309 | |
1310 | MVT VecVT = VT == MVT::f64 ? MVT::v2f64 |
1311 | : VT == MVT::f32 ? MVT::v4f32 |
1312 | : MVT::v8f16; |
1313 | |
1314 | SDLoc dl(N); |
1315 | SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1316 | Operand: N->getOperand(Num: 0)); |
1317 | SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1318 | Operand: N->getOperand(Num: 1)); |
1319 | |
1320 | SDValue Res; |
1321 | if (Subtarget->hasSSE2()) { |
1322 | EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); |
1323 | Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0); |
1324 | Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1); |
1325 | unsigned Opc; |
1326 | switch (N->getOpcode()) { |
1327 | default: llvm_unreachable("Unexpected opcode!" ); |
1328 | case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; |
1329 | case X86ISD::FAND: Opc = ISD::AND; break; |
1330 | case X86ISD::FOR: Opc = ISD::OR; break; |
1331 | case X86ISD::FXOR: Opc = ISD::XOR; break; |
1332 | } |
1333 | Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1); |
1334 | Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res); |
1335 | } else { |
1336 | Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1); |
1337 | } |
1338 | Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res, |
1339 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1340 | --I; |
1341 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1342 | ++I; |
1343 | MadeChange = true; |
1344 | continue; |
1345 | } |
1346 | } |
1347 | |
1348 | if (OptLevel != CodeGenOptLevel::None && |
1349 | // Only do this when the target can fold the load into the call or |
1350 | // jmp. |
1351 | !Subtarget->useIndirectThunkCalls() && |
1352 | ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || |
1353 | (N->getOpcode() == X86ISD::TC_RETURN && |
1354 | (Subtarget->is64Bit() || |
1355 | !getTargetMachine().isPositionIndependent())))) { |
1356 | /// Also try moving call address load from outside callseq_start to just |
1357 | /// before the call to allow it to be folded. |
1358 | /// |
1359 | /// [Load chain] |
1360 | /// ^ |
1361 | /// | |
1362 | /// [Load] |
1363 | /// ^ ^ |
1364 | /// | | |
1365 | /// / \-- |
1366 | /// / | |
1367 | ///[CALLSEQ_START] | |
1368 | /// ^ | |
1369 | /// | | |
1370 | /// [LOAD/C2Reg] | |
1371 | /// | | |
1372 | /// \ / |
1373 | /// \ / |
1374 | /// [CALL] |
1375 | bool HasCallSeq = N->getOpcode() == X86ISD::CALL; |
1376 | SDValue Chain = N->getOperand(Num: 0); |
1377 | SDValue Load = N->getOperand(Num: 1); |
1378 | if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq)) |
1379 | continue; |
1380 | moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain); |
1381 | ++NumLoadMoved; |
1382 | MadeChange = true; |
1383 | continue; |
1384 | } |
1385 | |
    // Lower fpround and fpextend nodes that target the FP stack to a store and
    // load through the stack. This is a gross hack. We would like to simply mark
1388 | // these as being illegal, but when we do that, legalize produces these when |
1389 | // it expands calls, then expands these in the same legalize pass. We would |
1390 | // like dag combine to be able to hack on these between the call expansion |
1391 | // and the node legalization. As such this pass basically does "really |
1392 | // late" legalization of these inline with the X86 isel pass. |
1393 | // FIXME: This should only happen when not compiled with -O0. |
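// Illustrative example (sketch, not from the original source): on a target
// where f32 lives in SSE registers but f64 is still kept on the x87 stack, an
// f32 -> f64 FP_EXTEND is rewritten below as an f32 store to a stack
// temporary followed by an f32 -> f64 extending load from that slot.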
1394 | switch (N->getOpcode()) { |
1395 | default: continue; |
1396 | case ISD::FP_ROUND: |
1397 | case ISD::FP_EXTEND: |
1398 | { |
1399 | MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType(); |
1400 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1401 | |
1402 | // If any of the sources are vectors, no fp stack involved. |
1403 | if (SrcVT.isVector() || DstVT.isVector()) |
1404 | continue; |
1405 | |
1406 | // If the source and destination are SSE registers, then this is a legal |
1407 | // conversion that should not be lowered. |
1408 | const X86TargetLowering *X86Lowering = |
1409 | static_cast<const X86TargetLowering *>(TLI); |
1410 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1411 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1412 | if (SrcIsSSE && DstIsSSE) |
1413 | continue; |
1414 | |
1415 | if (!SrcIsSSE && !DstIsSSE) { |
1416 | // If this is an FPStack extension, it is a noop. |
1417 | if (N->getOpcode() == ISD::FP_EXTEND) |
1418 | continue; |
1419 | // If this is a value-preserving FPStack truncation, it is a noop. |
1420 | if (N->getConstantOperandVal(Num: 1)) |
1421 | continue; |
1422 | } |
1423 | |
1424 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1425 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1426 | // operations. Based on this, decide what we want to do. |
1427 | MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; |
1428 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1429 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1430 | MachinePointerInfo MPI = |
1431 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1432 | SDLoc dl(N); |
1433 | |
1434 | // FIXME: optimize the case where the src/dest is a load or store? |
1435 | |
1436 | SDValue Store = CurDAG->getTruncStore( |
1437 | Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT); |
1438 | SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store, |
1439 | Ptr: MemTmp, PtrInfo: MPI, MemVT); |
1440 | |
1441 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
// extload we created. This will cause general havoc on the dag because
1443 | // anything below the conversion could be folded into other existing nodes. |
1444 | // To avoid invalidating 'I', back it up to the convert node. |
1445 | --I; |
1446 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result); |
1447 | break; |
1448 | } |
1449 | |
// The sequence of events for lowering STRICT_FP versions of these nodes
// requires dealing with the chain differently, as there is already a
// preexisting chain.
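// Illustrative flow (sketch, not from the original source): for a
// STRICT_FP_ROUND from an x87 value to an SSE f32, the incoming chain feeds
// an X86ISD::FST of the source into a stack slot, and the result is a plain
// f32 load of that slot chained on the store, preserving the strict-FP
// ordering.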
1452 | case ISD::STRICT_FP_ROUND: |
1453 | case ISD::STRICT_FP_EXTEND: |
1454 | { |
1455 | MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType(); |
1456 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1457 | |
1458 | // If any of the sources are vectors, no fp stack involved. |
1459 | if (SrcVT.isVector() || DstVT.isVector()) |
1460 | continue; |
1461 | |
1462 | // If the source and destination are SSE registers, then this is a legal |
1463 | // conversion that should not be lowered. |
1464 | const X86TargetLowering *X86Lowering = |
1465 | static_cast<const X86TargetLowering *>(TLI); |
1466 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1467 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1468 | if (SrcIsSSE && DstIsSSE) |
1469 | continue; |
1470 | |
1471 | if (!SrcIsSSE && !DstIsSSE) { |
1472 | // If this is an FPStack extension, it is a noop. |
1473 | if (N->getOpcode() == ISD::STRICT_FP_EXTEND) |
1474 | continue; |
1475 | // If this is a value-preserving FPStack truncation, it is a noop. |
1476 | if (N->getConstantOperandVal(Num: 2)) |
1477 | continue; |
1478 | } |
1479 | |
1480 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1481 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1482 | // operations. Based on this, decide what we want to do. |
1483 | MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; |
1484 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1485 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1486 | MachinePointerInfo MPI = |
1487 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1488 | SDLoc dl(N); |
1489 | |
1490 | // FIXME: optimize the case where the src/dest is a load or store? |
1491 | |
// Since the operation is StrictFP, use the preexisting chain.
1493 | SDValue Store, Result; |
1494 | if (!SrcIsSSE) { |
1495 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Other); |
1496 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp}; |
1497 | Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT, |
1498 | PtrInfo: MPI, /*Align*/ Alignment: std::nullopt, |
1499 | Flags: MachineMemOperand::MOStore); |
1500 | if (N->getFlags().hasNoFPExcept()) { |
1501 | SDNodeFlags Flags = Store->getFlags(); |
1502 | Flags.setNoFPExcept(true); |
1503 | Store->setFlags(Flags); |
1504 | } |
1505 | } else { |
1506 | assert(SrcVT == MemVT && "Unexpected VT!" ); |
1507 | Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp, |
1508 | PtrInfo: MPI); |
1509 | } |
1510 | |
1511 | if (!DstIsSSE) { |
1512 | SDVTList VTs = CurDAG->getVTList(VT1: DstVT, VT2: MVT::Other); |
1513 | SDValue Ops[] = {Store, MemTmp}; |
1514 | Result = CurDAG->getMemIntrinsicNode( |
1515 | Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI, |
1516 | /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad); |
1517 | if (N->getFlags().hasNoFPExcept()) { |
1518 | SDNodeFlags Flags = Result->getFlags(); |
1519 | Flags.setNoFPExcept(true); |
1520 | Result->setFlags(Flags); |
1521 | } |
1522 | } else { |
1523 | assert(DstVT == MemVT && "Unexpected VT!" ); |
1524 | Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI); |
1525 | } |
1526 | |
// We're about to replace all uses of the STRICT_FP_ROUND/STRICT_FP_EXTEND
// with the load we created. This will cause general havoc on the dag because
1529 | // anything below the conversion could be folded into other existing nodes. |
1530 | // To avoid invalidating 'I', back it up to the convert node. |
1531 | --I; |
1532 | CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode()); |
1533 | break; |
1534 | } |
1535 | } |
1536 | |
1537 | |
1538 | // Now that we did that, the node is dead. Increment the iterator to the |
1539 | // next node to process, then delete N. |
1540 | ++I; |
1541 | MadeChange = true; |
1542 | } |
1543 | |
1544 | // Remove any dead nodes that may have been left behind. |
1545 | if (MadeChange) |
1546 | CurDAG->RemoveDeadNodes(); |
1547 | } |
1548 | |
1549 | // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. |
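// Illustrative pattern (sketch, not from the original source):
//   t1 = MOVZX32rr8_NOREX ...        ; extend the 8-bit remainder/quotient
//   t2 = EXTRACT_SUBREG t1, sub_8bit
//   t3 = MOVZX32rr8 t2               ; redundant second extension
// t3 can reuse t1 directly (or a MOVSX64rr32 of t1 for the 64-bit case).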
1550 | bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { |
1551 | unsigned Opc = N->getMachineOpcode(); |
1552 | if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && |
1553 | Opc != X86::MOVSX64rr8) |
1554 | return false; |
1555 | |
1556 | SDValue N0 = N->getOperand(Num: 0); |
1557 | |
// We need to be extracting the low byte of an extend.
1559 | if (!N0.isMachineOpcode() || |
1560 | N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || |
1561 | N0.getConstantOperandVal(i: 1) != X86::sub_8bit) |
1562 | return false; |
1563 | |
1564 | // We're looking for either a movsx or movzx to match the original opcode. |
1565 | unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX |
1566 | : X86::MOVSX32rr8_NOREX; |
1567 | SDValue N00 = N0.getOperand(i: 0); |
1568 | if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) |
1569 | return false; |
1570 | |
1571 | if (Opc == X86::MOVSX64rr8) { |
// If we had a sign extend from 8 to 64 bits, we still need to go from 32
// to 64.
1574 | MachineSDNode *Extend = CurDAG->getMachineNode(Opcode: X86::MOVSX64rr32, dl: SDLoc(N), |
1575 | VT: MVT::i64, Op1: N00); |
1576 | ReplaceUses(F: N, T: Extend); |
1577 | } else { |
1578 | // Ok we can drop this extend and just use the original extend. |
1579 | ReplaceUses(F: N, T: N00.getNode()); |
1580 | } |
1581 | |
1582 | return true; |
1583 | } |
1584 | |
1585 | void X86DAGToDAGISel::PostprocessISelDAG() { |
1586 | // Skip peepholes at -O0. |
1587 | if (TM.getOptLevel() == CodeGenOptLevel::None) |
1588 | return; |
1589 | |
1590 | SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); |
1591 | |
1592 | bool MadeChange = false; |
1593 | while (Position != CurDAG->allnodes_begin()) { |
1594 | SDNode *N = &*--Position; |
1595 | // Skip dead nodes and any non-machine opcodes. |
1596 | if (N->use_empty() || !N->isMachineOpcode()) |
1597 | continue; |
1598 | |
1599 | if (tryOptimizeRem8Extend(N)) { |
1600 | MadeChange = true; |
1601 | continue; |
1602 | } |
1603 | |
1604 | unsigned Opc = N->getMachineOpcode(); |
1605 | switch (Opc) { |
1606 | default: |
1607 | continue; |
// ANDrr/rm + TESTrr -> TESTrr/TESTmr
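// Illustrative example (sketch): when (AND8rr a, b) is used only as both
// operands of (TEST8rr x, x) and its flag result is otherwise unused, the
// pair becomes TEST8rr a, b; the rm forms fold the load and become TEST8mr.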
1609 | case X86::TEST8rr: |
1610 | case X86::TEST16rr: |
1611 | case X86::TEST32rr: |
1612 | case X86::TEST64rr: |
1613 | // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr |
1614 | case X86::CTEST8rr: |
1615 | case X86::CTEST16rr: |
1616 | case X86::CTEST32rr: |
1617 | case X86::CTEST64rr: { |
1618 | auto &Op0 = N->getOperand(Num: 0); |
1619 | if (Op0 != N->getOperand(Num: 1) || !Op0->hasNUsesOfValue(NUses: 2, Value: Op0.getResNo()) || |
1620 | !Op0.isMachineOpcode()) |
1621 | continue; |
1622 | SDValue And = N->getOperand(Num: 0); |
1623 | #define CASE_ND(OP) \ |
1624 | case X86::OP: \ |
1625 | case X86::OP##_ND: |
1626 | switch (And.getMachineOpcode()) { |
1627 | default: |
1628 | continue; |
1629 | CASE_ND(AND8rr) |
1630 | CASE_ND(AND16rr) |
1631 | CASE_ND(AND32rr) |
1632 | CASE_ND(AND64rr) { |
1633 | if (And->hasAnyUseOfValue(Value: 1)) |
1634 | continue; |
1635 | SmallVector<SDValue> Ops(N->op_values()); |
1636 | Ops[0] = And.getOperand(i: 0); |
1637 | Ops[1] = And.getOperand(i: 1); |
1638 | MachineSDNode *Test = |
1639 | CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VT: MVT::i32, Ops); |
1640 | ReplaceUses(F: N, T: Test); |
1641 | MadeChange = true; |
1642 | continue; |
1643 | } |
1644 | CASE_ND(AND8rm) |
1645 | CASE_ND(AND16rm) |
1646 | CASE_ND(AND32rm) |
1647 | CASE_ND(AND64rm) { |
1648 | if (And->hasAnyUseOfValue(Value: 1)) |
1649 | continue; |
1650 | unsigned NewOpc; |
1651 | bool IsCTESTCC = X86::isCTESTCC(Opcode: Opc); |
1652 | #define FROM_TO(A, B) \ |
1653 | CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \ |
1654 | break; |
1655 | switch (And.getMachineOpcode()) { |
1656 | FROM_TO(AND8rm, TEST8mr); |
1657 | FROM_TO(AND16rm, TEST16mr); |
1658 | FROM_TO(AND32rm, TEST32mr); |
1659 | FROM_TO(AND64rm, TEST64mr); |
1660 | } |
1661 | #undef FROM_TO |
1662 | #undef CASE_ND |
1663 | // Need to swap the memory and register operand. |
1664 | SmallVector<SDValue> Ops = {And.getOperand(i: 1), And.getOperand(i: 2), |
1665 | And.getOperand(i: 3), And.getOperand(i: 4), |
1666 | And.getOperand(i: 5), And.getOperand(i: 0)}; |
1667 | // CC, Cflags. |
1668 | if (IsCTESTCC) { |
1669 | Ops.push_back(Elt: N->getOperand(Num: 2)); |
1670 | Ops.push_back(Elt: N->getOperand(Num: 3)); |
1671 | } |
1672 | // Chain of memory load |
1673 | Ops.push_back(Elt: And.getOperand(i: 6)); |
1674 | // Glue |
1675 | if (IsCTESTCC) |
1676 | Ops.push_back(Elt: N->getOperand(Num: 4)); |
1677 | |
1678 | MachineSDNode *Test = CurDAG->getMachineNode( |
1679 | Opcode: NewOpc, dl: SDLoc(N), VT1: MVT::i32, VT2: MVT::Other, Ops); |
1680 | CurDAG->setNodeMemRefs( |
1681 | N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands()); |
1682 | ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1)); |
1683 | ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0)); |
1684 | MadeChange = true; |
1685 | continue; |
1686 | } |
1687 | } |
1688 | } |
1689 | // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is |
1690 | // used. We're doing this late so we can prefer to fold the AND into masked |
1691 | // comparisons. Doing that can be better for the live range of the mask |
1692 | // register. |
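// Illustrative example (sketch): a KANDWkk of k1 and k2 whose only use is a
// KORTESTWkk of that result with itself becomes KTESTWkk k1, k2 when only
// the zero flag is consumed.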
1693 | case X86::KORTESTBkk: |
1694 | case X86::KORTESTWkk: |
1695 | case X86::KORTESTDkk: |
1696 | case X86::KORTESTQkk: { |
1697 | SDValue Op0 = N->getOperand(Num: 0); |
1698 | if (Op0 != N->getOperand(Num: 1) || !N->isOnlyUserOf(N: Op0.getNode()) || |
1699 | !Op0.isMachineOpcode() || !onlyUsesZeroFlag(Flags: SDValue(N, 0))) |
1700 | continue; |
1701 | #define CASE(A) \ |
1702 | case X86::A: \ |
1703 | break; |
1704 | switch (Op0.getMachineOpcode()) { |
1705 | default: |
1706 | continue; |
1707 | CASE(KANDBkk) |
1708 | CASE(KANDWkk) |
1709 | CASE(KANDDkk) |
1710 | CASE(KANDQkk) |
1711 | } |
1712 | unsigned NewOpc; |
1713 | #define FROM_TO(A, B) \ |
1714 | case X86::A: \ |
1715 | NewOpc = X86::B; \ |
1716 | break; |
1717 | switch (Opc) { |
1718 | FROM_TO(KORTESTBkk, KTESTBkk) |
1719 | FROM_TO(KORTESTWkk, KTESTWkk) |
1720 | FROM_TO(KORTESTDkk, KTESTDkk) |
1721 | FROM_TO(KORTESTQkk, KTESTQkk) |
1722 | } |
1723 | // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other |
1724 | // KAND instructions and KTEST use the same ISA feature. |
1725 | if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI()) |
1726 | continue; |
1727 | #undef FROM_TO |
1728 | MachineSDNode *KTest = CurDAG->getMachineNode( |
1729 | Opcode: NewOpc, dl: SDLoc(N), VT: MVT::i32, Op1: Op0.getOperand(i: 0), Op2: Op0.getOperand(i: 1)); |
1730 | ReplaceUses(F: N, T: KTest); |
1731 | MadeChange = true; |
1732 | continue; |
1733 | } |
// Attempt to remove vector moves that were inserted to zero upper bits.
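// Illustrative example (sketch): (SUBREG_TO_REG 0, (VMOVAPSrr x), sub_xmm)
// where x is produced by a VEX/EVEX/XOP-encoded instruction (which already
// zeroes the upper bits itself) can simply use x and drop the move.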
1735 | case TargetOpcode::SUBREG_TO_REG: { |
1736 | unsigned SubRegIdx = N->getConstantOperandVal(Num: 2); |
1737 | if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) |
1738 | continue; |
1739 | |
1740 | SDValue Move = N->getOperand(Num: 1); |
1741 | if (!Move.isMachineOpcode()) |
1742 | continue; |
1743 | |
// Make sure it's one of the move opcodes we recognize.
1745 | switch (Move.getMachineOpcode()) { |
1746 | default: |
1747 | continue; |
1748 | CASE(VMOVAPDrr) CASE(VMOVUPDrr) |
1749 | CASE(VMOVAPSrr) CASE(VMOVUPSrr) |
1750 | CASE(VMOVDQArr) CASE(VMOVDQUrr) |
1751 | CASE(VMOVAPDYrr) CASE(VMOVUPDYrr) |
1752 | CASE(VMOVAPSYrr) CASE(VMOVUPSYrr) |
1753 | CASE(VMOVDQAYrr) CASE(VMOVDQUYrr) |
1754 | CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr) |
1755 | CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr) |
1756 | CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr) |
1757 | CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr) |
1758 | CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr) |
1759 | CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr) |
1760 | CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr) |
1761 | CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr) |
1762 | } |
1763 | #undef CASE |
1764 | |
1765 | SDValue In = Move.getOperand(i: 0); |
1766 | if (!In.isMachineOpcode() || |
1767 | In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) |
1768 | continue; |
1769 | |
1770 | // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers |
1771 | // the SHA instructions which use a legacy encoding. |
1772 | uint64_t TSFlags = getInstrInfo()->get(Opcode: In.getMachineOpcode()).TSFlags; |
1773 | if ((TSFlags & X86II::EncodingMask) != X86II::VEX && |
1774 | (TSFlags & X86II::EncodingMask) != X86II::EVEX && |
1775 | (TSFlags & X86II::EncodingMask) != X86II::XOP) |
1776 | continue; |
1777 | |
1778 | // Producing instruction is another vector instruction. We can drop the |
1779 | // move. |
1780 | CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2)); |
1781 | MadeChange = true; |
1782 | } |
1783 | } |
1784 | } |
1785 | |
1786 | if (MadeChange) |
1787 | CurDAG->RemoveDeadNodes(); |
1788 | } |
1789 | |
1790 | |
1791 | /// Emit any code that needs to be executed only in the main function. |
1792 | void X86DAGToDAGISel::emitSpecialCodeForMain() { |
1793 | if (Subtarget->isTargetCygMing()) { |
1794 | TargetLowering::ArgListTy Args; |
1795 | auto &DL = CurDAG->getDataLayout(); |
1796 | |
1797 | TargetLowering::CallLoweringInfo CLI(*CurDAG); |
1798 | CLI.setChain(CurDAG->getRoot()) |
1799 | .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()), |
1800 | Target: CurDAG->getExternalSymbol(Sym: "__main" , VT: TLI->getPointerTy(DL)), |
1801 | ArgsList: std::move(Args)); |
1802 | const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); |
1803 | std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); |
1804 | CurDAG->setRoot(Result.second); |
1805 | } |
1806 | } |
1807 | |
1808 | void X86DAGToDAGISel::emitFunctionEntryCode() { |
1809 | // If this is main, emit special code for main. |
1810 | const Function &F = MF->getFunction(); |
1811 | if (F.hasExternalLinkage() && F.getName() == "main" ) |
1812 | emitSpecialCodeForMain(); |
1813 | } |
1814 | |
1815 | static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { |
1816 | // We can run into an issue where a frame index or a register base |
1817 | // includes a displacement that, when added to the explicit displacement, |
1818 | // will overflow the displacement field. Assuming that the |
1819 | // displacement fits into a 31-bit integer (which is only slightly more |
1820 | // aggressive than the current fundamental assumption that it fits into |
1821 | // a 32-bit integer), a 31-bit disp should always be safe. |
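// Worked check (illustrative): two contributions that each fit in a signed
// 31-bit range sum to a value that still fits the signed 32-bit displacement
// field, which is why the isInt<31> test below is sufficient.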
1822 | return isInt<31>(x: Val); |
1823 | } |
1824 | |
1825 | bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, |
1826 | X86ISelAddressMode &AM) { |
1827 | // We may have already matched a displacement and the caller just added the |
1828 | // symbolic displacement. So we still need to do the checks even if Offset |
1829 | // is zero. |
1830 | |
1831 | int64_t Val = AM.Disp + Offset; |
1832 | |
1833 | // Cannot combine ExternalSymbol displacements with integer offsets. |
1834 | if (Val != 0 && (AM.ES || AM.MCSym)) |
1835 | return true; |
1836 | |
1837 | CodeModel::Model M = TM.getCodeModel(); |
1838 | if (Subtarget->is64Bit()) { |
1839 | if (Val != 0 && |
1840 | !X86::isOffsetSuitableForCodeModel(Offset: Val, M, |
1841 | hasSymbolicDisplacement: AM.hasSymbolicDisplacement())) |
1842 | return true; |
1843 | // In addition to the checks required for a register base, check that |
1844 | // we do not try to use an unsafe Disp with a frame index. |
1845 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && |
1846 | !isDispSafeForFrameIndexOrRegBase(Val)) |
1847 | return true; |
1848 | // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to |
1849 | // 64 bits. Instructions with 32-bit register addresses perform this zero |
1850 | // extension for us and we can safely ignore the high bits of Offset. |
1851 | // Instructions with only a 32-bit immediate address do not, though: they |
// sign extend instead. This means only the low 2GB of the address space is
// directly addressable; we need indirect addressing for the high 2GB of
// address space.
1855 | // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the |
1856 | // implicit zero extension of instructions would cover up any problem. |
1857 | // However, we have asserts elsewhere that get triggered if we do, so keep |
1858 | // the checks for now. |
1859 | // TODO: We would actually be able to accept these, as well as the same |
1860 | // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand |
1861 | // to get an address size override to be emitted. However, this |
1862 | // pseudo-register is not part of any register class and therefore causes |
1863 | // MIR verification to fail. |
1864 | if (Subtarget->isTarget64BitILP32() && |
1865 | !isDispSafeForFrameIndexOrRegBase(Val: (uint32_t)Val) && |
1866 | !AM.hasBaseOrIndexReg()) |
1867 | return true; |
1868 | } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) |
1869 | // For 32-bit X86, make sure the displacement still isn't close to the |
1870 | // expressible limit. |
1871 | return true; |
1872 | AM.Disp = Val; |
1873 | return false; |
1874 | } |
1875 | |
1876 | bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
1877 | bool AllowSegmentRegForX32) { |
1878 | SDValue Address = N->getOperand(Num: 1); |
1879 | |
1880 | // load gs:0 -> GS segment register. |
1881 | // load fs:0 -> FS segment register. |
1882 | // |
1883 | // This optimization is generally valid because the GNU TLS model defines that |
1884 | // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode |
1885 | // with 32-bit registers, as we get in ILP32 mode, those registers are first |
// zero-extended to 64 bits and then added to the base address, which gives
1887 | // unwanted results when the register holds a negative value. |
1888 | // For more information see http://people.redhat.com/drepper/tls.pdf |
1889 | if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr && |
1890 | !IndirectTlsSegRefs && |
1891 | (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || |
1892 | Subtarget->isTargetFuchsia())) { |
1893 | if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) |
1894 | return true; |
1895 | switch (N->getPointerInfo().getAddrSpace()) { |
1896 | case X86AS::GS: |
1897 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
1898 | return false; |
1899 | case X86AS::FS: |
1900 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
1901 | return false; |
1902 | // Address space X86AS::SS is not handled here, because it is not used to |
1903 | // address TLS areas. |
1904 | } |
1905 | } |
1906 | |
1907 | return true; |
1908 | } |
1909 | |
1910 | /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing |
1911 | /// mode. These wrap things that will resolve down into a symbol reference. |
1912 | /// If no match is possible, this returns true, otherwise it returns false. |
1913 | bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { |
1914 | // If the addressing mode already has a symbol as the displacement, we can |
1915 | // never match another symbol. |
1916 | if (AM.hasSymbolicDisplacement()) |
1917 | return true; |
1918 | |
1919 | bool IsRIPRelTLS = false; |
1920 | bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; |
1921 | if (IsRIPRel) { |
1922 | SDValue Val = N.getOperand(i: 0); |
1923 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
1924 | IsRIPRelTLS = true; |
1925 | } |
1926 | |
1927 | // We can't use an addressing mode in the 64-bit large code model. |
1928 | // Global TLS addressing is an exception. In the medium code model, |
// we can use a mode when RIP wrappers are present.
1930 | // That signifies access to globals that are known to be "near", |
1931 | // such as the GOT itself. |
1932 | CodeModel::Model M = TM.getCodeModel(); |
1933 | if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS) |
1934 | return true; |
1935 | |
1936 | // Base and index reg must be 0 in order to use %rip as base. |
1937 | if (IsRIPRel && AM.hasBaseOrIndexReg()) |
1938 | return true; |
1939 | |
1940 | // Make a local copy in case we can't do this fold. |
1941 | X86ISelAddressMode Backup = AM; |
1942 | |
1943 | int64_t Offset = 0; |
1944 | SDValue N0 = N.getOperand(i: 0); |
1945 | if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) { |
1946 | AM.GV = G->getGlobal(); |
1947 | AM.SymbolFlags = G->getTargetFlags(); |
1948 | Offset = G->getOffset(); |
1949 | } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) { |
1950 | AM.CP = CP->getConstVal(); |
1951 | AM.Alignment = CP->getAlign(); |
1952 | AM.SymbolFlags = CP->getTargetFlags(); |
1953 | Offset = CP->getOffset(); |
1954 | } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) { |
1955 | AM.ES = S->getSymbol(); |
1956 | AM.SymbolFlags = S->getTargetFlags(); |
1957 | } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) { |
1958 | AM.MCSym = S->getMCSymbol(); |
1959 | } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) { |
1960 | AM.JT = J->getIndex(); |
1961 | AM.SymbolFlags = J->getTargetFlags(); |
1962 | } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) { |
1963 | AM.BlockAddr = BA->getBlockAddress(); |
1964 | AM.SymbolFlags = BA->getTargetFlags(); |
1965 | Offset = BA->getOffset(); |
1966 | } else |
1967 | llvm_unreachable("Unhandled symbol reference node." ); |
1968 | |
1969 | // Can't use an addressing mode with large globals. |
1970 | if (Subtarget->is64Bit() && !IsRIPRel && AM.GV && |
1971 | TM.isLargeGlobalValue(GV: AM.GV)) { |
1972 | AM = Backup; |
1973 | return true; |
1974 | } |
1975 | |
1976 | if (foldOffsetIntoAddress(Offset, AM)) { |
1977 | AM = Backup; |
1978 | return true; |
1979 | } |
1980 | |
1981 | if (IsRIPRel) |
1982 | AM.setBaseReg(CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64)); |
1983 | |
1984 | // Commit the changes now that we know this fold is safe. |
1985 | return false; |
1986 | } |
1987 | |
1988 | /// Add the specified node to the specified addressing mode, returning true if |
1989 | /// it cannot be done. This just pattern matches for the addressing mode. |
1990 | bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { |
1991 | if (matchAddressRecursively(N, AM, Depth: 0)) |
1992 | return true; |
1993 | |
1994 | // Post-processing: Make a second attempt to fold a load, if we now know |
1995 | // that there will not be any other register. This is only performed for |
1996 | // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded |
1997 | // any foldable load the first time. |
1998 | if (Subtarget->isTarget64BitILP32() && |
1999 | AM.BaseType == X86ISelAddressMode::RegBase && |
2000 | AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { |
2001 | SDValue Save_Base_Reg = AM.Base_Reg; |
2002 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) { |
2003 | AM.Base_Reg = SDValue(); |
2004 | if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true)) |
2005 | AM.Base_Reg = Save_Base_Reg; |
2006 | } |
2007 | } |
2008 | |
2009 | // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has |
2010 | // a smaller encoding and avoids a scaled-index. |
2011 | if (AM.Scale == 2 && |
2012 | AM.BaseType == X86ISelAddressMode::RegBase && |
2013 | AM.Base_Reg.getNode() == nullptr) { |
2014 | AM.Base_Reg = AM.IndexReg; |
2015 | AM.Scale = 1; |
2016 | } |
2017 | |
2018 | // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, |
2019 | // because it has a smaller encoding. |
2020 | if (TM.getCodeModel() != CodeModel::Large && |
2021 | (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() && |
2022 | AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && |
2023 | AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && |
2024 | AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) { |
2025 | // However, when GV is a local function symbol and in the same section as |
2026 | // the current instruction, and AM.Disp is negative and near INT32_MIN, |
2027 | // referencing GV+Disp generates a relocation referencing the section symbol |
2028 | // with an even smaller offset, which might underflow. We should bail out if |
2029 | // the negative offset is too close to INT32_MIN. Actually, we are more |
2030 | // conservative here, using a smaller magic number also used by |
2031 | // isOffsetSuitableForCodeModel. |
2032 | if (isa_and_nonnull<Function>(Val: AM.GV) && AM.Disp < -16 * 1024 * 1024) |
2033 | return true; |
2034 | |
2035 | AM.Base_Reg = CurDAG->getRegister(Reg: X86::RIP, VT: MVT::i64); |
2036 | } |
2037 | |
2038 | return false; |
2039 | } |
2040 | |
2041 | bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, |
2042 | unsigned Depth) { |
2043 | // Add an artificial use to this node so that we can keep track of |
2044 | // it if it gets CSE'd with a different node. |
2045 | HandleSDNode Handle(N); |
2046 | |
2047 | X86ISelAddressMode Backup = AM; |
2048 | if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) && |
2049 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1)) |
2050 | return false; |
2051 | AM = Backup; |
2052 | |
2053 | // Try again after commutating the operands. |
2054 | if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2055 | Depth: Depth + 1) && |
2056 | !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1)) |
2057 | return false; |
2058 | AM = Backup; |
2059 | |
2060 | // If we couldn't fold both operands into the address at the same time, |
2061 | // see if we can just put each operand into a register and fold at least |
2062 | // the add. |
2063 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2064 | !AM.Base_Reg.getNode() && |
2065 | !AM.IndexReg.getNode()) { |
2066 | N = Handle.getValue(); |
2067 | AM.Base_Reg = N.getOperand(i: 0); |
2068 | AM.IndexReg = N.getOperand(i: 1); |
2069 | AM.Scale = 1; |
2070 | return false; |
2071 | } |
2072 | N = Handle.getValue(); |
2073 | return true; |
2074 | } |
2075 | |
2076 | // Insert a node into the DAG at least before the Pos node's position. This |
2077 | // will reposition the node as needed, and will assign it a node ID that is <= |
2078 | // the Pos node's ID. Note that this does *not* preserve the uniqueness of node |
2079 | // IDs! The selection DAG must no longer depend on their uniqueness when this |
2080 | // is used. |
2081 | static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { |
2082 | if (N->getNodeId() == -1 || |
2083 | (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) > |
2084 | SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) { |
2085 | DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode()); |
// Mark N as invalid for pruning, as after this it may be a successor to a
// selected node but otherwise be in the same position as Pos.
2088 | // Conservatively mark it with the same -abs(Id) to assure node id |
2089 | // invariant is preserved. |
2090 | N->setNodeId(Pos->getNodeId()); |
2091 | SelectionDAGISel::InvalidateNodeId(N: N.getNode()); |
2092 | } |
2093 | } |
2094 | |
2095 | // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if |
2096 | // safe. This allows us to convert the shift and and into an h-register |
2097 | // extract and a scaled index. Returns false if the simplification is |
2098 | // performed. |
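// Illustrative instance (sketch): with C1 == 2, (X >> 6) & 0x3fc becomes
// ((X >> 8) & 0xff) << 2; the extract maps to an h-register read and the
// << 2 is absorbed as a scale of 4 in the addressing mode.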
2099 | static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, |
2100 | uint64_t Mask, |
2101 | SDValue Shift, SDValue X, |
2102 | X86ISelAddressMode &AM) { |
2103 | if (Shift.getOpcode() != ISD::SRL || |
2104 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2105 | !Shift.hasOneUse()) |
2106 | return true; |
2107 | |
2108 | int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1); |
2109 | if (ScaleLog <= 0 || ScaleLog >= 4 || |
2110 | Mask != (0xffu << ScaleLog)) |
2111 | return true; |
2112 | |
2113 | MVT XVT = X.getSimpleValueType(); |
2114 | MVT VT = N.getSimpleValueType(); |
2115 | SDLoc DL(N); |
2116 | SDValue Eight = DAG.getConstant(Val: 8, DL, VT: MVT::i8); |
2117 | SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT); |
2118 | SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight); |
2119 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask); |
2120 | SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT); |
2121 | SDValue ShlCount = DAG.getConstant(Val: ScaleLog, DL, VT: MVT::i8); |
2122 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount); |
2123 | |
2124 | // Insert the new nodes into the topological ordering. We must do this in |
2125 | // a valid topological ordering as nothing is going to go back and re-sort |
2126 | // these nodes. We continually insert before 'N' in sequence as this is |
2127 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2128 | // hierarchy left to express. |
2129 | insertDAGNode(DAG, Pos: N, N: Eight); |
2130 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2131 | insertDAGNode(DAG, Pos: N, N: Srl); |
2132 | insertDAGNode(DAG, Pos: N, N: And); |
2133 | insertDAGNode(DAG, Pos: N, N: Ext); |
2134 | insertDAGNode(DAG, Pos: N, N: ShlCount); |
2135 | insertDAGNode(DAG, Pos: N, N: Shl); |
2136 | DAG.ReplaceAllUsesWith(From: N, To: Shl); |
2137 | DAG.RemoveDeadNode(N: N.getNode()); |
2138 | AM.IndexReg = Ext; |
2139 | AM.Scale = (1 << ScaleLog); |
2140 | return false; |
2141 | } |
2142 | |
2143 | // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this |
2144 | // allows us to fold the shift into this addressing mode. Returns false if the |
2145 | // transform succeeded. |
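// Illustrative instance (sketch): (X << 2) & 0x3fc becomes (X & 0xff) << 2,
// so the shift is matched as a scale of 4 while the AND keeps a smaller
// immediate.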
2146 | static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, |
2147 | X86ISelAddressMode &AM) { |
2148 | SDValue Shift = N.getOperand(i: 0); |
2149 | |
2150 | // Use a signed mask so that shifting right will insert sign bits. These |
2151 | // bits will be removed when we shift the result left so it doesn't matter |
2152 | // what we use. This might allow a smaller immediate encoding. |
2153 | int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue(); |
2154 | |
2155 | // If we have an any_extend feeding the AND, look through it to see if there |
2156 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
2157 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
2158 | bool FoundAnyExtend = false; |
2159 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
2160 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
2161 | isUInt<32>(x: Mask)) { |
2162 | FoundAnyExtend = true; |
2163 | Shift = Shift.getOperand(i: 0); |
2164 | } |
2165 | |
2166 | if (Shift.getOpcode() != ISD::SHL || |
2167 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2168 | return true; |
2169 | |
2170 | SDValue X = Shift.getOperand(i: 0); |
2171 | |
2172 | // Not likely to be profitable if either the AND or SHIFT node has more |
2173 | // than one use (unless all uses are for address computation). Besides, |
2174 | // isel mechanism requires their node ids to be reused. |
2175 | if (!N.hasOneUse() || !Shift.hasOneUse()) |
2176 | return true; |
2177 | |
2178 | // Verify that the shift amount is something we can fold. |
2179 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2180 | if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) |
2181 | return true; |
2182 | |
2183 | MVT VT = N.getSimpleValueType(); |
2184 | SDLoc DL(N); |
2185 | if (FoundAnyExtend) { |
2186 | SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X); |
2187 | insertDAGNode(DAG, Pos: N, N: NewX); |
2188 | X = NewX; |
2189 | } |
2190 | |
2191 | SDValue NewMask = DAG.getSignedConstant(Val: Mask >> ShiftAmt, DL, VT); |
2192 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask); |
2193 | SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1)); |
2194 | |
2195 | // Insert the new nodes into the topological ordering. We must do this in |
2196 | // a valid topological ordering as nothing is going to go back and re-sort |
2197 | // these nodes. We continually insert before 'N' in sequence as this is |
2198 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2199 | // hierarchy left to express. |
2200 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2201 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2202 | insertDAGNode(DAG, Pos: N, N: NewShift); |
2203 | DAG.ReplaceAllUsesWith(From: N, To: NewShift); |
2204 | DAG.RemoveDeadNode(N: N.getNode()); |
2205 | |
2206 | AM.Scale = 1 << ShiftAmt; |
2207 | AM.IndexReg = NewAnd; |
2208 | return false; |
2209 | } |
2210 | |
2211 | // Implement some heroics to detect shifts of masked values where the mask can |
2212 | // be replaced by extending the shift and undoing that in the addressing mode |
2213 | // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and |
2214 | // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in |
2215 | // the addressing mode. This results in code such as: |
2216 | // |
2217 | // int f(short *y, int *lookup_table) { |
2218 | // ... |
2219 | // return *y + lookup_table[*y >> 11]; |
2220 | // } |
2221 | // |
2222 | // Turning into: |
2223 | // movzwl (%rdi), %eax |
2224 | // movl %eax, %ecx |
2225 | // shrl $11, %ecx |
2226 | // addl (%rsi,%rcx,4), %eax |
2227 | // |
2228 | // Instead of: |
2229 | // movzwl (%rdi), %eax |
2230 | // movl %eax, %ecx |
2231 | // shrl $9, %ecx |
2232 | // andl $124, %rcx |
2233 | // addl (%rsi,%rcx), %eax |
2234 | // |
2235 | // Note that this function assumes the mask is provided as a mask *after* the |
2236 | // value is shifted. The input chain may or may not match that, but computing |
2237 | // such a mask is trivial. |
2238 | static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, |
2239 | uint64_t Mask, |
2240 | SDValue Shift, SDValue X, |
2241 | X86ISelAddressMode &AM) { |
2242 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || |
2243 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
2244 | return true; |
2245 | |
2246 | // We need to ensure that mask is a continuous run of bits. |
2247 | unsigned MaskIdx, MaskLen; |
2248 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2249 | return true; |
2250 | unsigned MaskLZ = 64 - (MaskIdx + MaskLen); |
2251 | |
2252 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2253 | |
2254 | // The amount of shift we're trying to fit into the addressing mode is taken |
2255 | // from the shifted mask index (number of trailing zeros of the mask). |
2256 | unsigned AMShiftAmt = MaskIdx; |
2257 | |
2258 | // There is nothing we can do here unless the mask is removing some bits. |
2259 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2260 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2261 | |
2262 | // Scale the leading zero count down based on the actual size of the value. |
2263 | // Also scale it down based on the size of the shift. |
2264 | unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; |
2265 | if (MaskLZ < ScaleDown) |
2266 | return true; |
2267 | MaskLZ -= ScaleDown; |
2268 | |
2269 | // The final check is to ensure that any masked out high bits of X are |
2270 | // already known to be zero. Otherwise, the mask has a semantic impact |
2271 | // other than masking out a couple of low bits. Unfortunately, because of |
2272 | // the mask, zero extensions will be removed from operands in some cases. |
2273 | // This code works extra hard to look through extensions because we can |
2274 | // replace them with zero extensions cheaply if necessary. |
2275 | bool ReplacingAnyExtend = false; |
2276 | if (X.getOpcode() == ISD::ANY_EXTEND) { |
2277 | unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - |
2278 | X.getOperand(i: 0).getSimpleValueType().getSizeInBits(); |
2279 | // Assume that we'll replace the any-extend with a zero-extend, and |
2280 | // narrow the search to the extended value. |
2281 | X = X.getOperand(i: 0); |
2282 | MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; |
2283 | ReplacingAnyExtend = true; |
2284 | } |
2285 | APInt MaskedHighBits = |
2286 | APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ); |
2287 | if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits)) |
2288 | return true; |
2289 | |
2290 | // We've identified a pattern that can be transformed into a single shift |
2291 | // and an addressing mode. Make it so. |
2292 | MVT VT = N.getSimpleValueType(); |
2293 | if (ReplacingAnyExtend) { |
2294 | assert(X.getValueType() != VT); |
2295 | // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. |
2296 | SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X); |
2297 | insertDAGNode(DAG, Pos: N, N: NewX); |
2298 | X = NewX; |
2299 | } |
2300 | |
2301 | MVT XVT = X.getSimpleValueType(); |
2302 | SDLoc DL(N); |
2303 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2304 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2305 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT); |
2306 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2307 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2308 | |
2309 | // Insert the new nodes into the topological ordering. We must do this in |
2310 | // a valid topological ordering as nothing is going to go back and re-sort |
2311 | // these nodes. We continually insert before 'N' in sequence as this is |
2312 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2313 | // hierarchy left to express. |
2314 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2315 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2316 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2317 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2318 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2319 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2320 | DAG.RemoveDeadNode(N: N.getNode()); |
2321 | |
2322 | AM.Scale = 1 << AMShiftAmt; |
2323 | AM.IndexReg = NewExt; |
2324 | return false; |
2325 | } |
2326 | |
2327 | // Transform "(X >> SHIFT) & (MASK << C1)" to |
2328 | // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be |
2329 | // matched to a BEXTR later. Returns false if the simplification is performed. |
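// Illustrative instance (sketch): with SHIFT == 5 and C1 == 2,
// (X >> 5) & (0xff << 2) becomes ((X >> 7) & 0xff) << 2; the srl+and is then
// matched to BEXTR and the trailing shl becomes a scale of 4.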
2330 | static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, |
2331 | uint64_t Mask, |
2332 | SDValue Shift, SDValue X, |
2333 | X86ISelAddressMode &AM, |
2334 | const X86Subtarget &Subtarget) { |
2335 | if (Shift.getOpcode() != ISD::SRL || |
2336 | !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) || |
2337 | !Shift.hasOneUse() || !N.hasOneUse()) |
2338 | return true; |
2339 | |
2340 | // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. |
2341 | if (!Subtarget.hasTBM() && |
2342 | !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) |
2343 | return true; |
2344 | |
2345 | // We need to ensure that mask is a continuous run of bits. |
2346 | unsigned MaskIdx, MaskLen; |
2347 | if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen)) |
2348 | return true; |
2349 | |
2350 | unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1); |
2351 | |
2352 | // The amount of shift we're trying to fit into the addressing mode is taken |
2353 | // from the shifted mask index (number of trailing zeros of the mask). |
2354 | unsigned AMShiftAmt = MaskIdx; |
2355 | |
2356 | // There is nothing we can do here unless the mask is removing some bits. |
2357 | // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. |
2358 | if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; |
2359 | |
2360 | MVT XVT = X.getSimpleValueType(); |
2361 | MVT VT = N.getSimpleValueType(); |
2362 | SDLoc DL(N); |
2363 | SDValue NewSRLAmt = DAG.getConstant(Val: ShiftAmt + AMShiftAmt, DL, VT: MVT::i8); |
2364 | SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt); |
2365 | SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT); |
2366 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask); |
2367 | SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT); |
2368 | SDValue NewSHLAmt = DAG.getConstant(Val: AMShiftAmt, DL, VT: MVT::i8); |
2369 | SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt); |
2370 | |
2371 | // Insert the new nodes into the topological ordering. We must do this in |
2372 | // a valid topological ordering as nothing is going to go back and re-sort |
2373 | // these nodes. We continually insert before 'N' in sequence as this is |
2374 | // essentially a pre-flattened and pre-sorted sequence of nodes. There is no |
2375 | // hierarchy left to express. |
2376 | insertDAGNode(DAG, Pos: N, N: NewSRLAmt); |
2377 | insertDAGNode(DAG, Pos: N, N: NewSRL); |
2378 | insertDAGNode(DAG, Pos: N, N: NewMask); |
2379 | insertDAGNode(DAG, Pos: N, N: NewAnd); |
2380 | insertDAGNode(DAG, Pos: N, N: NewExt); |
2381 | insertDAGNode(DAG, Pos: N, N: NewSHLAmt); |
2382 | insertDAGNode(DAG, Pos: N, N: NewSHL); |
2383 | DAG.ReplaceAllUsesWith(From: N, To: NewSHL); |
2384 | DAG.RemoveDeadNode(N: N.getNode()); |
2385 | |
2386 | AM.Scale = 1 << AMShiftAmt; |
2387 | AM.IndexReg = NewExt; |
2388 | return false; |
2389 | } |
2390 | |
2391 | // Attempt to peek further into a scaled index register, collecting additional |
// extensions / offsets / etc. Returns \p N if we can't peek any further.
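// Illustrative walk (sketch): with Scale == 2, an index of (add x, 8) is
// folded as Disp += 16 and the walk recurses on x, while an index of
// (add x, x) instead doubles Scale to 4 and recurses on x.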
2393 | SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N, |
2394 | X86ISelAddressMode &AM, |
2395 | unsigned Depth) { |
2396 | assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched" ); |
2397 | assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) && |
2398 | "Illegal index scale" ); |
2399 | |
2400 | // Limit recursion. |
2401 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2402 | return N; |
2403 | |
2404 | EVT VT = N.getValueType(); |
2405 | unsigned Opc = N.getOpcode(); |
2406 | |
2407 | // index: add(x,c) -> index: x, disp + c |
2408 | if (CurDAG->isBaseWithConstantOffset(Op: N)) { |
2409 | auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1)); |
2410 | uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale; |
2411 | if (!foldOffsetIntoAddress(Offset, AM)) |
2412 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2413 | } |
2414 | |
2415 | // index: add(x,x) -> index: x, scale * 2 |
2416 | if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) { |
2417 | if (AM.Scale <= 4) { |
2418 | AM.Scale *= 2; |
2419 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2420 | } |
2421 | } |
2422 | |
2423 | // index: shl(x,i) -> index: x, scale * (1 << i) |
2424 | if (Opc == X86ISD::VSHLI) { |
2425 | uint64_t ShiftAmt = N.getConstantOperandVal(i: 1); |
2426 | uint64_t ScaleAmt = 1ULL << ShiftAmt; |
2427 | if ((AM.Scale * ScaleAmt) <= 8) { |
2428 | AM.Scale *= ScaleAmt; |
2429 | return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1); |
2430 | } |
2431 | } |
2432 | |
2433 | // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c) |
2434 | // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext? |
2435 | if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2436 | SDValue Src = N.getOperand(i: 0); |
2437 | if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() && |
2438 | Src.hasOneUse()) { |
2439 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2440 | SDValue AddSrc = Src.getOperand(i: 0); |
2441 | auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1)); |
2442 | int64_t Offset = AddVal->getSExtValue(); |
2443 | if (!foldOffsetIntoAddress(Offset: (uint64_t)Offset * AM.Scale, AM)) { |
2444 | SDLoc DL(N); |
2445 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2446 | SDValue ExtVal = CurDAG->getSignedConstant(Val: Offset, DL, VT); |
2447 | SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal); |
2448 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2449 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2450 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2451 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2452 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2453 | return ExtSrc; |
2454 | } |
2455 | } |
2456 | } |
2457 | } |
2458 | |
2459 | // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c) |
2460 | // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c) |
// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2462 | if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) { |
2463 | SDValue Src = N.getOperand(i: 0); |
2464 | unsigned SrcOpc = Src.getOpcode(); |
2465 | if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) || |
2466 | CurDAG->isADDLike(Op: Src, /*NoWrap=*/true)) && |
2467 | Src.hasOneUse()) { |
2468 | if (CurDAG->isBaseWithConstantOffset(Op: Src)) { |
2469 | SDValue AddSrc = Src.getOperand(i: 0); |
2470 | uint64_t Offset = Src.getConstantOperandVal(i: 1); |
2471 | if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) { |
2472 | SDLoc DL(N); |
2473 | SDValue Res; |
2474 | // If we're also scaling, see if we can use that as well. |
2475 | if (AddSrc.getOpcode() == ISD::SHL && |
2476 | isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) { |
2477 | SDValue ShVal = AddSrc.getOperand(i: 0); |
2478 | uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1); |
2479 | APInt HiBits = |
2480 | APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt); |
2481 | uint64_t ScaleAmt = 1ULL << ShAmt; |
2482 | if ((AM.Scale * ScaleAmt) <= 8 && |
2483 | (AddSrc->getFlags().hasNoUnsignedWrap() || |
2484 | CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) { |
2485 | AM.Scale *= ScaleAmt; |
2486 | SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal); |
2487 | SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal, |
2488 | N2: AddSrc.getOperand(i: 1)); |
2489 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal); |
2490 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift); |
2491 | AddSrc = ExtShift; |
2492 | Res = ExtShVal; |
2493 | } |
2494 | } |
2495 | SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc); |
2496 | SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT); |
2497 | SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal); |
2498 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc); |
2499 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal); |
2500 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd); |
2501 | CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd); |
2502 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2503 | return Res ? Res : ExtSrc; |
2504 | } |
2505 | } |
2506 | } |
2507 | } |
2508 | |
2509 | // TODO: Handle extensions, shifted masks etc. |
2510 | return N; |
2511 | } |
2512 | |
2513 | bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
2514 | unsigned Depth) { |
2515 | LLVM_DEBUG({ |
2516 | dbgs() << "MatchAddress: " ; |
2517 | AM.dump(CurDAG); |
2518 | }); |
2519 | // Limit recursion. |
2520 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2521 | return matchAddressBase(N, AM); |
2522 | |
2523 | // If this is already a %rip relative address, we can only merge immediates |
2524 | // into it. Instead of handling this in every case, we handle it here. |
2525 | // RIP relative addressing: %rip + 32-bit displacement! |
2526 | if (AM.isRIPRelative()) { |
2527 | // FIXME: JumpTable and ExternalSymbol address currently don't like |
2528 | // displacements. It isn't very important, but this should be fixed for |
2529 | // consistency. |
2530 | if (!(AM.ES || AM.MCSym) && AM.JT != -1) |
2531 | return true; |
2532 | |
2533 | if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N)) |
2534 | if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM)) |
2535 | return false; |
2536 | return true; |
2537 | } |
2538 | |
2539 | switch (N.getOpcode()) { |
2540 | default: break; |
2541 | case ISD::LOCAL_RECOVER: { |
2542 | if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) |
2543 | if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) { |
2544 | // Use the symbol and don't prefix it. |
2545 | AM.MCSym = ESNode->getMCSymbol(); |
2546 | return false; |
2547 | } |
2548 | break; |
2549 | } |
2550 | case ISD::Constant: { |
2551 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2552 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2553 | return false; |
2554 | break; |
2555 | } |
2556 | |
2557 | case X86ISD::Wrapper: |
2558 | case X86ISD::WrapperRIP: |
2559 | if (!matchWrapper(N, AM)) |
2560 | return false; |
2561 | break; |
2562 | |
2563 | case ISD::LOAD: |
2564 | if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM)) |
2565 | return false; |
2566 | break; |
2567 | |
2568 | case ISD::FrameIndex: |
2569 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2570 | AM.Base_Reg.getNode() == nullptr && |
2571 | (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(Val: AM.Disp))) { |
2572 | AM.BaseType = X86ISelAddressMode::FrameIndexBase; |
2573 | AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex(); |
2574 | return false; |
2575 | } |
2576 | break; |
2577 | |
2578 | case ISD::SHL: |
2579 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2580 | break; |
2581 | |
2582 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) { |
2583 | unsigned Val = CN->getZExtValue(); |
2584 | // Note that we handle x<<1 as (,x,2) rather than (x,x) here so |
2585 | // that the base operand remains free for further matching. If |
2586 | // the base doesn't end up getting used, a post-processing step |
2587 | // in MatchAddress turns (,x,2) into (x,x), which is cheaper. |
2588 | if (Val == 1 || Val == 2 || Val == 3) { |
2589 | SDValue ShVal = N.getOperand(i: 0); |
2590 | AM.Scale = 1 << Val; |
2591 | AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1); |
2592 | return false; |
2593 | } |
2594 | } |
2595 | break; |
2596 | |
2597 | case ISD::SRL: { |
2598 | // Scale must not be used already. |
2599 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2600 | |
2601 | // We only handle up to 64-bit values here as those are what matter for |
2602 | // addressing mode optimizations. |
2603 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2604 | "Unexpected value size!" ); |
2605 | |
2606 | SDValue And = N.getOperand(i: 0); |
2607 | if (And.getOpcode() != ISD::AND) break; |
2608 | SDValue X = And.getOperand(i: 0); |
2609 | |
2610 | // The mask used for the transform is expected to be post-shift, but we |
2611 | // found the shift first so just apply the shift to the mask before passing |
2612 | // it down. |
2613 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) || |
2614 | !isa<ConstantSDNode>(Val: And.getOperand(i: 1))) |
2615 | break; |
2616 | uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1); |
2617 | |
2618 | // Try to fold the mask and shift into the scale, and return false if we |
2619 | // succeed. |
2620 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM)) |
2621 | return false; |
2622 | break; |
2623 | } |
2624 | |
2625 | case ISD::SMUL_LOHI: |
2626 | case ISD::UMUL_LOHI: |
2627 | // A mul_lohi where we need the low part can be folded as a plain multiply. |
2628 | if (N.getResNo() != 0) break; |
2629 | [[fallthrough]]; |
2630 | case ISD::MUL: |
2631 | case X86ISD::MUL_IMM: |
2632 | // X*[3,5,9] -> X+X*[2,4,8] |
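// e.g. (illustrative) X*5 becomes Base = X, Index = X, Scale = 4, which
// matches lea (%reg,%reg,4).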
2633 | if (AM.BaseType == X86ISelAddressMode::RegBase && |
2634 | AM.Base_Reg.getNode() == nullptr && |
2635 | AM.IndexReg.getNode() == nullptr) { |
2636 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2637 | if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || |
2638 | CN->getZExtValue() == 9) { |
2639 | AM.Scale = unsigned(CN->getZExtValue())-1; |
2640 | |
2641 | SDValue MulVal = N.getOperand(i: 0); |
2642 | SDValue Reg; |
2643 | |
2644 | // Okay, we know that we have a scale by now. However, if the scaled |
2645 | // value is an add of something and a constant, we can fold the |
2646 | // constant into the disp field here. |
2647 | if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && |
2648 | isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) { |
2649 | Reg = MulVal.getOperand(i: 0); |
2650 | auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1)); |
2651 | uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); |
2652 | if (foldOffsetIntoAddress(Offset: Disp, AM)) |
2653 | Reg = N.getOperand(i: 0); |
2654 | } else { |
2655 | Reg = N.getOperand(i: 0); |
2656 | } |
2657 | |
2658 | AM.IndexReg = AM.Base_Reg = Reg; |
2659 | return false; |
2660 | } |
2661 | } |
2662 | break; |
2663 | |
2664 | case ISD::SUB: { |
// Given A-B, if A can be completely folded into the address, with the
// index field left unused, use -B as the index.
// This is a win if A has multiple parts that can be folded into
2668 | // the address. Also, this saves a mov if the base register has |
2669 | // other uses, since it avoids a two-address sub instruction, however |
2670 | // it costs an additional mov if the index register has other uses. |
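// For example, for (sub (add %g, 16), %i) the LHS can fold as Base = %g
// with Disp = 16, and %i is then negated into the index, provided the cost
// heuristic below considers it a win.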
2671 | |
2672 | // Add an artificial use to this node so that we can keep track of |
2673 | // it if it gets CSE'd with a different node. |
2674 | HandleSDNode Handle(N); |
2675 | |
2676 | // Test if the LHS of the sub can be folded. |
2677 | X86ISelAddressMode Backup = AM; |
2678 | if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) { |
2679 | N = Handle.getValue(); |
2680 | AM = Backup; |
2681 | break; |
2682 | } |
2683 | N = Handle.getValue(); |
2684 | // Test if the index field is free for use. |
2685 | if (AM.IndexReg.getNode() || AM.isRIPRelative()) { |
2686 | AM = Backup; |
2687 | break; |
2688 | } |
2689 | |
2690 | int Cost = 0; |
2691 | SDValue RHS = N.getOperand(i: 1); |
2692 | // If the RHS involves a register with multiple uses, this |
2693 | // transformation incurs an extra mov, due to the neg instruction |
2694 | // clobbering its operand. |
2695 | if (!RHS.getNode()->hasOneUse() || |
2696 | RHS.getNode()->getOpcode() == ISD::CopyFromReg || |
2697 | RHS.getNode()->getOpcode() == ISD::TRUNCATE || |
2698 | RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || |
2699 | (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && |
2700 | RHS.getOperand(i: 0).getValueType() == MVT::i32)) |
2701 | ++Cost; |
2702 | // If the base is a register with multiple uses, this |
2703 | // transformation may save a mov. |
2704 | if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && |
2705 | !AM.Base_Reg.getNode()->hasOneUse()) || |
2706 | AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
2707 | --Cost; |
2708 | // If the folded LHS was interesting, this transformation saves |
2709 | // address arithmetic. |
2710 | if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + |
2711 | ((AM.Disp != 0) && (Backup.Disp == 0)) + |
2712 | (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) |
2713 | --Cost; |
2714 | // If it doesn't look like it may be an overall win, don't do it. |
2715 | if (Cost >= 0) { |
2716 | AM = Backup; |
2717 | break; |
2718 | } |
2719 | |
2720 | // Ok, the transformation is legal and appears profitable. Go for it. |
2721 | // Negation will be emitted later to avoid creating dangling nodes if this |
2722 | // was an unprofitable LEA. |
2723 | AM.IndexReg = RHS; |
2724 | AM.NegateIndex = true; |
2725 | AM.Scale = 1; |
2726 | return false; |
2727 | } |
2728 | |
2729 | case ISD::OR: |
2730 | case ISD::XOR: |
2731 | // See if we can treat the OR/XOR node as an ADD node. |
2732 | if (!CurDAG->isADDLike(Op: N)) |
2733 | break; |
2734 | [[fallthrough]]; |
2735 | case ISD::ADD: |
2736 | if (!matchAdd(N, AM, Depth)) |
2737 | return false; |
2738 | break; |
2739 | |
2740 | case ISD::AND: { |
2741 | // Perform some heroic transforms on an and of a constant-count shift |
2742 | // with a constant to enable use of the scaled offset field. |
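// For instance, (and (srl %x, 1), 0x6) is equal to
// (shl (and (srl %x, 2), 0x3), 1), so the outer shift can be absorbed into
// the address as Scale = 2.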
2743 | |
2744 | // Scale must not be used already. |
2745 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; |
2746 | |
2747 | // We only handle up to 64-bit values here as those are what matter for |
2748 | // addressing mode optimizations. |
2749 | assert(N.getSimpleValueType().getSizeInBits() <= 64 && |
2750 | "Unexpected value size!" ); |
2751 | |
2752 | if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1))) |
2753 | break; |
2754 | |
2755 | if (N.getOperand(i: 0).getOpcode() == ISD::SRL) { |
2756 | SDValue Shift = N.getOperand(i: 0); |
2757 | SDValue X = Shift.getOperand(i: 0); |
2758 | |
2759 | uint64_t Mask = N.getConstantOperandVal(i: 1); |
2760 | |
2761 | // Try to fold the mask and shift into an extract and scale. |
2762 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2763 | return false; |
2764 | |
2765 | // Try to fold the mask and shift directly into the scale. |
2766 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM)) |
2767 | return false; |
2768 | |
2769 | // Try to fold the mask and shift into BEXTR and scale. |
2770 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget)) |
2771 | return false; |
2772 | } |
2773 | |
2774 | // Try to swap the mask and shift to place shifts which can be done as |
2775 | // a scale on the outside of the mask. |
2776 | if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM)) |
2777 | return false; |
2778 | |
2779 | break; |
2780 | } |
2781 | case ISD::ZERO_EXTEND: { |
2782 | // Try to widen a zexted shift left to the same size as its use, so we can |
2783 | // match the shift as a scale factor. |
2784 | if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) |
2785 | break; |
2786 | |
2787 | SDValue Src = N.getOperand(i: 0); |
2788 | |
2789 | // See if we can match a zext(addlike(x,c)). |
2790 | // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively. |
2791 | if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR) |
2792 | if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1)) |
2793 | if (Index != N) { |
2794 | AM.IndexReg = Index; |
2795 | return false; |
2796 | } |
2797 | |
2798 | // Peek through mask: zext(and(shl(x,c1),c2)) |
2799 | APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits()); |
2800 | if (Src.getOpcode() == ISD::AND && Src.hasOneUse()) |
2801 | if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) { |
2802 | Mask = MaskC->getAPIntValue(); |
2803 | Src = Src.getOperand(i: 0); |
2804 | } |
2805 | |
2806 | if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) { |
2807 | // Give up if the shift is not a valid scale factor [1,2,3]. |
2808 | SDValue ShlSrc = Src.getOperand(i: 0); |
2809 | SDValue ShlAmt = Src.getOperand(i: 1); |
2810 | auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt); |
2811 | if (!ShAmtC) |
2812 | break; |
2813 | unsigned ShAmtV = ShAmtC->getZExtValue(); |
2814 | if (ShAmtV > 3) |
2815 | break; |
2816 | |
2817 | // The narrow shift must only shift out zero bits (it must be 'nuw'). |
2818 | // That makes it safe to widen to the destination type. |
2819 | APInt HighZeros = |
2820 | APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV); |
2821 | if (!Src->getFlags().hasNoUnsignedWrap() && |
2822 | !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask)) |
2823 | break; |
2824 | |
2825 | // zext (shl nuw i8 %x, C1) to i32 |
2826 | // --> shl (zext i8 %x to i32), (zext C1) |
2827 | // zext (and (shl nuw i8 %x, C1), C2) to i32 |
2828 | // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1) |
2829 | MVT SrcVT = ShlSrc.getSimpleValueType(); |
2830 | MVT VT = N.getSimpleValueType(); |
2831 | SDLoc DL(N); |
2832 | |
2833 | SDValue Res = ShlSrc; |
2834 | if (!Mask.isAllOnes()) { |
2835 | Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT); |
2836 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2837 | Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res); |
2838 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res); |
2839 | } |
2840 | SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res); |
2841 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext); |
2842 | SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt); |
2843 | insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl); |
2844 | CurDAG->ReplaceAllUsesWith(From: N, To: NewShl); |
2845 | CurDAG->RemoveDeadNode(N: N.getNode()); |
2846 | |
2847 | // Convert the shift to scale factor. |
2848 | AM.Scale = 1 << ShAmtV; |
// matchIndexRecursively must be called here; otherwise Zext may be
// replaced by other nodes but still be referenced later when calling a
// builder method.
2852 | AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1); |
2853 | return false; |
2854 | } |
2855 | |
2856 | if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) { |
2857 | // Try to fold the mask and shift into an extract and scale. |
2858 | if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2859 | X: Src.getOperand(i: 0), AM)) |
2860 | return false; |
2861 | |
2862 | // Try to fold the mask and shift directly into the scale. |
2863 | if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2864 | X: Src.getOperand(i: 0), AM)) |
2865 | return false; |
2866 | |
2867 | // Try to fold the mask and shift into BEXTR and scale. |
2868 | if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src, |
2869 | X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget)) |
2870 | return false; |
2871 | } |
2872 | |
2873 | break; |
2874 | } |
2875 | } |
2876 | |
2877 | return matchAddressBase(N, AM); |
2878 | } |
2879 | |
2880 | /// Helper for MatchAddress. Add the specified node to the |
2881 | /// specified addressing mode without any further recursion. |
2882 | bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { |
2883 | // Is the base register already occupied? |
2884 | if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { |
2885 | // If so, check to see if the scale index register is set. |
2886 | if (!AM.IndexReg.getNode()) { |
2887 | AM.IndexReg = N; |
2888 | AM.Scale = 1; |
2889 | return false; |
2890 | } |
2891 | |
2892 | // Otherwise, we cannot select it. |
2893 | return true; |
2894 | } |
2895 | |
2896 | // Default, generate it as a register. |
2897 | AM.BaseType = X86ISelAddressMode::RegBase; |
2898 | AM.Base_Reg = N; |
2899 | return false; |
2900 | } |
2901 | |
2902 | bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N, |
2903 | X86ISelAddressMode &AM, |
2904 | unsigned Depth) { |
2905 | LLVM_DEBUG({ |
2906 | dbgs() << "MatchVectorAddress: " ; |
2907 | AM.dump(CurDAG); |
2908 | }); |
2909 | // Limit recursion. |
2910 | if (Depth >= SelectionDAG::MaxRecursionDepth) |
2911 | return matchAddressBase(N, AM); |
2912 | |
2913 | // TODO: Support other operations. |
2914 | switch (N.getOpcode()) { |
2915 | case ISD::Constant: { |
2916 | uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue(); |
2917 | if (!foldOffsetIntoAddress(Offset: Val, AM)) |
2918 | return false; |
2919 | break; |
2920 | } |
2921 | case X86ISD::Wrapper: |
2922 | if (!matchWrapper(N, AM)) |
2923 | return false; |
2924 | break; |
2925 | case ISD::ADD: { |
2926 | // Add an artificial use to this node so that we can keep track of |
2927 | // it if it gets CSE'd with a different node. |
2928 | HandleSDNode Handle(N); |
2929 | |
2930 | X86ISelAddressMode Backup = AM; |
2931 | if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) && |
2932 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2933 | Depth: Depth + 1)) |
2934 | return false; |
2935 | AM = Backup; |
2936 | |
2937 | // Try again after commuting the operands. |
2938 | if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, |
2939 | Depth: Depth + 1) && |
2940 | !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, |
2941 | Depth: Depth + 1)) |
2942 | return false; |
2943 | AM = Backup; |
2944 | |
2945 | N = Handle.getValue(); |
2946 | break; |
2947 | } |
2948 | } |
2949 | |
2950 | return matchAddressBase(N, AM); |
2951 | } |
2952 | |
2953 | /// Helper for selectVectorAddr. Handles things that can be folded into a |
2954 | /// gather/scatter address. The index register and scale should have already |
2955 | /// been handled. |
2956 | bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { |
2957 | return matchVectorAddressRecursively(N, AM, Depth: 0); |
2958 | } |
2959 | |
2960 | bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, |
2961 | SDValue IndexOp, SDValue ScaleOp, |
2962 | SDValue &Base, SDValue &Scale, |
2963 | SDValue &Index, SDValue &Disp, |
2964 | SDValue &Segment) { |
2965 | X86ISelAddressMode AM; |
2966 | AM.Scale = ScaleOp->getAsZExtVal(); |
2967 | |
2968 | // Attempt to match index patterns, as long as we're not relying on implicit |
2969 | // sign-extension, which is performed BEFORE scale. |
2970 | if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits()) |
2971 | AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0); |
2972 | else |
2973 | AM.IndexReg = IndexOp; |
2974 | |
2975 | unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); |
2976 | if (AddrSpace == X86AS::GS) |
2977 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
2978 | if (AddrSpace == X86AS::FS) |
2979 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
2980 | if (AddrSpace == X86AS::SS) |
2981 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
2982 | |
2983 | SDLoc DL(BasePtr); |
2984 | MVT VT = BasePtr.getSimpleValueType(); |
2985 | |
2986 | // Try to match into the base and displacement fields. |
2987 | if (matchVectorAddress(N: BasePtr, AM)) |
2988 | return false; |
2989 | |
2990 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
2991 | return true; |
2992 | } |
2993 | |
2994 | /// Returns true if it is able to pattern match an addressing mode. |
2995 | /// It returns the operands which make up the maximal addressing mode it can |
2996 | /// match by reference. |
2997 | /// |
2998 | /// Parent is the parent node of the addr operand that is being matched. It |
2999 | /// is always a load, store, atomic node, or null. It is only null when |
3000 | /// checking memory operands for inline asm nodes. |
3001 | bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
3002 | SDValue &Scale, SDValue &Index, |
3003 | SDValue &Disp, SDValue &Segment) { |
3004 | X86ISelAddressMode AM; |
3005 | |
3006 | if (Parent && |
// These opcodes are all the nodes that have an "addr:$ptr" operand but
// are not MemSDNodes, and thus don't have proper addrspace info.
3009 | Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme |
3010 | Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores |
3011 | Parent->getOpcode() != X86ISD::TLSCALL && // Fixme |
3012 | Parent->getOpcode() != X86ISD::ENQCMD && // Fixme |
3013 | Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme |
3014 | Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp |
3015 | Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp |
3016 | unsigned AddrSpace = |
3017 | cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace(); |
3018 | if (AddrSpace == X86AS::GS) |
3019 | AM.Segment = CurDAG->getRegister(Reg: X86::GS, VT: MVT::i16); |
3020 | if (AddrSpace == X86AS::FS) |
3021 | AM.Segment = CurDAG->getRegister(Reg: X86::FS, VT: MVT::i16); |
3022 | if (AddrSpace == X86AS::SS) |
3023 | AM.Segment = CurDAG->getRegister(Reg: X86::SS, VT: MVT::i16); |
3024 | } |
3025 | |
3026 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
3027 | SDLoc DL(N); |
3028 | MVT VT = N.getSimpleValueType(); |
3029 | |
3030 | if (matchAddress(N, AM)) |
3031 | return false; |
3032 | |
3033 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
3034 | return true; |
3035 | } |
3036 | |
3037 | bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { |
3038 | // Cannot use 32 bit constants to reference objects in kernel/large code |
3039 | // model. |
3040 | if (TM.getCodeModel() == CodeModel::Kernel || |
3041 | TM.getCodeModel() == CodeModel::Large) |
3042 | return false; |
3043 | |
3044 | // In static codegen with small code model, we can get the address of a label |
3045 | // into a register with 'movl' |
3046 | if (N->getOpcode() != X86ISD::Wrapper) |
3047 | return false; |
3048 | |
3049 | N = N.getOperand(i: 0); |
3050 | |
3051 | // At least GNU as does not accept 'movl' for TPOFF relocations. |
3052 | // FIXME: We could use 'movl' when we know we are targeting MC. |
3053 | if (N->getOpcode() == ISD::TargetGlobalTLSAddress) |
3054 | return false; |
3055 | |
3056 | Imm = N; |
3057 | // Small/medium code model can reference non-TargetGlobalAddress objects with |
3058 | // 32 bit constants. |
3059 | if (N->getOpcode() != ISD::TargetGlobalAddress) { |
3060 | return TM.getCodeModel() == CodeModel::Small || |
3061 | TM.getCodeModel() == CodeModel::Medium; |
3062 | } |
3063 | |
3064 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal(); |
3065 | if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) |
3066 | return CR->getUnsignedMax().ult(RHS: 1ull << 32); |
3067 | |
3068 | return !TM.isLargeGlobalValue(GV); |
3069 | } |
3070 | |
3071 | bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale, |
3072 | SDValue &Index, SDValue &Disp, |
3073 | SDValue &Segment) { |
3074 | // Save the debug loc before calling selectLEAAddr, in case it invalidates N. |
3075 | SDLoc DL(N); |
3076 | |
3077 | if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) |
3078 | return false; |
3079 | |
3080 | EVT BaseType = Base.getValueType(); |
3081 | unsigned SubReg; |
3082 | if (BaseType == MVT::i8) |
3083 | SubReg = X86::sub_8bit; |
3084 | else if (BaseType == MVT::i16) |
3085 | SubReg = X86::sub_16bit; |
3086 | else |
3087 | SubReg = X86::sub_32bit; |
3088 | |
3089 | auto *RN = dyn_cast<RegisterSDNode>(Val&: Base); |
3090 | if (RN && RN->getReg() == 0) |
3091 | Base = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3092 | else if ((BaseType == MVT::i8 || BaseType == MVT::i16 || |
3093 | BaseType == MVT::i32) && |
3094 | !isa<FrameIndexSDNode>(Val: Base)) { |
3095 | // Base could already be %rip, particularly in the x32 ABI. |
3096 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3097 | VT: MVT::i64), 0); |
3098 | Base = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Base); |
3099 | } |
3100 | |
3101 | [[maybe_unused]] EVT IndexType = Index.getValueType(); |
3102 | RN = dyn_cast<RegisterSDNode>(Val&: Index); |
3103 | if (RN && RN->getReg() == 0) |
3104 | Index = CurDAG->getRegister(Reg: 0, VT: MVT::i64); |
3105 | else { |
3106 | assert((IndexType == BaseType) && |
3107 | "Expect to be extending 8/16/32-bit registers for use in LEA" ); |
3108 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl: DL, |
3109 | VT: MVT::i64), 0); |
3110 | Index = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL, VT: MVT::i64, Operand: ImplDef, Subreg: Index); |
3111 | } |
3112 | |
3113 | return true; |
3114 | } |
3115 | |
3116 | /// Calls SelectAddr and determines if the maximal addressing |
3117 | /// mode it matches can be cost effectively emitted as an LEA instruction. |
3118 | bool X86DAGToDAGISel::selectLEAAddr(SDValue N, |
3119 | SDValue &Base, SDValue &Scale, |
3120 | SDValue &Index, SDValue &Disp, |
3121 | SDValue &Segment) { |
3122 | X86ISelAddressMode AM; |
3123 | |
3124 | // Save the DL and VT before calling matchAddress, it can invalidate N. |
3125 | SDLoc DL(N); |
3126 | MVT VT = N.getSimpleValueType(); |
3127 | |
3128 | // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support |
3129 | // segments. |
3130 | SDValue Copy = AM.Segment; |
3131 | SDValue T = CurDAG->getRegister(Reg: 0, VT: MVT::i32); |
3132 | AM.Segment = T; |
3133 | if (matchAddress(N, AM)) |
3134 | return false; |
assert(T == AM.Segment);
3136 | AM.Segment = Copy; |
3137 | |
3138 | unsigned Complexity = 0; |
3139 | if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) |
3140 | Complexity = 1; |
3141 | else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
3142 | Complexity = 4; |
3143 | |
3144 | if (AM.IndexReg.getNode()) |
3145 | Complexity++; |
3146 | |
3147 | // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with |
3148 | // a simple shift. |
3149 | if (AM.Scale > 1) |
3150 | Complexity++; |
3151 | |
3152 | // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA |
3153 | // to a LEA. This is determined with some experimentation but is by no means |
3154 | // optimal (especially for code size consideration). LEA is nice because of |
3155 | // its three-address nature. Tweak the cost function again when we can run |
3156 | // convertToThreeAddress() at register allocation time. |
3157 | if (AM.hasSymbolicDisplacement()) { |
3158 | // For X86-64, always use LEA to materialize RIP-relative addresses. |
3159 | if (Subtarget->is64Bit()) |
3160 | Complexity = 4; |
3161 | else |
3162 | Complexity += 2; |
3163 | } |
3164 | |
3165 | // Heuristic: try harder to form an LEA from ADD if the operands set flags. |
3166 | // Unlike ADD, LEA does not affect flags, so we will be less likely to require |
3167 | // duplicating flag-producing instructions later in the pipeline. |
3168 | if (N.getOpcode() == ISD::ADD) { |
3169 | auto isMathWithFlags = [](SDValue V) { |
3170 | switch (V.getOpcode()) { |
3171 | case X86ISD::ADD: |
3172 | case X86ISD::SUB: |
3173 | case X86ISD::ADC: |
3174 | case X86ISD::SBB: |
3175 | case X86ISD::SMUL: |
3176 | case X86ISD::UMUL: |
3177 | /* TODO: These opcodes can be added safely, but we may want to justify |
3178 | their inclusion for different reasons (better for reg-alloc). |
3179 | case X86ISD::OR: |
3180 | case X86ISD::XOR: |
3181 | case X86ISD::AND: |
3182 | */ |
3183 | // Value 1 is the flag output of the node - verify it's not dead. |
3184 | return !SDValue(V.getNode(), 1).use_empty(); |
3185 | default: |
3186 | return false; |
3187 | } |
3188 | }; |
3189 | // TODO: We might want to factor in whether there's a load folding |
3190 | // opportunity for the math op that disappears with LEA. |
3191 | if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1))) |
3192 | Complexity++; |
3193 | } |
3194 | |
3195 | if (AM.Disp) |
3196 | Complexity++; |
3197 | |
3198 | // If it isn't worth using an LEA, reject it. |
3199 | if (Complexity <= 2) |
3200 | return false; |
3201 | |
3202 | getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); |
3203 | return true; |
3204 | } |
3205 | |
3206 | /// This is only run on TargetGlobalTLSAddress nodes. |
3207 | bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, |
3208 | SDValue &Scale, SDValue &Index, |
3209 | SDValue &Disp, SDValue &Segment) { |
3210 | assert(N.getOpcode() == ISD::TargetGlobalTLSAddress || |
3211 | N.getOpcode() == ISD::TargetExternalSymbol); |
3212 | |
3213 | X86ISelAddressMode AM; |
3214 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) { |
3215 | AM.GV = GA->getGlobal(); |
3216 | AM.Disp += GA->getOffset(); |
3217 | AM.SymbolFlags = GA->getTargetFlags(); |
3218 | } else { |
3219 | auto *SA = cast<ExternalSymbolSDNode>(Val&: N); |
3220 | AM.ES = SA->getSymbol(); |
3221 | AM.SymbolFlags = SA->getTargetFlags(); |
3222 | } |
3223 | |
3224 | if (Subtarget->is32Bit()) { |
3225 | AM.Scale = 1; |
3226 | AM.IndexReg = CurDAG->getRegister(Reg: X86::EBX, VT: MVT::i32); |
3227 | } |
3228 | |
3229 | MVT VT = N.getSimpleValueType(); |
3230 | getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment); |
3231 | return true; |
3232 | } |
3233 | |
3234 | bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { |
3235 | // Keep track of the original value type and whether this value was |
3236 | // truncated. If we see a truncation from pointer type to VT that truncates |
3237 | // bits that are known to be zero, we can use a narrow reference. |
3238 | EVT VT = N.getValueType(); |
3239 | bool WasTruncated = false; |
3240 | if (N.getOpcode() == ISD::TRUNCATE) { |
3241 | WasTruncated = true; |
3242 | N = N.getOperand(i: 0); |
3243 | } |
3244 | |
3245 | if (N.getOpcode() != X86ISD::Wrapper) |
3246 | return false; |
3247 | |
3248 | // We can only use non-GlobalValues as immediates if they were not truncated, |
3249 | // as we do not have any range information. If we have a GlobalValue and the |
3250 | // address was not truncated, we can select it as an operand directly. |
3251 | unsigned Opc = N.getOperand(i: 0)->getOpcode(); |
3252 | if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { |
3253 | Op = N.getOperand(i: 0); |
3254 | // We can only select the operand directly if we didn't have to look past a |
3255 | // truncate. |
3256 | return !WasTruncated; |
3257 | } |
3258 | |
3259 | // Check that the global's range fits into VT. |
3260 | auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0)); |
3261 | std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); |
3262 | if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits())) |
3263 | return false; |
3264 | |
3265 | // Okay, we can use a narrow reference. |
3266 | Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT, |
3267 | offset: GA->getOffset(), TargetFlags: GA->getTargetFlags()); |
3268 | return true; |
3269 | } |
3270 | |
3271 | bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
3272 | SDValue &Base, SDValue &Scale, |
3273 | SDValue &Index, SDValue &Disp, |
3274 | SDValue &Segment) { |
3275 | assert(Root && P && "Unknown root/parent nodes" ); |
3276 | if (!ISD::isNON_EXTLoad(N: N.getNode()) || |
3277 | !IsProfitableToFold(N, U: P, Root) || |
3278 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3279 | return false; |
3280 | |
3281 | return selectAddr(Parent: N.getNode(), |
3282 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3283 | } |
3284 | |
3285 | bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
3286 | SDValue &Base, SDValue &Scale, |
3287 | SDValue &Index, SDValue &Disp, |
3288 | SDValue &Segment) { |
3289 | assert(Root && P && "Unknown root/parent nodes" ); |
3290 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || |
3291 | !IsProfitableToFold(N, U: P, Root) || |
3292 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3293 | return false; |
3294 | |
3295 | return selectAddr(Parent: N.getNode(), |
3296 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3297 | } |
3298 | |
3299 | /// Return an SDNode that returns the value of the global base register. |
3300 | /// Output instructions required to initialize the global base register, |
3301 | /// if necessary. |
3302 | SDNode *X86DAGToDAGISel::getGlobalBaseReg() { |
3303 | Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); |
3304 | auto &DL = MF->getDataLayout(); |
3305 | return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode(); |
3306 | } |
3307 | |
3308 | bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { |
3309 | if (N->getOpcode() == ISD::TRUNCATE) |
3310 | N = N->getOperand(Num: 0).getNode(); |
3311 | if (N->getOpcode() != X86ISD::Wrapper) |
3312 | return false; |
3313 | |
3314 | auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0)); |
3315 | if (!GA) |
3316 | return false; |
3317 | |
3318 | auto *GV = GA->getGlobal(); |
3319 | std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange(); |
3320 | if (CR) |
3321 | return CR->getSignedMin().sge(RHS: -1ull << Width) && |
3322 | CR->getSignedMax().slt(RHS: 1ull << Width); |
3323 | // In the kernel code model, globals are in the negative 2GB of the address |
3324 | // space, so globals can be a sign extended 32-bit immediate. |
3325 | // In other code models, small globals are in the low 2GB of the address |
3326 | // space, so sign extending them is equivalent to zero extending them. |
3327 | return Width == 32 && !TM.isLargeGlobalValue(GV); |
3328 | } |
3329 | |
3330 | X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { |
3331 | assert(N->isMachineOpcode() && "Unexpected node" ); |
3332 | unsigned Opc = N->getMachineOpcode(); |
3333 | const MCInstrDesc &MCID = getInstrInfo()->get(Opcode: Opc); |
3334 | int CondNo = X86::getCondSrcNoFromDesc(MCID); |
3335 | if (CondNo < 0) |
3336 | return X86::COND_INVALID; |
3337 | |
3338 | return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo)); |
3339 | } |
3340 | |
3341 | /// Test whether the given X86ISD::CMP node has any users that use a flag |
3342 | /// other than ZF. |
3343 | bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { |
3344 | // Examine each user of the node. |
3345 | for (SDUse &Use : Flags->uses()) { |
3346 | // Only check things that use the flags. |
3347 | if (Use.getResNo() != Flags.getResNo()) |
3348 | continue; |
3349 | SDNode *User = Use.getUser(); |
3350 | // Only examine CopyToReg uses that copy to EFLAGS. |
3351 | if (User->getOpcode() != ISD::CopyToReg || |
3352 | cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3353 | return false; |
3354 | // Examine each user of the CopyToReg use. |
3355 | for (SDUse &FlagUse : User->uses()) { |
3356 | // Only examine the Flag result. |
3357 | if (FlagUse.getResNo() != 1) |
3358 | continue; |
3359 | // Anything unusual: assume conservatively. |
3360 | if (!FlagUse.getUser()->isMachineOpcode()) |
3361 | return false; |
3362 | // Examine the condition code of the user. |
3363 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
3364 | |
3365 | switch (CC) { |
3366 | // Comparisons which only use the zero flag. |
3367 | case X86::COND_E: case X86::COND_NE: |
3368 | continue; |
3369 | // Anything else: assume conservatively. |
3370 | default: |
3371 | return false; |
3372 | } |
3373 | } |
3374 | } |
3375 | return true; |
3376 | } |
3377 | |
3378 | /// Test whether the given X86ISD::CMP node has any uses which require the SF |
3379 | /// flag to be accurate. |
3380 | bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { |
3381 | // Examine each user of the node. |
3382 | for (SDUse &Use : Flags->uses()) { |
3383 | // Only check things that use the flags. |
3384 | if (Use.getResNo() != Flags.getResNo()) |
3385 | continue; |
3386 | SDNode *User = Use.getUser(); |
3387 | // Only examine CopyToReg uses that copy to EFLAGS. |
3388 | if (User->getOpcode() != ISD::CopyToReg || |
3389 | cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3390 | return false; |
3391 | // Examine each user of the CopyToReg use. |
3392 | for (SDUse &FlagUse : User->uses()) { |
3393 | // Only examine the Flag result. |
3394 | if (FlagUse.getResNo() != 1) |
3395 | continue; |
3396 | // Anything unusual: assume conservatively. |
3397 | if (!FlagUse.getUser()->isMachineOpcode()) |
3398 | return false; |
3399 | // Examine the condition code of the user. |
3400 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
3401 | |
3402 | switch (CC) { |
3403 | // Comparisons which don't examine the SF flag. |
3404 | case X86::COND_A: case X86::COND_AE: |
3405 | case X86::COND_B: case X86::COND_BE: |
3406 | case X86::COND_E: case X86::COND_NE: |
3407 | case X86::COND_O: case X86::COND_NO: |
3408 | case X86::COND_P: case X86::COND_NP: |
3409 | continue; |
3410 | // Anything else: assume conservatively. |
3411 | default: |
3412 | return false; |
3413 | } |
3414 | } |
3415 | } |
3416 | return true; |
3417 | } |
3418 | |
3419 | static bool mayUseCarryFlag(X86::CondCode CC) { |
3420 | switch (CC) { |
3421 | // Comparisons which don't examine the CF flag. |
3422 | case X86::COND_O: case X86::COND_NO: |
3423 | case X86::COND_E: case X86::COND_NE: |
3424 | case X86::COND_S: case X86::COND_NS: |
3425 | case X86::COND_P: case X86::COND_NP: |
3426 | case X86::COND_L: case X86::COND_GE: |
3427 | case X86::COND_G: case X86::COND_LE: |
3428 | return false; |
3429 | // Anything else: assume conservatively. |
3430 | default: |
3431 | return true; |
3432 | } |
3433 | } |
3434 | |
3435 | /// Test whether the given node which sets flags has any uses which require the |
3436 | /// CF flag to be accurate. |
3437 | bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { |
3438 | // Examine each user of the node. |
3439 | for (SDUse &Use : Flags->uses()) { |
3440 | // Only check things that use the flags. |
3441 | if (Use.getResNo() != Flags.getResNo()) |
3442 | continue; |
3443 | |
3444 | SDNode *User = Use.getUser(); |
3445 | unsigned UserOpc = User->getOpcode(); |
3446 | |
3447 | if (UserOpc == ISD::CopyToReg) { |
3448 | // Only examine CopyToReg uses that copy to EFLAGS. |
3449 | if (cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg() != X86::EFLAGS) |
3450 | return false; |
3451 | // Examine each user of the CopyToReg use. |
3452 | for (SDUse &FlagUse : User->uses()) { |
3453 | // Only examine the Flag result. |
3454 | if (FlagUse.getResNo() != 1) |
3455 | continue; |
3456 | // Anything unusual: assume conservatively. |
3457 | if (!FlagUse.getUser()->isMachineOpcode()) |
3458 | return false; |
3459 | // Examine the condition code of the user. |
3460 | X86::CondCode CC = getCondFromNode(N: FlagUse.getUser()); |
3461 | |
3462 | if (mayUseCarryFlag(CC)) |
3463 | return false; |
3464 | } |
3465 | |
3466 | // This CopyToReg is ok. Move on to the next user. |
3467 | continue; |
3468 | } |
3469 | |
3470 | // This might be an unselected node. So look for the pre-isel opcodes that |
3471 | // use flags. |
3472 | unsigned CCOpNo; |
3473 | switch (UserOpc) { |
3474 | default: |
3475 | // Something unusual. Be conservative. |
3476 | return false; |
3477 | case X86ISD::SETCC: CCOpNo = 0; break; |
3478 | case X86ISD::SETCC_CARRY: CCOpNo = 0; break; |
3479 | case X86ISD::CMOV: CCOpNo = 2; break; |
3480 | case X86ISD::BRCOND: CCOpNo = 2; break; |
3481 | } |
3482 | |
3483 | X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(Num: CCOpNo); |
3484 | if (mayUseCarryFlag(CC)) |
3485 | return false; |
3486 | } |
3487 | return true; |
3488 | } |
3489 | |
3490 | /// Check whether or not the chain ending in StoreNode is suitable for doing |
3491 | /// the {load; op; store} to modify transformation. |
3492 | static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, |
3493 | SDValue StoredVal, SelectionDAG *CurDAG, |
3494 | unsigned LoadOpNo, |
3495 | LoadSDNode *&LoadNode, |
3496 | SDValue &InputChain) { |
3497 | // Is the stored value result 0 of the operation? |
3498 | if (StoredVal.getResNo() != 0) return false; |
3499 | |
3500 | // Are there other uses of the operation other than the store? |
3501 | if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false; |
3502 | |
3503 | // Is the store non-extending and non-indexed? |
3504 | if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal()) |
3505 | return false; |
3506 | |
3507 | SDValue Load = StoredVal->getOperand(Num: LoadOpNo); |
3508 | // Is the stored value a non-extending and non-indexed load? |
3509 | if (!ISD::isNormalLoad(N: Load.getNode())) return false; |
3510 | |
3511 | // Return LoadNode by reference. |
3512 | LoadNode = cast<LoadSDNode>(Val&: Load); |
3513 | |
// Is the store the only read of the loaded value?
3515 | if (!Load.hasOneUse()) |
3516 | return false; |
3517 | |
3518 | // Is the address of the store the same as the load? |
3519 | if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || |
3520 | LoadNode->getOffset() != StoreNode->getOffset()) |
3521 | return false; |
3522 | |
3523 | bool FoundLoad = false; |
3524 | SmallVector<SDValue, 4> ChainOps; |
3525 | SmallVector<const SDNode *, 4> LoopWorklist; |
3526 | SmallPtrSet<const SDNode *, 16> Visited; |
3527 | const unsigned int Max = 1024; |
3528 | |
3529 | // Visualization of Load-Op-Store fusion: |
3530 | // ------------------------- |
3531 | // Legend: |
3532 | // *-lines = Chain operand dependencies. |
3533 | // |-lines = Normal operand dependencies. |
3534 | // Dependencies flow down and right. n-suffix references multiple nodes. |
3535 | // |
3536 | // C Xn C |
3537 | // * * * |
3538 | // * * * |
3539 | // Xn A-LD Yn TF Yn |
3540 | // * * \ | * | |
3541 | // * * \ | * | |
3542 | // * * \ | => A--LD_OP_ST |
3543 | // * * \| \ |
3544 | // TF OP \ |
3545 | // * | \ Zn |
3546 | // * | \ |
3547 | // A-ST Zn |
3548 | // |
3549 | |
3550 | // This merge induced dependences from: #1: Xn -> LD, OP, Zn |
3551 | // #2: Yn -> LD |
3552 | // #3: ST -> Zn |
3553 | |
3554 | // Ensure the transform is safe by checking for the dual |
3555 | // dependencies to make sure we do not induce a loop. |
3556 | |
3557 | // As LD is a predecessor to both OP and ST we can do this by checking: |
3558 | // a). if LD is a predecessor to a member of Xn or Yn. |
3559 | // b). if a Zn is a predecessor to ST. |
3560 | |
3561 | // However, (b) can only occur through being a chain predecessor to |
3562 | // ST, which is the same as Zn being a member or predecessor of Xn, |
3563 | // which is a subset of LD being a predecessor of Xn. So it's |
3564 | // subsumed by check (a). |
3565 | |
3566 | SDValue Chain = StoreNode->getChain(); |
3567 | |
3568 | // Gather X elements in ChainOps. |
3569 | if (Chain == Load.getValue(R: 1)) { |
3570 | FoundLoad = true; |
3571 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3572 | } else if (Chain.getOpcode() == ISD::TokenFactor) { |
3573 | for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { |
3574 | SDValue Op = Chain.getOperand(i); |
3575 | if (Op == Load.getValue(R: 1)) { |
3576 | FoundLoad = true; |
3577 | // Drop Load, but keep its chain. No cycle check necessary. |
3578 | ChainOps.push_back(Elt: Load.getOperand(i: 0)); |
3579 | continue; |
3580 | } |
3581 | LoopWorklist.push_back(Elt: Op.getNode()); |
3582 | ChainOps.push_back(Elt: Op); |
3583 | } |
3584 | } |
3585 | |
3586 | if (!FoundLoad) |
3587 | return false; |
3588 | |
3589 | // Worklist is currently Xn. Add Yn to worklist. |
3590 | for (SDValue Op : StoredVal->ops()) |
3591 | if (Op.getNode() != LoadNode) |
3592 | LoopWorklist.push_back(Elt: Op.getNode()); |
3593 | |
3594 | // Check (a) if Load is a predecessor to Xn + Yn |
3595 | if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max, |
3596 | TopologicalPrune: true)) |
3597 | return false; |
3598 | |
3599 | InputChain = |
3600 | CurDAG->getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ChainOps); |
3601 | return true; |
3602 | } |
3603 | |
3604 | // Change a chain of {load; op; store} of the same value into a simple op |
3605 | // through memory of that value, if the uses of the modified value and its |
3606 | // address are suitable. |
3607 | // |
// The tablegen memory operand pattern is currently not able to match the
// case where the EFLAGS on the original operation are used.
3610 | // |
3611 | // To move this to tablegen, we'll need to improve tablegen to allow flags to |
3612 | // be transferred from a node in the pattern to the result node, probably with |
3613 | // a new keyword. For example, we have this |
3614 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3615 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>; |
3616 | // but maybe need something like this |
3617 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3618 | // [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst), |
3619 | // (transferrable EFLAGS)]>; |
3620 | // |
3621 | // Until then, we manually fold these and instruction select the operation |
3622 | // here. |
3623 | bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { |
3624 | auto *StoreNode = cast<StoreSDNode>(Val: Node); |
3625 | SDValue StoredVal = StoreNode->getOperand(Num: 1); |
3626 | unsigned Opc = StoredVal->getOpcode(); |
3627 | |
3628 | // Before we try to select anything, make sure this is memory operand size |
3629 | // and opcode we can handle. Note that this must match the code below that |
3630 | // actually lowers the opcodes. |
3631 | EVT MemVT = StoreNode->getMemoryVT(); |
3632 | if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && |
3633 | MemVT != MVT::i8) |
3634 | return false; |
3635 | |
3636 | bool IsCommutable = false; |
3637 | bool IsNegate = false; |
3638 | switch (Opc) { |
3639 | default: |
3640 | return false; |
3641 | case X86ISD::SUB: |
3642 | IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0)); |
3643 | break; |
3644 | case X86ISD::SBB: |
3645 | break; |
3646 | case X86ISD::ADD: |
3647 | case X86ISD::ADC: |
3648 | case X86ISD::AND: |
3649 | case X86ISD::OR: |
3650 | case X86ISD::XOR: |
3651 | IsCommutable = true; |
3652 | break; |
3653 | } |
3654 | |
3655 | unsigned LoadOpNo = IsNegate ? 1 : 0; |
3656 | LoadSDNode *LoadNode = nullptr; |
3657 | SDValue InputChain; |
3658 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3659 | LoadNode, InputChain)) { |
3660 | if (!IsCommutable) |
3661 | return false; |
3662 | |
3663 | // This operation is commutable, try the other operand. |
3664 | LoadOpNo = 1; |
3665 | if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, |
3666 | LoadNode, InputChain)) |
3667 | return false; |
3668 | } |
3669 | |
3670 | SDValue Base, Scale, Index, Disp, Segment; |
3671 | if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp, |
3672 | Segment)) |
3673 | return false; |
3674 | |
3675 | auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, |
3676 | unsigned Opc8) { |
3677 | switch (MemVT.getSimpleVT().SimpleTy) { |
3678 | case MVT::i64: |
3679 | return Opc64; |
3680 | case MVT::i32: |
3681 | return Opc32; |
3682 | case MVT::i16: |
3683 | return Opc16; |
3684 | case MVT::i8: |
3685 | return Opc8; |
3686 | default: |
3687 | llvm_unreachable("Invalid size!" ); |
3688 | } |
3689 | }; |
3690 | |
3691 | MachineSDNode *Result; |
3692 | switch (Opc) { |
3693 | case X86ISD::SUB: |
3694 | // Handle negate. |
3695 | if (IsNegate) { |
3696 | unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, |
3697 | X86::NEG8m); |
3698 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3699 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3700 | VT2: MVT::Other, Ops); |
3701 | break; |
3702 | } |
3703 | [[fallthrough]]; |
3704 | case X86ISD::ADD: |
3705 | // Try to match inc/dec. |
3706 | if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { |
3707 | bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1)); |
3708 | bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1)); |
// An ADD/SUB by 1/-1 whose carry flag isn't used can be selected as inc/dec.
3710 | if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3711 | unsigned NewOpc = |
3712 | ((Opc == X86ISD::ADD) == IsOne) |
3713 | ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) |
3714 | : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); |
3715 | const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; |
3716 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, |
3717 | VT2: MVT::Other, Ops); |
3718 | break; |
3719 | } |
3720 | } |
3721 | [[fallthrough]]; |
3722 | case X86ISD::ADC: |
3723 | case X86ISD::SBB: |
3724 | case X86ISD::AND: |
3725 | case X86ISD::OR: |
3726 | case X86ISD::XOR: { |
3727 | auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { |
3728 | switch (Opc) { |
3729 | case X86ISD::ADD: |
3730 | return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, |
3731 | X86::ADD8mr); |
3732 | case X86ISD::ADC: |
3733 | return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, |
3734 | X86::ADC8mr); |
3735 | case X86ISD::SUB: |
3736 | return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, |
3737 | X86::SUB8mr); |
3738 | case X86ISD::SBB: |
3739 | return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, |
3740 | X86::SBB8mr); |
3741 | case X86ISD::AND: |
3742 | return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, |
3743 | X86::AND8mr); |
3744 | case X86ISD::OR: |
3745 | return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); |
3746 | case X86ISD::XOR: |
3747 | return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, |
3748 | X86::XOR8mr); |
3749 | default: |
3750 | llvm_unreachable("Invalid opcode!" ); |
3751 | } |
3752 | }; |
3753 | auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { |
3754 | switch (Opc) { |
3755 | case X86ISD::ADD: |
3756 | return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, |
3757 | X86::ADD8mi); |
3758 | case X86ISD::ADC: |
3759 | return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, |
3760 | X86::ADC8mi); |
3761 | case X86ISD::SUB: |
3762 | return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, |
3763 | X86::SUB8mi); |
3764 | case X86ISD::SBB: |
3765 | return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, |
3766 | X86::SBB8mi); |
3767 | case X86ISD::AND: |
3768 | return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, |
3769 | X86::AND8mi); |
3770 | case X86ISD::OR: |
3771 | return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, |
3772 | X86::OR8mi); |
3773 | case X86ISD::XOR: |
3774 | return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, |
3775 | X86::XOR8mi); |
3776 | default: |
3777 | llvm_unreachable("Invalid opcode!" ); |
3778 | } |
3779 | }; |
3780 | |
3781 | unsigned NewOpc = SelectRegOpcode(Opc); |
3782 | SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo); |
3783 | |
3784 | // See if the operand is a constant that we can fold into an immediate |
3785 | // operand. |
3786 | if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) { |
3787 | int64_t OperandV = OperandC->getSExtValue(); |
3788 | |
3789 | // Check if we can shrink the operand enough to fit in an immediate (or |
3790 | // fit into a smaller immediate) by negating it and switching the |
3791 | // operation. |
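// For example, on a 32-bit memory operand 'add $128' needs a 32-bit
// immediate (128 is not a signed 8-bit value), while the equivalent
// 'sub $-128' fits in a sign-extended 8-bit immediate.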
3792 | if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && |
3793 | ((MemVT != MVT::i8 && !isInt<8>(x: OperandV) && isInt<8>(x: -OperandV)) || |
3794 | (MemVT == MVT::i64 && !isInt<32>(x: OperandV) && |
3795 | isInt<32>(x: -OperandV))) && |
3796 | hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) { |
3797 | OperandV = -OperandV; |
3798 | Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; |
3799 | } |
3800 | |
3801 | if (MemVT != MVT::i64 || isInt<32>(x: OperandV)) { |
3802 | Operand = CurDAG->getSignedTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT); |
3803 | NewOpc = SelectImmOpcode(Opc); |
3804 | } |
3805 | } |
3806 | |
3807 | if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { |
3808 | SDValue CopyTo = |
3809 | CurDAG->getCopyToReg(Chain: InputChain, dl: SDLoc(Node), Reg: X86::EFLAGS, |
3810 | N: StoredVal.getOperand(i: 2), Glue: SDValue()); |
3811 | |
3812 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3813 | Segment, Operand, CopyTo, CopyTo.getValue(R: 1)}; |
3814 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3815 | Ops); |
3816 | } else { |
3817 | const SDValue Ops[] = {Base, Scale, Index, Disp, |
3818 | Segment, Operand, InputChain}; |
3819 | Result = CurDAG->getMachineNode(Opcode: NewOpc, dl: SDLoc(Node), VT1: MVT::i32, VT2: MVT::Other, |
3820 | Ops); |
3821 | } |
3822 | break; |
3823 | } |
3824 | default: |
3825 | llvm_unreachable("Invalid opcode!" ); |
3826 | } |
3827 | |
3828 | MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), |
3829 | LoadNode->getMemOperand()}; |
3830 | CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps); |
3831 | |
3832 | // Update Load Chain uses as well. |
3833 | ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1)); |
3834 | ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1)); |
3835 | ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0)); |
3836 | CurDAG->RemoveDeadNode(N: Node); |
3837 | return true; |
3838 | } |
3839 | |
3840 | // See if this is an X & Mask that we can match to BEXTR/BZHI. |
3841 | // Where Mask is one of the following patterns: |
3842 | // a) x & (1 << nbits) - 1 |
3843 | // b) x & ~(-1 << nbits) |
3844 | // c) x & (-1 >> (32 - y)) |
3845 | // d) x << (32 - y) >> (32 - y) |
3846 | // e) (1 << nbits) - 1 |
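// For pattern (a), for instance, (and %x, (add (shl 1, %n), -1)) keeps the
// low %n bits of %x; with BMI2 this can select to BZHI, and with only BMI it
// becomes a BEXTR whose control word is (%n << 8) (start bit 0, length %n).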
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3848 | assert( |
3849 | (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND || |
3850 | Node->getOpcode() == ISD::SRL) && |
3851 | "Should be either an and-mask, or right-shift after clearing high bits." ); |
3852 | |
3853 | // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. |
3854 | if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) |
3855 | return false; |
3856 | |
3857 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
3858 | |
3859 | // Only supported for 32 and 64 bits. |
3860 | if (NVT != MVT::i32 && NVT != MVT::i64) |
3861 | return false; |
3862 | |
3863 | SDValue NBits; |
3864 | bool NegateNBits; |
3865 | |
// If we have BMI2's BZHI, we are ok with multi-use patterns.
// Else, if we only have BMI1's BEXTR, we require one-use.
const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3869 | auto checkUses = [AllowExtraUsesByDefault]( |
3870 | SDValue Op, unsigned NUses, |
std::optional<bool> AllowExtraUses) {
3872 | return AllowExtraUses.value_or(u: AllowExtraUsesByDefault) || |
3873 | Op.getNode()->hasNUsesOfValue(NUses, Value: Op.getResNo()); |
3874 | }; |
3875 | auto checkOneUse = [checkUses](SDValue Op, |
std::optional<bool> AllowExtraUses =
3877 | std::nullopt) { |
3878 | return checkUses(Op, 1, AllowExtraUses); |
3879 | }; |
3880 | auto checkTwoUse = [checkUses](SDValue Op, |
std::optional<bool> AllowExtraUses =
3882 | std::nullopt) { |
3883 | return checkUses(Op, 2, AllowExtraUses); |
3884 | }; |
3885 | |
3886 | auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { |
3887 | if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { |
3888 | assert(V.getSimpleValueType() == MVT::i32 && |
3889 | V.getOperand(0).getSimpleValueType() == MVT::i64 && |
3890 | "Expected i64 -> i32 truncation" ); |
3891 | V = V.getOperand(i: 0); |
3892 | } |
3893 | return V; |
3894 | }; |
3895 | |
3896 | // a) x & ((1 << nbits) + (-1)) |
3897 | auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, |
3898 | &NegateNBits](SDValue Mask) -> bool { |
3899 | // Match `add`. Must only have one use! |
3900 | if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) |
3901 | return false; |
3902 | // We should be adding all-ones constant (i.e. subtracting one.) |
3903 | if (!isAllOnesConstant(V: Mask->getOperand(Num: 1))) |
3904 | return false; |
3905 | // Match `1 << nbits`. Might be truncated. Must only have one use! |
3906 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3907 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3908 | return false; |
3909 | if (!isOneConstant(V: M0->getOperand(Num: 0))) |
3910 | return false; |
3911 | NBits = M0->getOperand(Num: 1); |
3912 | NegateNBits = false; |
3913 | return true; |
3914 | }; |
3915 | |
3916 | auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { |
3917 | V = peekThroughOneUseTruncation(V); |
3918 | return CurDAG->MaskedValueIsAllOnes( |
3919 | Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(), |
3920 | loBitsSet: NVT.getSizeInBits())); |
3921 | }; |
3922 | |
3923 | // b) x & ~(-1 << nbits) |
3924 | auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, |
3925 | &NBits, &NegateNBits](SDValue Mask) -> bool { |
// Match the `not` (an xor with all-ones). Must only have one use!
3927 | if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) |
3928 | return false; |
3929 | // The -1 only has to be all-ones for the final Node's NVT. |
3930 | if (!isAllOnes(Mask->getOperand(Num: 1))) |
3931 | return false; |
3932 | // Match `-1 << nbits`. Might be truncated. Must only have one use! |
3933 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3934 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3935 | return false; |
3936 | // The -1 only has to be all-ones for the final Node's NVT. |
3937 | if (!isAllOnes(M0->getOperand(Num: 0))) |
3938 | return false; |
3939 | NBits = M0->getOperand(Num: 1); |
3940 | NegateNBits = false; |
3941 | return true; |
3942 | }; |
3943 | |
3944 | // Try to match potentially-truncated shift amount as `(bitwidth - y)`, |
3945 | // or leave the shift amount as-is, but then we'll have to negate it. |
3946 | auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, |
3947 | unsigned Bitwidth) { |
3948 | NBits = ShiftAmt; |
3949 | NegateNBits = true; |
3950 | // Skip over a truncate of the shift amount, if any. |
3951 | if (NBits.getOpcode() == ISD::TRUNCATE) |
3952 | NBits = NBits.getOperand(i: 0); |
3953 | // Try to match the shift amount as (bitwidth - y). It should go away, too. |
3954 | // If it doesn't match, that's fine, we'll just negate it ourselves. |
3955 | if (NBits.getOpcode() != ISD::SUB) |
3956 | return; |
3957 | auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0)); |
3958 | if (!V0 || V0->getZExtValue() != Bitwidth) |
3959 | return; |
3960 | NBits = NBits.getOperand(i: 1); |
3961 | NegateNBits = false; |
3962 | }; |
3963 | |
3964 | // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth |
3965 | // or |
3966 | // c) x & (-1 >> (32 - y)) |
3967 | auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, |
3968 | canonicalizeShiftAmt](SDValue Mask) -> bool { |
3969 | // The mask itself may be truncated. |
3970 | Mask = peekThroughOneUseTruncation(Mask); |
3971 | unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); |
3972 | // Match `l>>`. Must only have one use! |
3973 | if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) |
3974 | return false; |
3975 | // We should be shifting truly all-ones constant. |
3976 | if (!isAllOnesConstant(V: Mask.getOperand(i: 0))) |
3977 | return false; |
3978 | SDValue M1 = Mask.getOperand(i: 1); |
3979 | // The shift amount should not be used externally. |
3980 | if (!checkOneUse(M1)) |
3981 | return false; |
3982 | canonicalizeShiftAmt(M1, Bitwidth); |
3983 | // Pattern c. is non-canonical, and is expanded into pattern d. iff there |
3984 | // is no extra use of the mask. Clearly, there was one since we are here. |
3985 | // But at the same time, if we need to negate the shift amount, |
3986 | // then we don't want the mask to stick around, else it's unprofitable. |
3987 | return !NegateNBits; |
3988 | }; |
3989 | |
3990 | SDValue X; |
3991 | |
3992 | // d) x << z >> z but then we'll have to subtract z from bitwidth |
3993 | // or |
3994 | // d) x << (32 - y) >> (32 - y) |
3995 | auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, |
3996 | AllowExtraUsesByDefault, &NegateNBits, |
3997 | &X](SDNode *Node) -> bool { |
3998 | if (Node->getOpcode() != ISD::SRL) |
3999 | return false; |
4000 | SDValue N0 = Node->getOperand(Num: 0); |
4001 | if (N0->getOpcode() != ISD::SHL) |
4002 | return false; |
4003 | unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); |
4004 | SDValue N1 = Node->getOperand(Num: 1); |
4005 | SDValue N01 = N0->getOperand(Num: 1); |
4006 | // Both of the shifts must be by the exact same value. |
4007 | if (N1 != N01) |
4008 | return false; |
4009 | canonicalizeShiftAmt(N1, Bitwidth); |
4010 | // There should not be any external uses of the inner shift / shift amount. |
4011 | // Note that while we are generally okay with external uses given BMI2, |
4012 | // iff we need to negate the shift amount, we are not okay with extra uses. |
const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4014 | if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) |
4015 | return false; |
4016 | X = N0->getOperand(Num: 0); |
4017 | return true; |
4018 | }; |
4019 | |
4020 | auto matchLowBitMask = [matchPatternA, matchPatternB, |
4021 | matchPatternC](SDValue Mask) -> bool { |
4022 | return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); |
4023 | }; |
4024 | |
4025 | if (Node->getOpcode() == ISD::AND) { |
4026 | X = Node->getOperand(Num: 0); |
4027 | SDValue Mask = Node->getOperand(Num: 1); |
4028 | |
4029 | if (matchLowBitMask(Mask)) { |
4030 | // Great. |
4031 | } else { |
4032 | std::swap(a&: X, b&: Mask); |
4033 | if (!matchLowBitMask(Mask)) |
4034 | return false; |
4035 | } |
4036 | } else if (matchLowBitMask(SDValue(Node, 0))) { |
4037 | X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT); |
4038 | } else if (!matchPatternD(Node)) |
4039 | return false; |
4040 | |
4041 | // If we need to negate the shift amount, require BMI2 BZHI support. |
4042 | // It's just too unprofitable for BMI1 BEXTR. |
4043 | if (NegateNBits && !Subtarget->hasBMI2()) |
4044 | return false; |
4045 | |
4046 | SDLoc DL(Node); |
4047 | |
4048 | // Truncate the shift amount. |
4049 | NBits = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NBits); |
4050 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4051 | |
4052 | // Insert 8-bit NBits into lowest 8 bits of 32-bit register. |
4053 | // All the other bits are undefined, we do not care about them. |
4054 | SDValue ImplDef = SDValue( |
4055 | CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i32), 0); |
4056 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef); |
4057 | |
4058 | SDValue SRIdxVal = CurDAG->getTargetConstant(Val: X86::sub_8bit, DL, VT: MVT::i32); |
4059 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal); |
4060 | NBits = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::INSERT_SUBREG, dl: DL, |
4061 | VT: MVT::i32, Op1: ImplDef, Op2: NBits, Op3: SRIdxVal), |
4062 | 0); |
4063 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4064 | |
4065 | // We might have matched the amount of high bits to be cleared, |
4066 | // but we want the amount of low bits to be kept, so negate it then. |
4067 | if (NegateNBits) { |
4068 | SDValue BitWidthC = CurDAG->getConstant(Val: NVT.getSizeInBits(), DL, VT: MVT::i32); |
4069 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC); |
4070 | |
4071 | NBits = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: BitWidthC, N2: NBits); |
4072 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4073 | } |
4074 | |
4075 | if (Subtarget->hasBMI2()) { |
4076 | // Great, just emit the BZHI.. |
4077 | if (NVT != MVT::i32) { |
4078 |       // But we have to place the bit count into a wide-enough register first.
4079 | NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits); |
4080 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
4081 | } |
4082 | |
4083 |     SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
4084 | ReplaceNode(F: Node, T: Extract.getNode()); |
4085 | SelectCode(N: Extract.getNode()); |
4086 | return true; |
4087 | } |
4088 | |
4089 |   // Else, if we do *NOT* have BMI2, let's find out if 'X' is
4090 |   // *logically* shifted (potentially with a one-use trunc in between),
4091 | // and the truncation was the only use of the shift, |
4092 | // and if so look past one-use truncation. |
4093 | { |
4094 | SDValue RealX = peekThroughOneUseTruncation(X); |
4095 | // FIXME: only if the shift is one-use? |
4096 | if (RealX != X && RealX.getOpcode() == ISD::SRL) |
4097 | X = RealX; |
4098 | } |
4099 | |
4100 | MVT XVT = X.getSimpleValueType(); |
4101 | |
4102 | // Else, emitting BEXTR requires one more step. |
4103 | // The 'control' of BEXTR has the pattern of: |
4104 | // [15...8 bit][ 7...0 bit] location |
4105 | // [ bit count][ shift] name |
4106 |   // I.e. 0b00000010'00000001 means  (x >> 0b1) & 0b11
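  // For example, if NBits turns out to be 4 and no shift of 'X' gets folded
  // in below, the control ends up as (4 << 8) == 0x0400, i.e. "extract 4 bits
  // starting at bit 0".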
4107 | |
4108 | // Shift NBits left by 8 bits, thus producing 'control'. |
4109 |   // This makes the low 8 bits zero.
4110 | SDValue C8 = CurDAG->getConstant(Val: 8, DL, VT: MVT::i8); |
4111 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8); |
4112 | SDValue Control = CurDAG->getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: NBits, N2: C8); |
4113 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4114 | |
4115 | // If the 'X' is *logically* shifted, we can fold that shift into 'control'. |
4116 | // FIXME: only if the shift is one-use? |
4117 | if (X.getOpcode() == ISD::SRL) { |
4118 | SDValue ShiftAmt = X.getOperand(i: 1); |
4119 | X = X.getOperand(i: 0); |
4120 | |
4121 | assert(ShiftAmt.getValueType() == MVT::i8 && |
4122 | "Expected shift amount to be i8" ); |
4123 | |
4124 | // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! |
4125 | // We could zext to i16 in some form, but we intentionally don't do that. |
4126 | SDValue OrigShiftAmt = ShiftAmt; |
4127 | ShiftAmt = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: ShiftAmt); |
4128 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt); |
4129 | |
4130 | // And now 'or' these low 8 bits of shift amount into the 'control'. |
4131 | Control = CurDAG->getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Control, N2: ShiftAmt); |
4132 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4133 | } |
4134 | |
4135 |   // But we have to place the 'control' into a wide-enough register first.
4136 | if (XVT != MVT::i32) { |
4137 | Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control); |
4138 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4139 | } |
4140 | |
4141 | // And finally, form the BEXTR itself. |
4142 |   SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4143 | |
4144 | // The 'X' was originally truncated. Do that now. |
4145 | if (XVT != NVT) { |
4146 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract); |
4147 | Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract); |
4148 | } |
4149 | |
4150 | ReplaceNode(F: Node, T: Extract.getNode()); |
4151 | SelectCode(N: Extract.getNode()); |
4152 | |
4153 | return true; |
4154 | } |
4155 | |
4156 | // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. |
4157 | MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { |
4158 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
4159 | SDLoc dl(Node); |
4160 | |
4161 | SDValue N0 = Node->getOperand(Num: 0); |
4162 | SDValue N1 = Node->getOperand(Num: 1); |
4163 | |
4164 | // If we have TBM we can use an immediate for the control. If we have BMI |
4165 | // we should only do this if the BEXTR instruction is implemented well. |
4166 | // Otherwise moving the control into a register makes this more costly. |
4167 | // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM |
4168 | // hoisting the move immediate would make it worthwhile with a less optimal |
4169 | // BEXTR? |
4170 | bool PreferBEXTR = |
4171 | Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); |
4172 | if (!PreferBEXTR && !Subtarget->hasBMI2()) |
4173 | return nullptr; |
4174 | |
4175 | // Must have a shift right. |
4176 | if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) |
4177 | return nullptr; |
4178 | |
4179 | // Shift can't have additional users. |
4180 | if (!N0->hasOneUse()) |
4181 | return nullptr; |
4182 | |
4183 | // Only supported for 32 and 64 bits. |
4184 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4185 | return nullptr; |
4186 | |
4187 | // Shift amount and RHS of and must be constant. |
4188 | auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1); |
4189 | auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1)); |
4190 | if (!MaskCst || !ShiftCst) |
4191 | return nullptr; |
4192 | |
4193 | // And RHS must be a mask. |
4194 | uint64_t Mask = MaskCst->getZExtValue(); |
4195 | if (!isMask_64(Value: Mask)) |
4196 | return nullptr; |
4197 | |
4198 | uint64_t Shift = ShiftCst->getZExtValue(); |
4199 | uint64_t MaskSize = llvm::popcount(Value: Mask); |
4200 | |
4201 | // Don't interfere with something that can be handled by extracting AH. |
4202 | // TODO: If we are able to fold a load, BEXTR might still be better than AH. |
4203 | if (Shift == 8 && MaskSize == 8) |
4204 | return nullptr; |
4205 | |
4206 | // Make sure we are only using bits that were in the original value, not |
4207 | // shifted in. |
4208 | if (Shift + MaskSize > NVT.getSizeInBits()) |
4209 | return nullptr; |
4210 | |
4211 | // BZHI, if available, is always fast, unlike BEXTR. But even if we decide |
4212 | // that we can't use BEXTR, it is only worthwhile using BZHI if the mask |
4213 | // does not fit into 32 bits. Load folding is not a sufficient reason. |
4214 | if (!PreferBEXTR && MaskSize <= 32) |
4215 | return nullptr; |
4216 | |
4217 | SDValue Control; |
4218 | unsigned ROpc, MOpc; |
4219 | |
4220 | #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC) |
4221 | if (!PreferBEXTR) { |
4222 | assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then." ); |
4223 | // If we can't make use of BEXTR then we can't fuse shift+mask stages. |
4224 | // Let's perform the mask first, and apply shift later. Note that we need to |
4225 | // widen the mask to account for the fact that we'll apply shift afterwards! |
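    // For example, (x >> 8) & ((1ULL << 40) - 1) gives Shift == 8 and
    // MaskSize == 40, so BZHI keeps the low 48 bits of x and the SHR emitted
    // below then drops the bottom 8 of them.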
4226 | Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT); |
4227 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr) |
4228 | : GET_EGPR_IF_ENABLED(X86::BZHI32rr); |
4229 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm) |
4230 | : GET_EGPR_IF_ENABLED(X86::BZHI32rm); |
4231 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4232 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4233 | } else { |
4234 | // The 'control' of BEXTR has the pattern of: |
4235 | // [15...8 bit][ 7...0 bit] location |
4236 | // [ bit count][ shift] name |
4237 |     // I.e. 0b00000010'00000001 means  (x >> 0b1) & 0b11
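    // For example, (x >> 4) & 0xff gives Shift == 4 and MaskSize == 8, so the
    // control constant is 0x0804.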
4238 | Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT); |
4239 | if (Subtarget->hasTBM()) { |
4240 | ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; |
4241 | MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; |
4242 | } else { |
4243 | assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then." ); |
4244 |       // BMI requires the immediate to be placed in a register.
4245 | ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr) |
4246 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rr); |
4247 | MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm) |
4248 | : GET_EGPR_IF_ENABLED(X86::BEXTR32rm); |
4249 | unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; |
4250 | Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0); |
4251 | } |
4252 | } |
4253 | |
4254 | MachineSDNode *NewNode; |
4255 | SDValue Input = N0->getOperand(Num: 0); |
4256 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4257 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4258 | SDValue Ops[] = { |
4259 | Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)}; |
4260 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
4261 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4262 | // Update the chain. |
4263 | ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2)); |
4264 | // Record the mem-refs |
4265 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()}); |
4266 | } else { |
4267 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT1: NVT, VT2: MVT::i32, Op1: Input, Op2: Control); |
4268 | } |
4269 | |
4270 | if (!PreferBEXTR) { |
4271 | // We still need to apply the shift. |
4272 | SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT); |
4273 | unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri) |
4274 | : GET_ND_IF_ENABLED(X86::SHR32ri); |
4275 | NewNode = |
4276 | CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt); |
4277 | } |
4278 | |
4279 | return NewNode; |
4280 | } |
4281 | |
4282 | // Emit a PCMISTR(I/M) instruction. |
4283 | MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, |
4284 | bool MayFoldLoad, const SDLoc &dl, |
4285 | MVT VT, SDNode *Node) { |
4286 | SDValue N0 = Node->getOperand(Num: 0); |
4287 | SDValue N1 = Node->getOperand(Num: 1); |
4288 | SDValue Imm = Node->getOperand(Num: 2); |
4289 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4290 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4291 | |
4292 | // Try to fold a load. No need to check alignment. |
4293 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4294 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4295 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4296 | N1.getOperand(i: 0) }; |
4297 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other); |
4298 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4299 | // Update the chain. |
4300 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2)); |
4301 | // Record the mem-refs |
4302 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
4303 | return CNode; |
4304 | } |
4305 | |
4306 | SDValue Ops[] = { N0, N1, Imm }; |
4307 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32); |
4308 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4309 | return CNode; |
4310 | } |
4311 | |
4312 | // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need |
4313 | // to emit a second instruction after this one. This is needed since we have two |
4314 | // copyToReg nodes glued before this and we need to continue that glue through. |
4315 | MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, |
4316 | bool MayFoldLoad, const SDLoc &dl, |
4317 | MVT VT, SDNode *Node, |
4318 | SDValue &InGlue) { |
4319 | SDValue N0 = Node->getOperand(Num: 0); |
4320 | SDValue N2 = Node->getOperand(Num: 2); |
4321 | SDValue Imm = Node->getOperand(Num: 4); |
4322 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4323 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4324 | |
4325 | // Try to fold a load. No need to check alignment. |
4326 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4327 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4328 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4329 | N2.getOperand(i: 0), InGlue }; |
4330 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); |
4331 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4332 | InGlue = SDValue(CNode, 3); |
4333 | // Update the chain. |
4334 | ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2)); |
4335 | // Record the mem-refs |
4336 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()}); |
4337 | return CNode; |
4338 | } |
4339 | |
4340 | SDValue Ops[] = { N0, N2, Imm, InGlue }; |
4341 | SDVTList VTs = CurDAG->getVTList(VT1: VT, VT2: MVT::i32, VT3: MVT::Glue); |
4342 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4343 | InGlue = SDValue(CNode, 2); |
4344 | return CNode; |
4345 | } |
4346 | |
4347 | bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { |
4348 | EVT VT = N->getValueType(ResNo: 0); |
4349 | |
4350 | // Only handle scalar shifts. |
4351 | if (VT.isVector()) |
4352 | return false; |
4353 | |
4354 | // Narrower shifts only mask to 5 bits in hardware. |
4355 | unsigned Size = VT == MVT::i64 ? 64 : 32; |
4356 | |
4357 | SDValue OrigShiftAmt = N->getOperand(Num: 1); |
4358 | SDValue ShiftAmt = OrigShiftAmt; |
4359 | SDLoc DL(N); |
4360 | |
4361 | // Skip over a truncate of the shift amount. |
4362 | if (ShiftAmt->getOpcode() == ISD::TRUNCATE) |
4363 | ShiftAmt = ShiftAmt->getOperand(Num: 0); |
4364 | |
4365 | // This function is called after X86DAGToDAGISel::matchBitExtract(), |
4366 | // so we are not afraid that we might mess up BZHI/BEXTR pattern. |
4367 | |
4368 | SDValue NewShiftAmt; |
4369 | if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB || |
4370 | ShiftAmt->getOpcode() == ISD::XOR) { |
4371 | SDValue Add0 = ShiftAmt->getOperand(Num: 0); |
4372 | SDValue Add1 = ShiftAmt->getOperand(Num: 1); |
4373 | auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0); |
4374 | auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1); |
4375 | // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X |
4376 | // to avoid the ADD/SUB/XOR. |
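    // For example, a 64-bit shift by (y + 64) can just shift by y, since the
    // hardware only looks at the low 6 bits of the count anyway.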
4377 | if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) { |
4378 | NewShiftAmt = Add0; |
4379 | |
4380 | } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() && |
4381 | ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) || |
4382 | (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) { |
4383 | // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X |
4384 | // we can replace it with a NOT. In the XOR case it may save some code |
4385 | // size, in the SUB case it also may save a move. |
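      // For example, with Size == 32, both (31 - y) and (31 ^ y) agree with
      // ~y in their low 5 bits, which is all the shift looks at.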
4386 | assert(Add0C == nullptr || Add1C == nullptr); |
4387 | |
4388 | // We can only do N-X, not X-N |
4389 | if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr) |
4390 | return false; |
4391 | |
4392 | EVT OpVT = ShiftAmt.getValueType(); |
4393 | |
4394 | SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT); |
4395 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT, |
4396 | N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes); |
4397 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes); |
4398 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4399 | // If we are shifting by N-X where N == 0 mod Size, then just shift by |
4400 | // -X to generate a NEG instead of a SUB of a constant. |
4401 | } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C && |
4402 | Add0C->getZExtValue() != 0) { |
4403 | EVT SubVT = ShiftAmt.getValueType(); |
4404 | SDValue X; |
4405 | if (Add0C->getZExtValue() % Size == 0) |
4406 | X = Add1; |
4407 | else if (ShiftAmt.hasOneUse() && Size == 64 && |
4408 | Add0C->getZExtValue() % 32 == 0) { |
4409 | // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32). |
4410 | // This is mainly beneficial if we already compute (x+n*32). |
4411 | if (Add1.getOpcode() == ISD::TRUNCATE) { |
4412 | Add1 = Add1.getOperand(i: 0); |
4413 | SubVT = Add1.getValueType(); |
4414 | } |
4415 | if (Add0.getValueType() != SubVT) { |
4416 | Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT); |
4417 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0); |
4418 | } |
4419 | |
4420 | X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0); |
4421 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X); |
4422 | } else |
4423 | return false; |
4424 | // Insert a negate op. |
4425 | // TODO: This isn't guaranteed to replace the sub if there is a logic cone |
4426 | // that uses it that's not a shift. |
4427 | SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT); |
4428 | SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X); |
4429 | NewShiftAmt = Neg; |
4430 | |
4431 | // Insert these operands into a valid topological order so they can |
4432 | // get selected independently. |
4433 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero); |
4434 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg); |
4435 | } else |
4436 | return false; |
4437 | } else |
4438 | return false; |
4439 | |
4440 | if (NewShiftAmt.getValueType() != MVT::i8) { |
4441 | // Need to truncate the shift amount. |
4442 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: NewShiftAmt); |
4443 | // Add to a correct topological ordering. |
4444 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4445 | } |
4446 | |
4447 | // Insert a new mask to keep the shift amount legal. This should be removed |
4448 | // by isel patterns. |
4449 | NewShiftAmt = CurDAG->getNode(Opcode: ISD::AND, DL, VT: MVT::i8, N1: NewShiftAmt, |
4450 | N2: CurDAG->getConstant(Val: Size - 1, DL, VT: MVT::i8)); |
4451 | // Place in a correct topological ordering. |
4452 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt); |
4453 | |
4454 | SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), |
4455 | Op2: NewShiftAmt); |
4456 | if (UpdatedNode != N) { |
4457 | // If we found an existing node, we should replace ourselves with that node |
4458 | // and wait for it to be selected after its other users. |
4459 | ReplaceNode(F: N, T: UpdatedNode); |
4460 | return true; |
4461 | } |
4462 | |
4463 | // If the original shift amount is now dead, delete it so that we don't run |
4464 | // it through isel. |
4465 | if (OrigShiftAmt.getNode()->use_empty()) |
4466 | CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode()); |
4467 | |
4468 | // Now that we've optimized the shift amount, defer to normal isel to get |
4469 | // load folding and legacy vs BMI2 selection without repeating it here. |
4470 | SelectCode(N); |
4471 | return true; |
4472 | } |
4473 | |
4474 | bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { |
4475 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4476 | unsigned Opcode = N->getOpcode(); |
4477 | SDLoc dl(N); |
4478 | |
4479 | // For operations of the form (x << C1) op C2, check if we can use a smaller |
4480 | // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. |
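  // For example, (x << 8) | 0x1000 can be rewritten as (x | 0x10) << 8,
  // shrinking the OR immediate from 32 bits down to 8.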
4481 | SDValue Shift = N->getOperand(Num: 0); |
4482 | SDValue N1 = N->getOperand(Num: 1); |
4483 | |
4484 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
4485 | if (!Cst) |
4486 | return false; |
4487 | |
4488 | int64_t Val = Cst->getSExtValue(); |
4489 | |
4490 | // If we have an any_extend feeding the AND, look through it to see if there |
4491 | // is a shift behind it. But only if the AND doesn't use the extended bits. |
4492 | // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? |
4493 | bool FoundAnyExtend = false; |
4494 | if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && |
4495 | Shift.getOperand(i: 0).getSimpleValueType() == MVT::i32 && |
4496 | isUInt<32>(x: Val)) { |
4497 | FoundAnyExtend = true; |
4498 | Shift = Shift.getOperand(i: 0); |
4499 | } |
4500 | |
4501 | if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) |
4502 | return false; |
4503 | |
4504 | // i8 is unshrinkable, i16 should be promoted to i32. |
4505 | if (NVT != MVT::i32 && NVT != MVT::i64) |
4506 | return false; |
4507 | |
4508 | auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)); |
4509 | if (!ShlCst) |
4510 | return false; |
4511 | |
4512 | uint64_t ShAmt = ShlCst->getZExtValue(); |
4513 | |
4514 | // Make sure that we don't change the operation by removing bits. |
4515 | // This only matters for OR and XOR, AND is unaffected. |
4516 | uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; |
4517 | if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) |
4518 | return false; |
4519 | |
4520 | // Check the minimum bitwidth for the new constant. |
4521 | // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. |
4522 | auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { |
4523 | if (Opcode == ISD::AND) { |
4524 | // AND32ri is the same as AND64ri32 with zext imm. |
4525 | // Try this before sign extended immediates below. |
4526 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4527 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4528 | return true; |
4529 | // Also swap order when the AND can become MOVZX. |
4530 | if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) |
4531 | return true; |
4532 | } |
4533 | ShiftedVal = Val >> ShAmt; |
4534 | if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) || |
4535 | (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal))) |
4536 | return true; |
4537 | if (Opcode != ISD::AND) { |
4538 | // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr |
4539 | ShiftedVal = (uint64_t)Val >> ShAmt; |
4540 | if (NVT == MVT::i64 && !isUInt<32>(x: Val) && isUInt<32>(x: ShiftedVal)) |
4541 | return true; |
4542 | } |
4543 | return false; |
4544 | }; |
4545 | |
4546 | int64_t ShiftedVal; |
4547 | if (!CanShrinkImmediate(ShiftedVal)) |
4548 | return false; |
4549 | |
4550 | // Ok, we can reorder to get a smaller immediate. |
4551 | |
4552 |   // But, it's possible the original immediate allowed an AND to become MOVZX.
4553 |   // Do this check late so the MaskedValueIsZero call happens as late as
4554 |   // possible.
4555 | if (Opcode == ISD::AND) { |
4556 | // Find the smallest zext this could possibly be. |
4557 | unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); |
4558 | ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U)); |
4559 | |
4560 | // Figure out which bits need to be zero to achieve that mask. |
4561 | APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(), |
4562 | loBitsSet: ZExtWidth); |
4563 | NeededMask &= ~Cst->getAPIntValue(); |
4564 | |
4565 | if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask)) |
4566 | return false; |
4567 | } |
4568 | |
4569 | SDValue X = Shift.getOperand(i: 0); |
4570 | if (FoundAnyExtend) { |
4571 | SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X); |
4572 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX); |
4573 | X = NewX; |
4574 | } |
4575 | |
4576 | SDValue NewCst = CurDAG->getSignedConstant(Val: ShiftedVal, DL: dl, VT: NVT); |
4577 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst); |
4578 | SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst); |
4579 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp); |
4580 | SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp, |
4581 | N2: Shift.getOperand(i: 1)); |
4582 | ReplaceNode(F: N, T: NewSHL.getNode()); |
4583 | SelectCode(N: NewSHL.getNode()); |
4584 | return true; |
4585 | } |
4586 | |
4587 | bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, |
4588 | SDNode *ParentB, SDNode *ParentC, |
4589 | SDValue A, SDValue B, SDValue C, |
4590 | uint8_t Imm) { |
4591 | assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) && |
4592 | C.isOperandOf(ParentC) && "Incorrect parent node" ); |
4593 | |
4594 | auto tryFoldLoadOrBCast = |
4595 | [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, |
4596 | SDValue &Index, SDValue &Disp, SDValue &Segment) { |
4597 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4598 | return true; |
4599 | |
4600 | // Not a load, check for broadcast which may be behind a bitcast. |
4601 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4602 | P = L.getNode(); |
4603 | L = L.getOperand(i: 0); |
4604 | } |
4605 | |
4606 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4607 | return false; |
4608 | |
4609 | // Only 32 and 64 bit broadcasts are supported. |
4610 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4611 | unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); |
4612 | if (Size != 32 && Size != 64) |
4613 | return false; |
4614 | |
4615 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
4616 | }; |
4617 | |
4618 | bool FoldedLoad = false; |
4619 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4620 | if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { |
4621 | FoldedLoad = true; |
4622 | } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, |
4623 | Tmp4)) { |
4624 | FoldedLoad = true; |
4625 | std::swap(a&: A, b&: C); |
4626 | // Swap bits 1/4 and 3/6. |
4627 | uint8_t OldImm = Imm; |
4628 | Imm = OldImm & 0xa5; |
4629 | if (OldImm & 0x02) Imm |= 0x10; |
4630 | if (OldImm & 0x10) Imm |= 0x02; |
4631 | if (OldImm & 0x08) Imm |= 0x40; |
4632 | if (OldImm & 0x40) Imm |= 0x08; |
4633 | } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3, |
4634 | Tmp4)) { |
4635 | FoldedLoad = true; |
4636 | std::swap(a&: B, b&: C); |
4637 | // Swap bits 1/2 and 5/6. |
4638 | uint8_t OldImm = Imm; |
4639 | Imm = OldImm & 0x99; |
4640 | if (OldImm & 0x02) Imm |= 0x04; |
4641 | if (OldImm & 0x04) Imm |= 0x02; |
4642 | if (OldImm & 0x20) Imm |= 0x40; |
4643 | if (OldImm & 0x40) Imm |= 0x20; |
4644 | } |
4645 | |
4646 | SDLoc DL(Root); |
4647 | |
4648 | SDValue TImm = CurDAG->getTargetConstant(Val: Imm, DL, VT: MVT::i8); |
4649 | |
4650 | MVT NVT = Root->getSimpleValueType(ResNo: 0); |
4651 | |
4652 | MachineSDNode *MNode; |
4653 | if (FoldedLoad) { |
4654 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
4655 | |
4656 | unsigned Opc; |
4657 | if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { |
4658 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C); |
4659 | unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); |
4660 | assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!" ); |
4661 | |
4662 | bool UseD = EltSize == 32; |
4663 | if (NVT.is128BitVector()) |
4664 | Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; |
4665 | else if (NVT.is256BitVector()) |
4666 | Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; |
4667 | else if (NVT.is512BitVector()) |
4668 | Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; |
4669 | else |
4670 | llvm_unreachable("Unexpected vector size!" ); |
4671 | } else { |
4672 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4673 | if (NVT.is128BitVector()) |
4674 | Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; |
4675 | else if (NVT.is256BitVector()) |
4676 | Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; |
4677 | else if (NVT.is512BitVector()) |
4678 | Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; |
4679 | else |
4680 | llvm_unreachable("Unexpected vector size!" ); |
4681 | } |
4682 | |
4683 | SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)}; |
4684 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops); |
4685 | |
4686 | // Update the chain. |
4687 | ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1)); |
4688 | // Record the mem-refs |
4689 | CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()}); |
4690 | } else { |
4691 | bool UseD = NVT.getVectorElementType() == MVT::i32; |
4692 | unsigned Opc; |
4693 | if (NVT.is128BitVector()) |
4694 | Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; |
4695 | else if (NVT.is256BitVector()) |
4696 | Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; |
4697 | else if (NVT.is512BitVector()) |
4698 | Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; |
4699 | else |
4700 | llvm_unreachable("Unexpected vector size!" ); |
4701 | |
4702 | MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm}); |
4703 | } |
4704 | |
4705 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0)); |
4706 | CurDAG->RemoveDeadNode(N: Root); |
4707 | return true; |
4708 | } |
4709 | |
4710 | // Try to match two logic ops to a VPTERNLOG. |
4711 | // FIXME: Handle more complex patterns that use an operand more than once? |
4712 | bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { |
4713 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4714 | |
4715 | // Make sure we support VPTERNLOG. |
4716 | if (!NVT.isVector() || !Subtarget->hasAVX512() || |
4717 | NVT.getVectorElementType() == MVT::i1) |
4718 | return false; |
4719 | |
4720 | // We need VLX for 128/256-bit. |
4721 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
4722 | return false; |
4723 | |
4724 | SDValue N0 = N->getOperand(Num: 0); |
4725 | SDValue N1 = N->getOperand(Num: 1); |
4726 | |
4727 | auto getFoldableLogicOp = [](SDValue Op) { |
4728 | // Peek through single use bitcast. |
4729 | if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) |
4730 | Op = Op.getOperand(i: 0); |
4731 | |
4732 | if (!Op.hasOneUse()) |
4733 | return SDValue(); |
4734 | |
4735 | unsigned Opc = Op.getOpcode(); |
4736 | if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || |
4737 | Opc == X86ISD::ANDNP) |
4738 | return Op; |
4739 | |
4740 | return SDValue(); |
4741 | }; |
4742 | |
4743 | SDValue A, FoldableOp; |
4744 | if ((FoldableOp = getFoldableLogicOp(N1))) { |
4745 | A = N0; |
4746 | } else if ((FoldableOp = getFoldableLogicOp(N0))) { |
4747 | A = N1; |
4748 | } else |
4749 | return false; |
4750 | |
4751 | SDValue B = FoldableOp.getOperand(i: 0); |
4752 | SDValue C = FoldableOp.getOperand(i: 1); |
4753 | SDNode *ParentA = N; |
4754 | SDNode *ParentB = FoldableOp.getNode(); |
4755 | SDNode *ParentC = FoldableOp.getNode(); |
4756 | |
4757 | // We can build the appropriate control immediate by performing the logic |
4758 | // operation we're matching using these constants for A, B, and C. |
4759 | uint8_t TernlogMagicA = 0xf0; |
4760 | uint8_t TernlogMagicB = 0xcc; |
4761 | uint8_t TernlogMagicC = 0xaa; |
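  // For example, matching A & (B | C) builds the immediate as
  // 0xf0 & (0xcc | 0xaa) == 0xe0.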
4762 | |
4763 | // Some of the inputs may be inverted, peek through them and invert the |
4764 | // magic values accordingly. |
4765 | // TODO: There may be a bitcast before the xor that we should peek through. |
4766 | auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { |
4767 | if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && |
4768 | ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) { |
4769 | Magic = ~Magic; |
4770 | Parent = Op.getNode(); |
4771 | Op = Op.getOperand(i: 0); |
4772 | } |
4773 | }; |
4774 | |
4775 | PeekThroughNot(A, ParentA, TernlogMagicA); |
4776 | PeekThroughNot(B, ParentB, TernlogMagicB); |
4777 | PeekThroughNot(C, ParentC, TernlogMagicC); |
4778 | |
4779 | uint8_t Imm; |
4780 | switch (FoldableOp.getOpcode()) { |
4781 | default: llvm_unreachable("Unexpected opcode!" ); |
4782 | case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; |
4783 | case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; |
4784 | case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; |
4785 | case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; |
4786 | } |
4787 | |
4788 | switch (N->getOpcode()) { |
4789 | default: llvm_unreachable("Unexpected opcode!" ); |
4790 | case X86ISD::ANDNP: |
4791 | if (A == N0) |
4792 | Imm &= ~TernlogMagicA; |
4793 | else |
4794 | Imm = ~(Imm) & TernlogMagicA; |
4795 | break; |
4796 | case ISD::AND: Imm &= TernlogMagicA; break; |
4797 | case ISD::OR: Imm |= TernlogMagicA; break; |
4798 | case ISD::XOR: Imm ^= TernlogMagicA; break; |
4799 | } |
4800 | |
4801 | return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm); |
4802 | } |
4803 | |
4804 | /// If the high bits of an 'and' operand are known zero, try setting the |
4805 | /// high bits of an 'and' constant operand to produce a smaller encoding by |
4806 | /// creating a small, sign-extended negative immediate rather than a large |
4807 | /// positive one. This reverses a transform in SimplifyDemandedBits that |
4808 | /// shrinks mask constants by clearing bits. There is also a possibility that |
4809 | /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that |
4810 | /// case, just replace the 'and'. Return 'true' if the node is replaced. |
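/// For example, masking a value whose bits above bit 7 are known zero with
/// 0x000000F0 can instead use the sign-extended mask -16 (0xFFFFFFF0),
/// shrinking the immediate from four bytes to one.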
4811 | bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { |
4812 | // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't |
4813 | // have immediate operands. |
4814 | MVT VT = And->getSimpleValueType(ResNo: 0); |
4815 | if (VT != MVT::i32 && VT != MVT::i64) |
4816 | return false; |
4817 | |
4818 | auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1)); |
4819 | if (!And1C) |
4820 | return false; |
4821 | |
4822 |   // Bail out if the mask constant is already negative. It can't shrink any more.
4823 | // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel |
4824 | // patterns to use a 32-bit and instead of a 64-bit and by relying on the |
4825 | // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits |
4826 | // are negative too. |
4827 | APInt MaskVal = And1C->getAPIntValue(); |
4828 | unsigned MaskLZ = MaskVal.countl_zero(); |
4829 | if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) |
4830 | return false; |
4831 | |
4832 | // Don't extend into the upper 32 bits of a 64 bit mask. |
4833 | if (VT == MVT::i64 && MaskLZ >= 32) { |
4834 | MaskLZ -= 32; |
4835 | MaskVal = MaskVal.trunc(width: 32); |
4836 | } |
4837 | |
4838 | SDValue And0 = And->getOperand(Num: 0); |
4839 | APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ); |
4840 | APInt NegMaskVal = MaskVal | HighZeros; |
4841 | |
4842 | // If a negative constant would not allow a smaller encoding, there's no need |
4843 | // to continue. Only change the constant when we know it's a win. |
4844 | unsigned MinWidth = NegMaskVal.getSignificantBits(); |
4845 | if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32)) |
4846 | return false; |
4847 | |
4848 | // Extend masks if we truncated above. |
4849 | if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { |
4850 | NegMaskVal = NegMaskVal.zext(width: 64); |
4851 | HighZeros = HighZeros.zext(width: 64); |
4852 | } |
4853 | |
4854 | // The variable operand must be all zeros in the top bits to allow using the |
4855 | // new, negative constant as the mask. |
4856 | // TODO: Handle constant folding? |
4857 | KnownBits Known0 = CurDAG->computeKnownBits(Op: And0); |
4858 | if (Known0.isConstant() || !HighZeros.isSubsetOf(RHS: Known0.Zero)) |
4859 | return false; |
4860 | |
4861 | // Check if the mask is -1. In that case, this is an unnecessary instruction |
4862 | // that escaped earlier analysis. |
4863 | if (NegMaskVal.isAllOnes()) { |
4864 | ReplaceNode(F: And, T: And0.getNode()); |
4865 | return true; |
4866 | } |
4867 | |
4868 | // A negative mask allows a smaller encoding. Create a new 'and' node. |
4869 | SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT); |
4870 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask); |
4871 | SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask); |
4872 | ReplaceNode(F: And, T: NewAnd.getNode()); |
4873 | SelectCode(N: NewAnd.getNode()); |
4874 | return true; |
4875 | } |
4876 | |
4877 | static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, |
4878 | bool FoldedBCast, bool Masked) { |
4879 | #define VPTESTM_CASE(VT, SUFFIX) \ |
4880 | case MVT::VT: \ |
4881 | if (Masked) \ |
4882 | return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ |
4883 | return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; |
4884 | |
4885 | |
4886 | #define VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4887 | default: llvm_unreachable("Unexpected VT!"); \ |
4888 | VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ |
4889 | VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ |
4890 | VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ |
4891 | VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ |
4892 | VPTESTM_CASE(v16i32, DZ##SUFFIX) \ |
4893 | VPTESTM_CASE(v8i64, QZ##SUFFIX) |
4894 | |
4895 | #define VPTESTM_FULL_CASES(SUFFIX) \ |
4896 | VPTESTM_BROADCAST_CASES(SUFFIX) \ |
4897 | VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ |
4898 | VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ |
4899 | VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ |
4900 | VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ |
4901 | VPTESTM_CASE(v64i8, BZ##SUFFIX) \ |
4902 | VPTESTM_CASE(v32i16, WZ##SUFFIX) |
4903 | |
4904 | if (FoldedBCast) { |
4905 | switch (TestVT.SimpleTy) { |
4906 | VPTESTM_BROADCAST_CASES(rmb) |
4907 | } |
4908 | } |
4909 | |
4910 | if (FoldedLoad) { |
4911 | switch (TestVT.SimpleTy) { |
4912 | VPTESTM_FULL_CASES(rm) |
4913 | } |
4914 | } |
4915 | |
4916 | switch (TestVT.SimpleTy) { |
4917 | VPTESTM_FULL_CASES(rr) |
4918 | } |
4919 | |
4920 | #undef VPTESTM_FULL_CASES |
4921 | #undef VPTESTM_BROADCAST_CASES |
4922 | #undef VPTESTM_CASE |
4923 | } |
4924 | |
4925 | // Try to create VPTESTM instruction. If InMask is not null, it will be used |
4926 | // to form a masked operation. |
4927 | bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, |
4928 | SDValue InMask) { |
4929 | assert(Subtarget->hasAVX512() && "Expected AVX512!" ); |
4930 | assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && |
4931 | "Unexpected VT!" ); |
4932 | |
4933 | // Look for equal and not equal compares. |
4934 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get(); |
4935 | if (CC != ISD::SETEQ && CC != ISD::SETNE) |
4936 | return false; |
4937 | |
4938 | SDValue SetccOp0 = Setcc.getOperand(i: 0); |
4939 | SDValue SetccOp1 = Setcc.getOperand(i: 1); |
4940 | |
4941 | // Canonicalize the all zero vector to the RHS. |
4942 | if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode())) |
4943 | std::swap(a&: SetccOp0, b&: SetccOp1); |
4944 | |
4945 | // See if we're comparing against zero. |
4946 | if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode())) |
4947 | return false; |
4948 | |
4949 | SDValue N0 = SetccOp0; |
4950 | |
4951 | MVT CmpVT = N0.getSimpleValueType(); |
4952 | MVT CmpSVT = CmpVT.getVectorElementType(); |
4953 | |
4954 | // Start with both operands the same. We'll try to refine this. |
4955 | SDValue Src0 = N0; |
4956 | SDValue Src1 = N0; |
4957 | |
4958 | { |
4959 | // Look through single use bitcasts. |
4960 | SDValue N0Temp = N0; |
4961 | if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) |
4962 | N0Temp = N0.getOperand(i: 0); |
4963 | |
4964 | // Look for single use AND. |
4965 | if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { |
4966 | Src0 = N0Temp.getOperand(i: 0); |
4967 | Src1 = N0Temp.getOperand(i: 1); |
4968 | } |
4969 | } |
4970 | |
4971 | // Without VLX we need to widen the operation. |
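  // For example, a v4i32 compare gets widened to v16i32 so the 512-bit
  // VPTESTM form can be used, and the v16i1 result is shrunk back to v4i1
  // afterwards.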
4972 | bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); |
4973 | |
4974 | auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, |
4975 | SDValue &Base, SDValue &Scale, SDValue &Index, |
4976 | SDValue &Disp, SDValue &Segment) { |
4977 | // If we need to widen, we can't fold the load. |
4978 | if (!Widen) |
4979 | if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment)) |
4980 | return true; |
4981 | |
4982 | // If we didn't fold a load, try to match broadcast. No widening limitation |
4983 | // for this. But only 32 and 64 bit types are supported. |
4984 | if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) |
4985 | return false; |
4986 | |
4987 | // Look through single use bitcasts. |
4988 | if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { |
4989 | P = L.getNode(); |
4990 | L = L.getOperand(i: 0); |
4991 | } |
4992 | |
4993 | if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) |
4994 | return false; |
4995 | |
4996 | auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L); |
4997 | if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) |
4998 | return false; |
4999 | |
5000 | return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment); |
5001 | }; |
5002 | |
5003 | // We can only fold loads if the sources are unique. |
5004 | bool CanFoldLoads = Src0 != Src1; |
5005 | |
5006 | bool FoldedLoad = false; |
5007 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5008 | if (CanFoldLoads) { |
5009 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, |
5010 | Tmp3, Tmp4); |
5011 | if (!FoldedLoad) { |
5012 | // And is commutative. |
5013 | FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, |
5014 | Tmp2, Tmp3, Tmp4); |
5015 | if (FoldedLoad) |
5016 | std::swap(a&: Src0, b&: Src1); |
5017 | } |
5018 | } |
5019 | |
5020 | bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; |
5021 | |
5022 | bool IsMasked = InMask.getNode() != nullptr; |
5023 | |
5024 | SDLoc dl(Root); |
5025 | |
5026 | MVT ResVT = Setcc.getSimpleValueType(); |
5027 | MVT MaskVT = ResVT; |
5028 | if (Widen) { |
5029 | // Widen the inputs using insert_subreg or copy_to_regclass. |
5030 | unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; |
5031 | unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; |
5032 | unsigned NumElts = CmpVT.getVectorNumElements() * Scale; |
5033 | CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts); |
5034 | MaskVT = MVT::getVectorVT(VT: MVT::i1, NumElements: NumElts); |
5035 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(Opcode: X86::IMPLICIT_DEF, dl, |
5036 | VT: CmpVT), 0); |
5037 | Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0); |
5038 | |
5039 | if (!FoldedBCast) |
5040 | Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1); |
5041 | |
5042 | if (IsMasked) { |
5043 | // Widen the mask. |
5044 | unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID(); |
5045 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
5046 | InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
5047 | dl, VT: MaskVT, Op1: InMask, Op2: RC), 0); |
5048 | } |
5049 | } |
5050 | |
5051 | bool IsTestN = CC == ISD::SETEQ; |
5052 | unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast, |
5053 | Masked: IsMasked); |
5054 | |
5055 | MachineSDNode *CNode; |
5056 | if (FoldedLoad) { |
5057 | SDVTList VTs = CurDAG->getVTList(VT1: MaskVT, VT2: MVT::Other); |
5058 | |
5059 | if (IsMasked) { |
5060 | SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
5061 | Src1.getOperand(i: 0) }; |
5062 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5063 | } else { |
5064 | SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, |
5065 | Src1.getOperand(i: 0) }; |
5066 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5067 | } |
5068 | |
5069 | // Update the chain. |
5070 | ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1)); |
5071 | // Record the mem-refs |
5072 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()}); |
5073 | } else { |
5074 | if (IsMasked) |
5075 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1); |
5076 | else |
5077 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1); |
5078 | } |
5079 | |
5080 | // If we widened, we need to shrink the mask VT. |
5081 | if (Widen) { |
5082 | unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID(); |
5083 | SDValue RC = CurDAG->getTargetConstant(Val: RegClass, DL: dl, VT: MVT::i32); |
5084 | CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS, |
5085 | dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC); |
5086 | } |
5087 | |
5088 | ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0)); |
5089 | CurDAG->RemoveDeadNode(N: Root); |
5090 | return true; |
5091 | } |
5092 | |
5093 | // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it |
5094 | // into vpternlog. |
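// The 0xCA immediate used below encodes exactly that select: with the usual
// ternlog magic values, (0xf0 & 0xcc) | (~0xf0 & 0xaa) == 0xCA.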
5095 | bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { |
5096 | assert(N->getOpcode() == ISD::OR && "Unexpected opcode!" ); |
5097 | |
5098 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
5099 | |
5100 | // Make sure we support VPTERNLOG. |
5101 | if (!NVT.isVector() || !Subtarget->hasAVX512()) |
5102 | return false; |
5103 | |
5104 | // We need VLX for 128/256-bit. |
5105 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
5106 | return false; |
5107 | |
5108 | SDValue N0 = N->getOperand(Num: 0); |
5109 | SDValue N1 = N->getOperand(Num: 1); |
5110 | |
5111 | // Canonicalize AND to LHS. |
5112 | if (N1.getOpcode() == ISD::AND) |
5113 | std::swap(a&: N0, b&: N1); |
5114 | |
5115 | if (N0.getOpcode() != ISD::AND || |
5116 | N1.getOpcode() != X86ISD::ANDNP || |
5117 | !N0.hasOneUse() || !N1.hasOneUse()) |
5118 | return false; |
5119 | |
5120 |   // ANDN is not commutable, so use it to pick out A and C.
5121 | SDValue A = N1.getOperand(i: 0); |
5122 | SDValue C = N1.getOperand(i: 1); |
5123 | |
5124 | // AND is commutable, if one operand matches A, the other operand is B. |
5125 | // Otherwise this isn't a match. |
5126 | SDValue B; |
5127 | if (N0.getOperand(i: 0) == A) |
5128 | B = N0.getOperand(i: 1); |
5129 | else if (N0.getOperand(i: 1) == A) |
5130 | B = N0.getOperand(i: 0); |
5131 | else |
5132 | return false; |
5133 | |
5134 | SDLoc dl(N); |
5135 | SDValue Imm = CurDAG->getTargetConstant(Val: 0xCA, DL: dl, VT: MVT::i8); |
5136 | SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm); |
5137 | ReplaceNode(F: N, T: Ternlog.getNode()); |
5138 | |
5139 | return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(), |
5140 | ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA); |
5141 | } |
5142 | |
5143 | void X86DAGToDAGISel::Select(SDNode *Node) { |
5144 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
5145 | unsigned Opcode = Node->getOpcode(); |
5146 | SDLoc dl(Node); |
5147 | |
5148 | if (Node->isMachineOpcode()) { |
5149 | LLVM_DEBUG(dbgs() << "== " ; Node->dump(CurDAG); dbgs() << '\n'); |
5150 | Node->setNodeId(-1); |
5151 | return; // Already selected. |
5152 | } |
5153 | |
5154 | switch (Opcode) { |
5155 | default: break; |
5156 | case ISD::INTRINSIC_W_CHAIN: { |
5157 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5158 | switch (IntNo) { |
5159 | default: break; |
5160 | case Intrinsic::x86_encodekey128: |
5161 | case Intrinsic::x86_encodekey256: { |
5162 | if (!Subtarget->hasKL()) |
5163 | break; |
5164 | |
5165 | unsigned Opcode; |
5166 | switch (IntNo) { |
5167 | default: llvm_unreachable("Impossible intrinsic" ); |
5168 | case Intrinsic::x86_encodekey128: |
5169 | Opcode = X86::ENCODEKEY128; |
5170 | break; |
5171 | case Intrinsic::x86_encodekey256: |
5172 | Opcode = X86::ENCODEKEY256; |
5173 | break; |
5174 | } |
5175 | |
5176 | SDValue Chain = Node->getOperand(Num: 0); |
5177 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM0, N: Node->getOperand(Num: 3), |
5178 | Glue: SDValue()); |
5179 | if (Opcode == X86::ENCODEKEY256) |
5180 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::XMM1, N: Node->getOperand(Num: 4), |
5181 | Glue: Chain.getValue(R: 1)); |
5182 | |
5183 | MachineSDNode *Res = CurDAG->getMachineNode( |
5184 | Opcode, dl, VTs: Node->getVTList(), |
5185 | Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)}); |
5186 | ReplaceNode(F: Node, T: Res); |
5187 | return; |
5188 | } |
5189 | case Intrinsic::x86_tileloaddrs64_internal: |
5190 | case Intrinsic::x86_tileloaddrst164_internal: |
5191 | if (!Subtarget->hasAMXMOVRS()) |
5192 | break; |
5193 | [[fallthrough]]; |
5194 | case Intrinsic::x86_tileloadd64_internal: |
5195 | case Intrinsic::x86_tileloaddt164_internal: { |
5196 | if (!Subtarget->hasAMXTILE()) |
5197 | break; |
5198 | auto *MFI = |
5199 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5200 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5201 | unsigned Opc; |
5202 | switch (IntNo) { |
5203 | default: |
5204 | llvm_unreachable("Unexpected intrinsic!" ); |
5205 | case Intrinsic::x86_tileloaddrs64_internal: |
5206 | Opc = X86::PTILELOADDRSV; |
5207 | break; |
5208 | case Intrinsic::x86_tileloaddrst164_internal: |
5209 | Opc = X86::PTILELOADDRST1V; |
5210 | break; |
5211 | case Intrinsic::x86_tileloadd64_internal: |
5212 | Opc = X86::PTILELOADDV; |
5213 | break; |
5214 | case Intrinsic::x86_tileloaddt164_internal: |
5215 | Opc = X86::PTILELOADDT1V; |
5216 | break; |
5217 | } |
5218 | // _tile_loadd_internal(row, col, buf, STRIDE) |
5219 | SDValue Base = Node->getOperand(Num: 4); |
5220 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5221 | SDValue Index = Node->getOperand(Num: 5); |
5222 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5223 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5224 | SDValue Chain = Node->getOperand(Num: 0); |
5225 | MachineSDNode *CNode; |
5226 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5227 | Node->getOperand(Num: 3), |
5228 | Base, |
5229 | Scale, |
5230 | Index, |
5231 | Disp, |
5232 | Segment, |
5233 | Chain}; |
5234 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, ResultTys: {MVT::x86amx, MVT::Other}, Ops); |
5235 | ReplaceNode(F: Node, T: CNode); |
5236 | return; |
5237 | } |
5238 | } |
5239 | break; |
5240 | } |
5241 | case ISD::INTRINSIC_VOID: { |
5242 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5243 | switch (IntNo) { |
5244 | default: break; |
5245 | case Intrinsic::x86_sse3_monitor: |
5246 | case Intrinsic::x86_monitorx: |
5247 | case Intrinsic::x86_clzero: { |
5248 | bool Use64BitPtr = Node->getOperand(Num: 2).getValueType() == MVT::i64; |
5249 | |
5250 | unsigned Opc = 0; |
5251 | switch (IntNo) { |
5252 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5253 | case Intrinsic::x86_sse3_monitor: |
5254 | if (!Subtarget->hasSSE3()) |
5255 | break; |
5256 | Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; |
5257 | break; |
5258 | case Intrinsic::x86_monitorx: |
5259 | if (!Subtarget->hasMWAITX()) |
5260 | break; |
5261 | Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; |
5262 | break; |
5263 | case Intrinsic::x86_clzero: |
5264 | if (!Subtarget->hasCLZERO()) |
5265 | break; |
5266 | Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; |
5267 | break; |
5268 | } |
5269 | |
5270 | if (Opc) { |
5271 | unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; |
5272 | SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg, |
5273 | N: Node->getOperand(Num: 2), Glue: SDValue()); |
5274 | SDValue InGlue = Chain.getValue(R: 1); |
5275 | |
5276 | if (IntNo == Intrinsic::x86_sse3_monitor || |
5277 | IntNo == Intrinsic::x86_monitorx) { |
5278 | // Copy the other two operands to ECX and EDX. |
5279 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::ECX, N: Node->getOperand(Num: 3), |
5280 | Glue: InGlue); |
5281 | InGlue = Chain.getValue(R: 1); |
5282 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::EDX, N: Node->getOperand(Num: 4), |
5283 | Glue: InGlue); |
5284 | InGlue = Chain.getValue(R: 1); |
5285 | } |
5286 | |
5287 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, |
5288 | Ops: { Chain, InGlue}); |
5289 | ReplaceNode(F: Node, T: CNode); |
5290 | return; |
5291 | } |
5292 | |
5293 | break; |
5294 | } |
5295 | case Intrinsic::x86_tilestored64_internal: { |
5296 | auto *MFI = |
5297 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5298 | MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); |
5299 | unsigned Opc = X86::PTILESTOREDV; |
5300 | // _tile_stored_internal(row, col, buf, STRIDE, c) |
5301 | SDValue Base = Node->getOperand(Num: 4); |
5302 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5303 | SDValue Index = Node->getOperand(Num: 5); |
5304 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5305 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5306 | SDValue Chain = Node->getOperand(Num: 0); |
5307 | MachineSDNode *CNode; |
5308 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5309 | Node->getOperand(Num: 3), |
5310 | Base, |
5311 | Scale, |
5312 | Index, |
5313 | Disp, |
5314 | Segment, |
5315 | Node->getOperand(Num: 6), |
5316 | Chain}; |
5317 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5318 | ReplaceNode(F: Node, T: CNode); |
5319 | return; |
5320 | } |
5321 | case Intrinsic::x86_tileloaddrs64: |
5322 | case Intrinsic::x86_tileloaddrst164: |
5323 | if (!Subtarget->hasAMXMOVRS()) |
5324 | break; |
5325 | [[fallthrough]]; |
5326 | case Intrinsic::x86_tileloadd64: |
5327 | case Intrinsic::x86_tileloaddt164: |
5328 | case Intrinsic::x86_tilestored64: { |
5329 | if (!Subtarget->hasAMXTILE()) |
5330 | break; |
5331 | auto *MFI = |
5332 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5333 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
5334 | unsigned Opc; |
5335 | switch (IntNo) { |
5336 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5337 | case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; |
5338 | case Intrinsic::x86_tileloaddrs64: |
5339 | Opc = X86::PTILELOADDRS; |
5340 | break; |
5341 | case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; |
5342 | case Intrinsic::x86_tileloaddrst164: |
5343 | Opc = X86::PTILELOADDRST1; |
5344 | break; |
5345 | case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; |
5346 | } |
5347 | // FIXME: Match displacement and scale. |
5348 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5349 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5350 | SDValue Base = Node->getOperand(Num: 3); |
5351 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5352 | SDValue Index = Node->getOperand(Num: 4); |
5353 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5354 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5355 | SDValue Chain = Node->getOperand(Num: 0); |
5356 | MachineSDNode *CNode; |
5357 | if (Opc == X86::PTILESTORED) { |
5358 | SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; |
5359 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5360 | } else { |
5361 | SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; |
5362 | CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5363 | } |
5364 | ReplaceNode(F: Node, T: CNode); |
5365 | return; |
5366 | } |
5367 | case Intrinsic::x86_t2rpntlvwz0rs: |
5368 | case Intrinsic::x86_t2rpntlvwz0rst1: |
5369 | case Intrinsic::x86_t2rpntlvwz1rs: |
5370 | case Intrinsic::x86_t2rpntlvwz1rst1: |
5371 | if (!Subtarget->hasAMXMOVRS()) |
5372 | break; |
5373 | [[fallthrough]]; |
5374 | case Intrinsic::x86_t2rpntlvwz0: |
5375 | case Intrinsic::x86_t2rpntlvwz0t1: |
5376 | case Intrinsic::x86_t2rpntlvwz1: |
5377 | case Intrinsic::x86_t2rpntlvwz1t1: { |
5378 | if (!Subtarget->hasAMXTRANSPOSE()) |
5379 | break; |
5380 | auto *MFI = |
5381 | CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
5382 | MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); |
5383 | unsigned Opc; |
5384 | switch (IntNo) { |
5385 | default: |
5386 | llvm_unreachable("Unexpected intrinsic!" ); |
5387 | case Intrinsic::x86_t2rpntlvwz0: |
5388 | Opc = X86::PT2RPNTLVWZ0; |
5389 | break; |
5390 | case Intrinsic::x86_t2rpntlvwz0t1: |
5391 | Opc = X86::PT2RPNTLVWZ0T1; |
5392 | break; |
5393 | case Intrinsic::x86_t2rpntlvwz1: |
5394 | Opc = X86::PT2RPNTLVWZ1; |
5395 | break; |
5396 | case Intrinsic::x86_t2rpntlvwz1t1: |
5397 | Opc = X86::PT2RPNTLVWZ1T1; |
5398 | break; |
5399 | case Intrinsic::x86_t2rpntlvwz0rs: |
5400 | Opc = X86::PT2RPNTLVWZ0RS; |
5401 | break; |
5402 | case Intrinsic::x86_t2rpntlvwz0rst1: |
5403 | Opc = X86::PT2RPNTLVWZ0RST1; |
5404 | break; |
5405 | case Intrinsic::x86_t2rpntlvwz1rs: |
5406 | Opc = X86::PT2RPNTLVWZ1RS; |
5407 | break; |
5408 | case Intrinsic::x86_t2rpntlvwz1rst1: |
5409 | Opc = X86::PT2RPNTLVWZ1RST1; |
5410 | break; |
5411 | } |
5412 | // FIXME: Match displacement and scale. |
5413 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5414 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5415 | SDValue Base = Node->getOperand(Num: 3); |
5416 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5417 | SDValue Index = Node->getOperand(Num: 4); |
5418 | SDValue Disp = CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i32); |
5419 | SDValue Segment = CurDAG->getRegister(Reg: 0, VT: MVT::i16); |
5420 | SDValue Chain = Node->getOperand(Num: 0); |
5421 | SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; |
5422 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Other, Ops); |
5423 | ReplaceNode(F: Node, T: CNode); |
5424 | return; |
5425 | } |
5426 | } |
5427 | break; |
5428 | } |
5429 | case ISD::BRIND: |
5430 | case X86ISD::NT_BRIND: { |
5431 | if (Subtarget->isTargetNaCl()) |
// NaCl has its own pass where jmp %r32 instructions are converted to jmp
// %r64. We leave the instruction alone.
5434 | break; |
5435 | if (Subtarget->isTarget64BitILP32()) { |
5436 | // Converts a 32-bit register to a 64-bit, zero-extended version of |
5437 | // it. This is needed because x86-64 can do many things, but jmp %r32 |
5438 | // ain't one of them. |
5439 | SDValue Target = Node->getOperand(Num: 1); |
5440 | assert(Target.getValueType() == MVT::i32 && "Unexpected VT!" ); |
5441 | SDValue ZextTarget = CurDAG->getZExtOrTrunc(Op: Target, DL: dl, VT: MVT::i64); |
5442 | SDValue Brind = CurDAG->getNode(Opcode, DL: dl, VT: MVT::Other, |
5443 | N1: Node->getOperand(Num: 0), N2: ZextTarget); |
5444 | ReplaceNode(F: Node, T: Brind.getNode()); |
5445 | SelectCode(N: ZextTarget.getNode()); |
5446 | SelectCode(N: Brind.getNode()); |
5447 | return; |
5448 | } |
5449 | break; |
5450 | } |
5451 | case X86ISD::GlobalBaseReg: |
5452 | ReplaceNode(F: Node, T: getGlobalBaseReg()); |
5453 | return; |
5454 | |
5455 | case ISD::BITCAST: |
5456 | // Just drop all 128/256/512-bit bitcasts. |
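// Same-width vector (and f128) bitcasts are free at the register level, so
// forward the source operand and delete the node.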
5457 | if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || |
5458 | NVT == MVT::f128) { |
5459 | ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0)); |
5460 | CurDAG->RemoveDeadNode(N: Node); |
5461 | return; |
5462 | } |
5463 | break; |
5464 | |
5465 | case ISD::SRL: |
5466 | if (matchBitExtract(Node)) |
5467 | return; |
5468 | [[fallthrough]]; |
5469 | case ISD::SRA: |
5470 | case ISD::SHL: |
5471 | if (tryShiftAmountMod(N: Node)) |
5472 | return; |
5473 | break; |
5474 | |
5475 | case X86ISD::VPTERNLOG: { |
5476 | uint8_t Imm = Node->getConstantOperandVal(Num: 3); |
5477 | if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0), |
5478 | B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm)) |
5479 | return; |
5480 | break; |
5481 | } |
5482 | |
5483 | case X86ISD::ANDNP: |
5484 | if (tryVPTERNLOG(N: Node)) |
5485 | return; |
5486 | break; |
5487 | |
5488 | case ISD::AND: |
5489 | if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { |
5490 | // Try to form a masked VPTESTM. Operands can be in either order. |
5491 | SDValue N0 = Node->getOperand(Num: 0); |
5492 | SDValue N1 = Node->getOperand(Num: 1); |
5493 | if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && |
5494 | tryVPTESTM(Root: Node, Setcc: N0, InMask: N1)) |
5495 | return; |
5496 | if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && |
5497 | tryVPTESTM(Root: Node, Setcc: N1, InMask: N0)) |
5498 | return; |
5499 | } |
5500 | |
5501 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { |
5502 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
5503 | CurDAG->RemoveDeadNode(N: Node); |
5504 | return; |
5505 | } |
5506 | if (matchBitExtract(Node)) |
5507 | return; |
5508 | if (AndImmShrink && shrinkAndImmediate(And: Node)) |
5509 | return; |
5510 | |
5511 | [[fallthrough]]; |
5512 | case ISD::OR: |
5513 | case ISD::XOR: |
5514 | if (tryShrinkShlLogicImm(N: Node)) |
5515 | return; |
5516 | if (Opcode == ISD::OR && tryMatchBitSelect(N: Node)) |
5517 | return; |
5518 | if (tryVPTERNLOG(N: Node)) |
5519 | return; |
5520 | |
5521 | [[fallthrough]]; |
5522 | case ISD::ADD: |
5523 | if (Opcode == ISD::ADD && matchBitExtract(Node)) |
5524 | return; |
5525 | [[fallthrough]]; |
5526 | case ISD::SUB: { |
5527 | // Try to avoid folding immediates with multiple uses for optsize. |
5528 | // This code tries to select to register form directly to avoid going |
5529 | // through the isel table which might fold the immediate. We can't change |
// the add/sub/and/or/xor-with-immediate patterns in the tablegen files
// to check the immediate use count without making the patterns
5532 | // unavailable to the fast-isel table. |
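// E.g. under optsize, if the same 32-bit immediate feeds several nodes,
// materializing it once in a register and using the rr form avoids
// re-encoding the 4-byte immediate in every instruction.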
5533 | if (!CurDAG->shouldOptForSize()) |
5534 | break; |
5535 | |
5536 | // Only handle i8/i16/i32/i64. |
5537 | if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) |
5538 | break; |
5539 | |
5540 | SDValue N0 = Node->getOperand(Num: 0); |
5541 | SDValue N1 = Node->getOperand(Num: 1); |
5542 | |
5543 | auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1); |
5544 | if (!Cst) |
5545 | break; |
5546 | |
5547 | int64_t Val = Cst->getSExtValue(); |
5548 | |
// Make sure it's an immediate that is considered foldable.
// FIXME: Handle unsigned 32-bit immediates for 64-bit AND.
5551 | if (!isInt<8>(x: Val) && !isInt<32>(x: Val)) |
5552 | break; |
5553 | |
5554 | // If this can match to INC/DEC, let it go. |
5555 | if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) |
5556 | break; |
5557 | |
5558 | // Check if we should avoid folding this immediate. |
5559 | if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode())) |
5560 | break; |
5561 | |
5562 | // We should not fold the immediate. So we need a register form instead. |
5563 | unsigned ROpc, MOpc; |
5564 | switch (NVT.SimpleTy) { |
5565 | default: llvm_unreachable("Unexpected VT!" ); |
5566 | case MVT::i8: |
5567 | switch (Opcode) { |
5568 | default: llvm_unreachable("Unexpected opcode!" ); |
5569 | case ISD::ADD: |
5570 | ROpc = GET_ND_IF_ENABLED(X86::ADD8rr); |
5571 | MOpc = GET_ND_IF_ENABLED(X86::ADD8rm); |
5572 | break; |
5573 | case ISD::SUB: |
5574 | ROpc = GET_ND_IF_ENABLED(X86::SUB8rr); |
5575 | MOpc = GET_ND_IF_ENABLED(X86::SUB8rm); |
5576 | break; |
5577 | case ISD::AND: |
5578 | ROpc = GET_ND_IF_ENABLED(X86::AND8rr); |
5579 | MOpc = GET_ND_IF_ENABLED(X86::AND8rm); |
5580 | break; |
5581 | case ISD::OR: |
5582 | ROpc = GET_ND_IF_ENABLED(X86::OR8rr); |
5583 | MOpc = GET_ND_IF_ENABLED(X86::OR8rm); |
5584 | break; |
5585 | case ISD::XOR: |
5586 | ROpc = GET_ND_IF_ENABLED(X86::XOR8rr); |
5587 | MOpc = GET_ND_IF_ENABLED(X86::XOR8rm); |
5588 | break; |
5589 | } |
5590 | break; |
5591 | case MVT::i16: |
5592 | switch (Opcode) { |
5593 | default: llvm_unreachable("Unexpected opcode!" ); |
5594 | case ISD::ADD: |
5595 | ROpc = GET_ND_IF_ENABLED(X86::ADD16rr); |
5596 | MOpc = GET_ND_IF_ENABLED(X86::ADD16rm); |
5597 | break; |
5598 | case ISD::SUB: |
5599 | ROpc = GET_ND_IF_ENABLED(X86::SUB16rr); |
5600 | MOpc = GET_ND_IF_ENABLED(X86::SUB16rm); |
5601 | break; |
5602 | case ISD::AND: |
5603 | ROpc = GET_ND_IF_ENABLED(X86::AND16rr); |
5604 | MOpc = GET_ND_IF_ENABLED(X86::AND16rm); |
5605 | break; |
5606 | case ISD::OR: |
5607 | ROpc = GET_ND_IF_ENABLED(X86::OR16rr); |
5608 | MOpc = GET_ND_IF_ENABLED(X86::OR16rm); |
5609 | break; |
5610 | case ISD::XOR: |
5611 | ROpc = GET_ND_IF_ENABLED(X86::XOR16rr); |
5612 | MOpc = GET_ND_IF_ENABLED(X86::XOR16rm); |
5613 | break; |
5614 | } |
5615 | break; |
5616 | case MVT::i32: |
5617 | switch (Opcode) { |
5618 | default: llvm_unreachable("Unexpected opcode!" ); |
5619 | case ISD::ADD: |
5620 | ROpc = GET_ND_IF_ENABLED(X86::ADD32rr); |
5621 | MOpc = GET_ND_IF_ENABLED(X86::ADD32rm); |
5622 | break; |
5623 | case ISD::SUB: |
5624 | ROpc = GET_ND_IF_ENABLED(X86::SUB32rr); |
5625 | MOpc = GET_ND_IF_ENABLED(X86::SUB32rm); |
5626 | break; |
5627 | case ISD::AND: |
5628 | ROpc = GET_ND_IF_ENABLED(X86::AND32rr); |
5629 | MOpc = GET_ND_IF_ENABLED(X86::AND32rm); |
5630 | break; |
5631 | case ISD::OR: |
5632 | ROpc = GET_ND_IF_ENABLED(X86::OR32rr); |
5633 | MOpc = GET_ND_IF_ENABLED(X86::OR32rm); |
5634 | break; |
5635 | case ISD::XOR: |
5636 | ROpc = GET_ND_IF_ENABLED(X86::XOR32rr); |
5637 | MOpc = GET_ND_IF_ENABLED(X86::XOR32rm); |
5638 | break; |
5639 | } |
5640 | break; |
5641 | case MVT::i64: |
5642 | switch (Opcode) { |
5643 | default: llvm_unreachable("Unexpected opcode!" ); |
5644 | case ISD::ADD: |
5645 | ROpc = GET_ND_IF_ENABLED(X86::ADD64rr); |
5646 | MOpc = GET_ND_IF_ENABLED(X86::ADD64rm); |
5647 | break; |
5648 | case ISD::SUB: |
5649 | ROpc = GET_ND_IF_ENABLED(X86::SUB64rr); |
5650 | MOpc = GET_ND_IF_ENABLED(X86::SUB64rm); |
5651 | break; |
5652 | case ISD::AND: |
5653 | ROpc = GET_ND_IF_ENABLED(X86::AND64rr); |
5654 | MOpc = GET_ND_IF_ENABLED(X86::AND64rm); |
5655 | break; |
5656 | case ISD::OR: |
5657 | ROpc = GET_ND_IF_ENABLED(X86::OR64rr); |
5658 | MOpc = GET_ND_IF_ENABLED(X86::OR64rm); |
5659 | break; |
5660 | case ISD::XOR: |
5661 | ROpc = GET_ND_IF_ENABLED(X86::XOR64rr); |
5662 | MOpc = GET_ND_IF_ENABLED(X86::XOR64rm); |
5663 | break; |
5664 | } |
5665 | break; |
5666 | } |
5667 | |
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5669 | |
// If this is not a subtract, we can still try to fold a load.
5671 | if (Opcode != ISD::SUB) { |
5672 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5673 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5674 | SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5675 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5676 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5677 | // Update the chain. |
5678 | ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2)); |
5679 | // Record the mem-refs |
5680 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5681 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5682 | CurDAG->RemoveDeadNode(N: Node); |
5683 | return; |
5684 | } |
5685 | } |
5686 | |
5687 | CurDAG->SelectNodeTo(N: Node, MachineOpc: ROpc, VT1: NVT, VT2: MVT::i32, Op1: N0, Op2: N1); |
5688 | return; |
5689 | } |
5690 | |
5691 | case X86ISD::SMUL: |
5692 | // i16/i32/i64 are handled with isel patterns. |
5693 | if (NVT != MVT::i8) |
5694 | break; |
5695 | [[fallthrough]]; |
5696 | case X86ISD::UMUL: { |
5697 | SDValue N0 = Node->getOperand(Num: 0); |
5698 | SDValue N1 = Node->getOperand(Num: 1); |
5699 | |
5700 | unsigned LoReg, ROpc, MOpc; |
5701 | switch (NVT.SimpleTy) { |
5702 | default: llvm_unreachable("Unsupported VT!" ); |
5703 | case MVT::i8: |
5704 | LoReg = X86::AL; |
5705 | ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; |
5706 | MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; |
5707 | break; |
5708 | case MVT::i16: |
5709 | LoReg = X86::AX; |
5710 | ROpc = X86::MUL16r; |
5711 | MOpc = X86::MUL16m; |
5712 | break; |
5713 | case MVT::i32: |
5714 | LoReg = X86::EAX; |
5715 | ROpc = X86::MUL32r; |
5716 | MOpc = X86::MUL32m; |
5717 | break; |
5718 | case MVT::i64: |
5719 | LoReg = X86::RAX; |
5720 | ROpc = X86::MUL64r; |
5721 | MOpc = X86::MUL64m; |
5722 | break; |
5723 | } |
5724 | |
5725 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5726 | bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5727 | // Multiply is commutative. |
5728 | if (!FoldedLoad) { |
5729 | FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5730 | if (FoldedLoad) |
5731 | std::swap(a&: N0, b&: N1); |
5732 | } |
5733 | |
5734 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5735 | N: N0, Glue: SDValue()).getValue(R: 1); |
5736 | |
5737 | MachineSDNode *CNode; |
5738 | if (FoldedLoad) { |
5739 | // i16/i32/i64 use an instruction that produces a low and high result even |
5740 | // though only the low result is used. |
5741 | SDVTList VTs; |
5742 | if (NVT == MVT::i8) |
5743 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32, VT3: MVT::Other); |
5744 | else |
5745 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32, VT4: MVT::Other); |
5746 | |
5747 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5748 | InGlue }; |
5749 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5750 | |
5751 | // Update the chain. |
5752 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); |
5753 | // Record the mem-refs |
5754 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5755 | } else { |
5756 | // i16/i32/i64 use an instruction that produces a low and high result even |
5757 | // though only the low result is used. |
5758 | SDVTList VTs; |
5759 | if (NVT == MVT::i8) |
5760 | VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::i32); |
5761 | else |
5762 | VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::i32); |
5763 | |
5764 | CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue}); |
5765 | } |
5766 | |
5767 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
5768 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); |
5769 | CurDAG->RemoveDeadNode(N: Node); |
5770 | return; |
5771 | } |
5772 | |
5773 | case ISD::SMUL_LOHI: |
5774 | case ISD::UMUL_LOHI: { |
5775 | SDValue N0 = Node->getOperand(Num: 0); |
5776 | SDValue N1 = Node->getOperand(Num: 1); |
5777 | |
5778 | unsigned Opc, MOpc; |
5779 | unsigned LoReg, HiReg; |
5780 | bool IsSigned = Opcode == ISD::SMUL_LOHI; |
5781 | bool UseMULX = !IsSigned && Subtarget->hasBMI2(); |
5782 | bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); |
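// MULX (BMI2) takes one source implicitly in EDX/RDX, writes both product
// halves to explicit registers and does not touch EFLAGS; the Hi-only forms
// are used when the low half of the result is dead.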
5783 | switch (NVT.SimpleTy) { |
5784 | default: llvm_unreachable("Unsupported VT!" ); |
5785 | case MVT::i32: |
5786 | Opc = UseMULXHi ? X86::MULX32Hrr |
5787 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr) |
5788 | : IsSigned ? X86::IMUL32r |
5789 | : X86::MUL32r; |
5790 | MOpc = UseMULXHi ? X86::MULX32Hrm |
5791 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm) |
5792 | : IsSigned ? X86::IMUL32m |
5793 | : X86::MUL32m; |
5794 | LoReg = UseMULX ? X86::EDX : X86::EAX; |
5795 | HiReg = X86::EDX; |
5796 | break; |
5797 | case MVT::i64: |
5798 | Opc = UseMULXHi ? X86::MULX64Hrr |
5799 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr) |
5800 | : IsSigned ? X86::IMUL64r |
5801 | : X86::MUL64r; |
5802 | MOpc = UseMULXHi ? X86::MULX64Hrm |
5803 | : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm) |
5804 | : IsSigned ? X86::IMUL64m |
5805 | : X86::MUL64m; |
5806 | LoReg = UseMULX ? X86::RDX : X86::RAX; |
5807 | HiReg = X86::RDX; |
5808 | break; |
5809 | } |
5810 | |
5811 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5812 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5813 | // Multiply is commutative. |
5814 | if (!foldedLoad) { |
5815 | foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5816 | if (foldedLoad) |
5817 | std::swap(a&: N0, b&: N1); |
5818 | } |
5819 | |
5820 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5821 | N: N0, Glue: SDValue()).getValue(R: 1); |
5822 | SDValue ResHi, ResLo; |
5823 | if (foldedLoad) { |
5824 | SDValue Chain; |
5825 | MachineSDNode *CNode = nullptr; |
5826 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
5827 | InGlue }; |
5828 | if (UseMULXHi) { |
5829 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: MVT::Other); |
5830 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5831 | ResHi = SDValue(CNode, 0); |
5832 | Chain = SDValue(CNode, 1); |
5833 | } else if (UseMULX) { |
5834 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT, VT3: MVT::Other); |
5835 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5836 | ResHi = SDValue(CNode, 0); |
5837 | ResLo = SDValue(CNode, 1); |
5838 | Chain = SDValue(CNode, 2); |
5839 | } else { |
5840 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
5841 | CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
5842 | Chain = SDValue(CNode, 0); |
5843 | InGlue = SDValue(CNode, 1); |
5844 | } |
5845 | |
5846 | // Update the chain. |
5847 | ReplaceUses(F: N1.getValue(R: 1), T: Chain); |
5848 | // Record the mem-refs |
5849 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
5850 | } else { |
5851 | SDValue Ops[] = { N1, InGlue }; |
5852 | if (UseMULXHi) { |
5853 | SDVTList VTs = CurDAG->getVTList(VT: NVT); |
5854 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5855 | ResHi = SDValue(CNode, 0); |
5856 | } else if (UseMULX) { |
5857 | SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT); |
5858 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5859 | ResHi = SDValue(CNode, 0); |
5860 | ResLo = SDValue(CNode, 1); |
5861 | } else { |
5862 | SDVTList VTs = CurDAG->getVTList(VT: MVT::Glue); |
5863 | SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops); |
5864 | InGlue = SDValue(CNode, 0); |
5865 | } |
5866 | } |
5867 | |
5868 | // Copy the low half of the result, if it is needed. |
5869 | if (!SDValue(Node, 0).use_empty()) { |
5870 | if (!ResLo) { |
5871 | assert(LoReg && "Register for low half is not defined!" ); |
5872 | ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg, |
5873 | VT: NVT, Glue: InGlue); |
5874 | InGlue = ResLo.getValue(R: 2); |
5875 | } |
5876 | ReplaceUses(F: SDValue(Node, 0), T: ResLo); |
5877 | LLVM_DEBUG(dbgs() << "=> " ; ResLo.getNode()->dump(CurDAG); |
5878 | dbgs() << '\n'); |
5879 | } |
5880 | // Copy the high half of the result, if it is needed. |
5881 | if (!SDValue(Node, 1).use_empty()) { |
5882 | if (!ResHi) { |
5883 | assert(HiReg && "Register for high half is not defined!" ); |
5884 | ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg, |
5885 | VT: NVT, Glue: InGlue); |
5886 | InGlue = ResHi.getValue(R: 2); |
5887 | } |
5888 | ReplaceUses(F: SDValue(Node, 1), T: ResHi); |
5889 | LLVM_DEBUG(dbgs() << "=> " ; ResHi.getNode()->dump(CurDAG); |
5890 | dbgs() << '\n'); |
5891 | } |
5892 | |
5893 | CurDAG->RemoveDeadNode(N: Node); |
5894 | return; |
5895 | } |
5896 | |
5897 | case ISD::SDIVREM: |
5898 | case ISD::UDIVREM: { |
5899 | SDValue N0 = Node->getOperand(Num: 0); |
5900 | SDValue N1 = Node->getOperand(Num: 1); |
5901 | |
5902 | unsigned ROpc, MOpc; |
5903 | bool isSigned = Opcode == ISD::SDIVREM; |
5904 | if (!isSigned) { |
5905 | switch (NVT.SimpleTy) { |
5906 | default: llvm_unreachable("Unsupported VT!" ); |
5907 | case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; |
5908 | case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; |
5909 | case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; |
5910 | case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; |
5911 | } |
5912 | } else { |
5913 | switch (NVT.SimpleTy) { |
5914 | default: llvm_unreachable("Unsupported VT!" ); |
5915 | case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; |
5916 | case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; |
5917 | case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; |
5918 | case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; |
5919 | } |
5920 | } |
5921 | |
5922 | unsigned LoReg, HiReg, ClrReg; |
5923 | unsigned SExtOpcode; |
5924 | switch (NVT.SimpleTy) { |
5925 | default: llvm_unreachable("Unsupported VT!" ); |
5926 | case MVT::i8: |
5927 | LoReg = X86::AL; ClrReg = HiReg = X86::AH; |
5928 | SExtOpcode = 0; // Not used. |
5929 | break; |
5930 | case MVT::i16: |
5931 | LoReg = X86::AX; HiReg = X86::DX; |
5932 | ClrReg = X86::DX; |
5933 | SExtOpcode = X86::CWD; |
5934 | break; |
5935 | case MVT::i32: |
5936 | LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; |
5937 | SExtOpcode = X86::CDQ; |
5938 | break; |
5939 | case MVT::i64: |
5940 | LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; |
5941 | SExtOpcode = X86::CQO; |
5942 | break; |
5943 | } |
5944 | |
5945 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
5946 | bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4); |
5947 | bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0); |
5948 | |
5949 | SDValue InGlue; |
5950 | if (NVT == MVT::i8) { |
// Special case for div8: just use a sign- or zero-extending move into AX
// so the upper 8 bits (AH) hold the correct high part of the dividend.
5953 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; |
5954 | MachineSDNode *Move; |
5955 | if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
5956 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) }; |
5957 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 |
5958 | : X86::MOVZX16rm8; |
5959 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT1: MVT::i16, VT2: MVT::Other, Ops); |
5960 | Chain = SDValue(Move, 1); |
5961 | ReplaceUses(F: N0.getValue(R: 1), T: Chain); |
5962 | // Record the mem-refs |
5963 | CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()}); |
5964 | } else { |
5965 | unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 |
5966 | : X86::MOVZX16rr8; |
5967 | Move = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::i16, Op1: N0); |
5968 | Chain = CurDAG->getEntryNode(); |
5969 | } |
5970 | Chain = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AX, N: SDValue(Move, 0), |
5971 | Glue: SDValue()); |
5972 | InGlue = Chain.getValue(R: 1); |
5973 | } else { |
5974 | InGlue = |
5975 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, |
5976 | Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1); |
5977 | if (isSigned && !signBitIsZero) { |
5978 | // Sign extend the low part into the high part. |
5979 | InGlue = |
5980 | SDValue(CurDAG->getMachineNode(Opcode: SExtOpcode, dl, VT: MVT::Glue, Op1: InGlue),0); |
5981 | } else { |
5982 | // Zero out the high part, effectively zero extending the input. |
5983 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i32); |
5984 | SDValue ClrNode = |
5985 | SDValue(CurDAG->getMachineNode(Opcode: X86::MOV32r0, dl, VTs, Ops: {}), 0); |
5986 | switch (NVT.SimpleTy) { |
5987 | case MVT::i16: |
5988 | ClrNode = |
5989 | SDValue(CurDAG->getMachineNode( |
5990 | Opcode: TargetOpcode::EXTRACT_SUBREG, dl, VT: MVT::i16, Op1: ClrNode, |
5991 | Op2: CurDAG->getTargetConstant(Val: X86::sub_16bit, DL: dl, |
5992 | VT: MVT::i32)), |
5993 | 0); |
5994 | break; |
5995 | case MVT::i32: |
5996 | break; |
5997 | case MVT::i64: |
5998 | ClrNode = |
5999 | SDValue(CurDAG->getMachineNode( |
6000 | Opcode: TargetOpcode::SUBREG_TO_REG, dl, VT: MVT::i64, |
6001 | Op1: CurDAG->getTargetConstant(Val: 0, DL: dl, VT: MVT::i64), Op2: ClrNode, |
6002 | Op3: CurDAG->getTargetConstant(Val: X86::sub_32bit, DL: dl, |
6003 | VT: MVT::i32)), |
6004 | 0); |
6005 | break; |
6006 | default: |
6007 | llvm_unreachable("Unexpected division source" ); |
6008 | } |
6009 | |
6010 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg, |
6011 | N: ClrNode, Glue: InGlue).getValue(R: 1); |
6012 | } |
6013 | } |
6014 | |
6015 | if (foldedLoad) { |
6016 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0), |
6017 | InGlue }; |
6018 | MachineSDNode *CNode = |
6019 | CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::Other, VT2: MVT::Glue, Ops); |
6020 | InGlue = SDValue(CNode, 1); |
6021 | // Update the chain. |
6022 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0)); |
6023 | // Record the mem-refs |
6024 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
6025 | } else { |
6026 | InGlue = |
6027 | SDValue(CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::Glue, Op1: N1, Op2: InGlue), 0); |
6028 | } |
6029 | |
6030 | // Prevent use of AH in a REX instruction by explicitly copying it to |
6031 | // an ABCD_L register. |
6032 | // |
6033 | // The current assumption of the register allocator is that isel |
6034 | // won't generate explicit references to the GR8_ABCD_H registers. If |
6035 | // the allocator and/or the backend get enhanced to be more robust in |
6036 | // that regard, this can be, and should be, removed. |
6037 | if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { |
6038 | SDValue AHCopy = CurDAG->getRegister(Reg: X86::AH, VT: MVT::i8); |
6039 | unsigned AHExtOpcode = |
6040 | isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; |
6041 | |
6042 | SDNode *RNode = CurDAG->getMachineNode(Opcode: AHExtOpcode, dl, VT1: MVT::i32, |
6043 | VT2: MVT::Glue, Op1: AHCopy, Op2: InGlue); |
6044 | SDValue Result(RNode, 0); |
6045 | InGlue = SDValue(RNode, 1); |
6046 | |
6047 | Result = |
6048 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit, DL: dl, VT: MVT::i8, Operand: Result); |
6049 | |
6050 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
6051 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
6052 | dbgs() << '\n'); |
6053 | } |
6054 | // Copy the division (low) result, if it is needed. |
6055 | if (!SDValue(Node, 0).use_empty()) { |
6056 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
6057 | Reg: LoReg, VT: NVT, Glue: InGlue); |
6058 | InGlue = Result.getValue(R: 2); |
6059 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6060 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
6061 | dbgs() << '\n'); |
6062 | } |
6063 | // Copy the remainder (high) result, if it is needed. |
6064 | if (!SDValue(Node, 1).use_empty()) { |
6065 | SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, |
6066 | Reg: HiReg, VT: NVT, Glue: InGlue); |
6067 | InGlue = Result.getValue(R: 2); |
6068 | ReplaceUses(F: SDValue(Node, 1), T: Result); |
6069 | LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG); |
6070 | dbgs() << '\n'); |
6071 | } |
6072 | CurDAG->RemoveDeadNode(N: Node); |
6073 | return; |
6074 | } |
6075 | |
6076 | case X86ISD::FCMP: |
6077 | case X86ISD::STRICT_FCMP: |
6078 | case X86ISD::STRICT_FCMPS: { |
6079 | bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || |
6080 | Node->getOpcode() == X86ISD::STRICT_FCMPS; |
6081 | SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0); |
6082 | SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1); |
6083 | |
6084 | // Save the original VT of the compare. |
6085 | MVT CmpVT = N0.getSimpleValueType(); |
6086 | |
6087 | // Floating point needs special handling if we don't have FCOMI. |
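// In that case the condition codes live in the x87 status word: compare,
// store FPSW to AX with FNSTSW, then move AH into EFLAGS with SAHF.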
6088 | if (Subtarget->canUseCMOV()) |
6089 | break; |
6090 | |
6091 | bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; |
6092 | |
6093 | unsigned Opc; |
6094 | switch (CmpVT.SimpleTy) { |
6095 | default: llvm_unreachable("Unexpected type!" ); |
6096 | case MVT::f32: |
6097 | Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; |
6098 | break; |
6099 | case MVT::f64: |
6100 | Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; |
6101 | break; |
6102 | case MVT::f80: |
6103 | Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; |
6104 | break; |
6105 | } |
6106 | |
6107 | SDValue Chain = |
6108 | IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode(); |
6109 | SDValue Glue; |
6110 | if (IsStrictCmp) { |
6111 | SDVTList VTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
6112 | Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0); |
6113 | Glue = Chain.getValue(R: 1); |
6114 | } else { |
6115 | Glue = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VT: MVT::Glue, Op1: N0, Op2: N1), 0); |
6116 | } |
6117 | |
6118 | // Move FPSW to AX. |
6119 | SDValue FNSTSW = |
6120 | SDValue(CurDAG->getMachineNode(Opcode: X86::FNSTSW16r, dl, VT: MVT::i16, Op1: Glue), 0); |
6121 | |
// Extract the upper 8 bits of AX.
SDValue Extract =
6124 | CurDAG->getTargetExtractSubreg(SRIdx: X86::sub_8bit_hi, DL: dl, VT: MVT::i8, Operand: FNSTSW); |
6125 | |
6126 | // Move AH into flags. |
6127 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
6128 | assert(Subtarget->canUseLAHFSAHF() && |
6129 | "Target doesn't support SAHF or FCOMI?" ); |
6130 | SDValue AH = CurDAG->getCopyToReg(Chain, dl, Reg: X86::AH, N: Extract, Glue: SDValue()); |
6131 | Chain = AH; |
6132 | SDValue SAHF = SDValue( |
6133 | CurDAG->getMachineNode(Opcode: X86::SAHF, dl, VT: MVT::i32, Op1: AH.getValue(R: 1)), 0); |
6134 | |
6135 | if (IsStrictCmp) |
6136 | ReplaceUses(F: SDValue(Node, 1), T: Chain); |
6137 | |
6138 | ReplaceUses(F: SDValue(Node, 0), T: SAHF); |
6139 | CurDAG->RemoveDeadNode(N: Node); |
6140 | return; |
6141 | } |
6142 | |
6143 | case X86ISD::CMP: { |
6144 | SDValue N0 = Node->getOperand(Num: 0); |
6145 | SDValue N1 = Node->getOperand(Num: 1); |
6146 | |
6147 | // Optimizations for TEST compares. |
6148 | if (!isNullConstant(V: N1)) |
6149 | break; |
6150 | |
6151 | // Save the original VT of the compare. |
6152 | MVT CmpVT = N0.getSimpleValueType(); |
6153 | |
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6155 | // by a test instruction. The test should be removed later by |
6156 | // analyzeCompare if we are using only the zero flag. |
6157 | // TODO: Should we check the users and use the BEXTR flags directly? |
6158 | if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { |
6159 | if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) { |
6160 | unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr |
6161 | : X86::TEST32rr; |
6162 | SDValue BEXTR = SDValue(NewNode, 0); |
6163 | NewNode = CurDAG->getMachineNode(Opcode: TestOpc, dl, VT: MVT::i32, Op1: BEXTR, Op2: BEXTR); |
6164 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6165 | CurDAG->RemoveDeadNode(N: Node); |
6166 | return; |
6167 | } |
6168 | } |
6169 | |
6170 | // We can peek through truncates, but we need to be careful below. |
6171 | if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) |
6172 | N0 = N0.getOperand(i: 0); |
6173 | |
6174 | // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to |
6175 | // use a smaller encoding. |
6176 | // Look past the truncate if CMP is the only use of it. |
6177 | if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && |
6178 | N0.getValueType() != MVT::i8) { |
6179 | auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
6180 | if (!MaskC) |
6181 | break; |
6182 | |
6183 | // We may have looked through a truncate so mask off any bits that |
6184 | // shouldn't be part of the compare. |
6185 | uint64_t Mask = MaskC->getZExtValue(); |
6186 | Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits()); |
6187 | |
6188 | // Check if we can replace AND+IMM{32,64} with a shift. This is possible |
6189 | // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the |
6190 | // zero flag. |
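// E.g. testing (and X, 0x0000FFFF00000000) against zero becomes a
// shrq $32 followed by a 16-bit register TEST, avoiding a movabsq of the
// mask.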
6191 | if (CmpVT == MVT::i64 && !isInt<8>(x: Mask) && isShiftedMask_64(Value: Mask) && |
6192 | onlyUsesZeroFlag(Flags: SDValue(Node, 0))) { |
6193 | unsigned ShiftOpcode = ISD::DELETED_NODE; |
6194 | unsigned ShiftAmt; |
6195 | unsigned SubRegIdx; |
6196 | MVT SubRegVT; |
6197 | unsigned TestOpcode; |
6198 | unsigned LeadingZeros = llvm::countl_zero(Val: Mask); |
6199 | unsigned TrailingZeros = llvm::countr_zero(Val: Mask); |
6200 | |
6201 | // With leading/trailing zeros, the transform is profitable if we can |
6202 | // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without |
6203 | // incurring any extra register moves. |
6204 | bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse(); |
6205 | if (LeadingZeros == 0 && SavesBytes) { |
6206 | // If the mask covers the most significant bit, then we can replace |
6207 | // TEST+AND with a SHR and check eflags. |
6208 | // This emits a redundant TEST which is subsequently eliminated. |
6209 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6210 | ShiftAmt = TrailingZeros; |
6211 | SubRegIdx = 0; |
6212 | TestOpcode = X86::TEST64rr; |
6213 | } else if (TrailingZeros == 0 && SavesBytes) { |
6214 | // If the mask covers the least significant bit, then we can replace |
6215 | // TEST+AND with a SHL and check eflags. |
6216 | // This emits a redundant TEST which is subsequently eliminated. |
6217 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri); |
6218 | ShiftAmt = LeadingZeros; |
6219 | SubRegIdx = 0; |
6220 | TestOpcode = X86::TEST64rr; |
6221 | } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) { |
6222 | // If the shifted mask extends into the high half and is 8/16/32 bits |
6223 | // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. |
6224 | unsigned PopCount = 64 - LeadingZeros - TrailingZeros; |
6225 | if (PopCount == 8) { |
6226 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6227 | ShiftAmt = TrailingZeros; |
6228 | SubRegIdx = X86::sub_8bit; |
6229 | SubRegVT = MVT::i8; |
6230 | TestOpcode = X86::TEST8rr; |
6231 | } else if (PopCount == 16) { |
6232 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6233 | ShiftAmt = TrailingZeros; |
6234 | SubRegIdx = X86::sub_16bit; |
6235 | SubRegVT = MVT::i16; |
6236 | TestOpcode = X86::TEST16rr; |
6237 | } else if (PopCount == 32) { |
6238 | ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri); |
6239 | ShiftAmt = TrailingZeros; |
6240 | SubRegIdx = X86::sub_32bit; |
6241 | SubRegVT = MVT::i32; |
6242 | TestOpcode = X86::TEST32rr; |
6243 | } |
6244 | } |
6245 | if (ShiftOpcode != ISD::DELETED_NODE) { |
6246 | SDValue ShiftC = CurDAG->getTargetConstant(Val: ShiftAmt, DL: dl, VT: MVT::i64); |
6247 | SDValue Shift = SDValue( |
6248 | CurDAG->getMachineNode(Opcode: ShiftOpcode, dl, VT1: MVT::i64, VT2: MVT::i32, |
6249 | Op1: N0.getOperand(i: 0), Op2: ShiftC), |
6250 | 0); |
6251 | if (SubRegIdx != 0) { |
6252 | Shift = |
6253 | CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift); |
6254 | } |
6255 | MachineSDNode *Test = |
6256 | CurDAG->getMachineNode(Opcode: TestOpcode, dl, VT: MVT::i32, Op1: Shift, Op2: Shift); |
6257 | ReplaceNode(F: Node, T: Test); |
6258 | return; |
6259 | } |
6260 | } |
6261 | |
6262 | MVT VT; |
6263 | int SubRegOp; |
6264 | unsigned ROpc, MOpc; |
6265 | |
6266 | // For each of these checks we need to be careful if the sign flag is |
// being used. It is only safe to use the sign flag in two cases: either
// the sign bit in the shrunken mask is zero, or the final test
6269 | // size is equal to the original compare size. |
6270 | |
6271 | if (isUInt<8>(x: Mask) && |
6272 | (!(Mask & 0x80) || CmpVT == MVT::i8 || |
6273 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6274 | // For example, convert "testl %eax, $8" to "testb %al, $8" |
6275 | VT = MVT::i8; |
6276 | SubRegOp = X86::sub_8bit; |
6277 | ROpc = X86::TEST8ri; |
6278 | MOpc = X86::TEST8mi; |
6279 | } else if (OptForMinSize && isUInt<16>(x: Mask) && |
6280 | (!(Mask & 0x8000) || CmpVT == MVT::i16 || |
6281 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6282 | // For example, "testl %eax, $32776" to "testw %ax, $32776". |
6283 | // NOTE: We only want to form TESTW instructions if optimizing for |
6284 | // min size. Otherwise we only save one byte and possibly get a length |
6285 | // changing prefix penalty in the decoders. |
6286 | VT = MVT::i16; |
6287 | SubRegOp = X86::sub_16bit; |
6288 | ROpc = X86::TEST16ri; |
6289 | MOpc = X86::TEST16mi; |
6290 | } else if (isUInt<32>(x: Mask) && N0.getValueType() != MVT::i16 && |
6291 | ((!(Mask & 0x80000000) && |
// Without minsize, 16-bit compares can get here, so we need to
6293 | // be sure we calculate the correct sign flag if needed. |
6294 | (CmpVT != MVT::i16 || !(Mask & 0x8000))) || |
6295 | CmpVT == MVT::i32 || |
6296 | hasNoSignFlagUses(Flags: SDValue(Node, 0)))) { |
6297 | // For example, "testq %rax, $268468232" to "testl %eax, $268468232". |
6298 | // NOTE: We only want to run that transform if N0 is 32 or 64 bits. |
// Otherwise, we find ourselves in a position where we have to do
6300 | // promotion. If previous passes did not promote the and, we assume |
6301 | // they had a good reason not to and do not promote here. |
6302 | VT = MVT::i32; |
6303 | SubRegOp = X86::sub_32bit; |
6304 | ROpc = X86::TEST32ri; |
6305 | MOpc = X86::TEST32mi; |
6306 | } else { |
6307 | // No eligible transformation was found. |
6308 | break; |
6309 | } |
6310 | |
6311 | SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT); |
6312 | SDValue Reg = N0.getOperand(i: 0); |
6313 | |
6314 | // Emit a testl or testw. |
6315 | MachineSDNode *NewNode; |
6316 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
6317 | if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
6318 | if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) { |
6319 | if (!LoadN->isSimple()) { |
6320 | unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits(); |
6321 | if ((MOpc == X86::TEST8mi && NumVolBits != 8) || |
6322 | (MOpc == X86::TEST16mi && NumVolBits != 16) || |
6323 | (MOpc == X86::TEST32mi && NumVolBits != 32)) |
6324 | break; |
6325 | } |
6326 | } |
6327 | SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
6328 | Reg.getOperand(i: 0) }; |
6329 | NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VT1: MVT::i32, VT2: MVT::Other, Ops); |
6330 | // Update the chain. |
6331 | ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1)); |
6332 | // Record the mem-refs |
6333 | CurDAG->setNodeMemRefs(N: NewNode, |
6334 | NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()}); |
6335 | } else { |
6336 | // Extract the subregister if necessary. |
6337 | if (N0.getValueType() != VT) |
6338 | Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg); |
6339 | |
6340 | NewNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VT: MVT::i32, Op1: Reg, Op2: Imm); |
6341 | } |
6342 | // Replace CMP with TEST. |
6343 | ReplaceNode(F: Node, T: NewNode); |
6344 | return; |
6345 | } |
6346 | break; |
6347 | } |
6348 | case X86ISD::PCMPISTR: { |
6349 | if (!Subtarget->hasSSE42()) |
6350 | break; |
6351 | |
6352 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6353 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6354 | // We can't fold a load if we are going to make two instructions. |
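// (If both the index and the mask are needed, PCMPISTRI and PCMPISTRM are
// both emitted, and a folded load would then be read twice.)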
6355 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6356 | |
6357 | MachineSDNode *CNode; |
6358 | if (NeedMask) { |
6359 | unsigned ROpc = |
6360 | Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri; |
6361 | unsigned MOpc = |
6362 | Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi; |
6363 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node); |
6364 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6365 | } |
6366 | if (NeedIndex || !NeedMask) { |
6367 | unsigned ROpc = |
6368 | Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri; |
6369 | unsigned MOpc = |
6370 | Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi; |
6371 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node); |
6372 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6373 | } |
6374 | |
6375 | // Connect the flag usage to the last instruction created. |
6376 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6377 | CurDAG->RemoveDeadNode(N: Node); |
6378 | return; |
6379 | } |
6380 | case X86ISD::PCMPESTR: { |
6381 | if (!Subtarget->hasSSE42()) |
6382 | break; |
6383 | |
6384 | // Copy the two implicit register inputs. |
6385 | SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EAX, |
6386 | N: Node->getOperand(Num: 1), |
6387 | Glue: SDValue()).getValue(R: 1); |
6388 | InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EDX, |
6389 | N: Node->getOperand(Num: 3), Glue: InGlue).getValue(R: 1); |
6390 | |
6391 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6392 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6393 | // We can't fold a load if we are going to make two instructions. |
6394 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6395 | |
6396 | MachineSDNode *CNode; |
6397 | if (NeedMask) { |
6398 | unsigned ROpc = |
6399 | Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri; |
6400 | unsigned MOpc = |
6401 | Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi; |
6402 | CNode = |
6403 | emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::v16i8, Node, InGlue); |
6404 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6405 | } |
6406 | if (NeedIndex || !NeedMask) { |
6407 | unsigned ROpc = |
6408 | Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri; |
6409 | unsigned MOpc = |
6410 | Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi; |
6411 | CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, VT: MVT::i32, Node, InGlue); |
6412 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6413 | } |
6414 | // Connect the flag usage to the last instruction created. |
6415 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6416 | CurDAG->RemoveDeadNode(N: Node); |
6417 | return; |
6418 | } |
6419 | |
6420 | case ISD::SETCC: { |
6421 | if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue())) |
6422 | return; |
6423 | |
6424 | break; |
6425 | } |
6426 | |
6427 | case ISD::STORE: |
6428 | if (foldLoadStoreIntoMemOperand(Node)) |
6429 | return; |
6430 | break; |
6431 | |
6432 | case X86ISD::SETCC_CARRY: { |
6433 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6434 | SDValue Result; |
6435 | if (Subtarget->hasSBBDepBreaking()) { |
6436 | // We have to do this manually because tblgen will put the eflags copy in |
6437 | // the wrong place if we use an extract_subreg in the pattern. |
// Copy flags to the EFLAGS register and glue it to the next node.
6439 | SDValue EFLAGS = |
6440 | CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: X86::EFLAGS, |
6441 | N: Node->getOperand(Num: 1), Glue: SDValue()); |
6442 | |
// Create a 64-bit instruction if the result is 64 bits; otherwise use the
// 32-bit version.
6445 | unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; |
6446 | MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
6447 | Result = SDValue( |
6448 | CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)), |
6449 | 0); |
6450 | } else { |
6451 | // The target does not recognize sbb with the same reg operand as a |
6452 | // no-source idiom, so we explicitly zero the input values. |
6453 | Result = getSBBZero(N: Node); |
6454 | } |
6455 | |
6456 | // For less than 32-bits we need to extract from the 32-bit node. |
6457 | if (VT == MVT::i8 || VT == MVT::i16) { |
6458 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6459 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6460 | } |
6461 | |
6462 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6463 | CurDAG->RemoveDeadNode(N: Node); |
6464 | return; |
6465 | } |
6466 | case X86ISD::SBB: { |
6467 | if (isNullConstant(V: Node->getOperand(Num: 0)) && |
6468 | isNullConstant(V: Node->getOperand(Num: 1))) { |
6469 | SDValue Result = getSBBZero(N: Node); |
6470 | |
6471 | // Replace the flag use. |
6472 | ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1)); |
6473 | |
6474 | // Replace the result use. |
6475 | if (!SDValue(Node, 0).use_empty()) { |
6476 | // For less than 32-bits we need to extract from the 32-bit node. |
6477 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6478 | if (VT == MVT::i8 || VT == MVT::i16) { |
6479 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6480 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6481 | } |
6482 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6483 | } |
6484 | |
6485 | CurDAG->RemoveDeadNode(N: Node); |
6486 | return; |
6487 | } |
6488 | break; |
6489 | } |
6490 | case X86ISD::MGATHER: { |
6491 | auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node); |
6492 | SDValue IndexOp = Mgt->getIndex(); |
6493 | SDValue Mask = Mgt->getMask(); |
6494 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6495 | MVT ValueVT = Node->getSimpleValueType(ResNo: 0); |
6496 | MVT MaskVT = Mask.getSimpleValueType(); |
6497 | |
6498 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking here, based on what a type
// constraint would say, just like table-based isel.
6501 | if (!ValueVT.isVector() || !MaskVT.isVector()) |
6502 | break; |
6503 | |
6504 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6505 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6506 | |
6507 | bool IsFP = ValueSVT.isFloatingPoint(); |
6508 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6509 | |
6510 | unsigned Opc = 0; |
6511 | bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; |
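// AVX512 gathers take a k-register (v*i1) mask; the AVX/AVX2 forms below
// take an integer vector mask of the same type as the data (see the assert
// below).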
6512 | if (AVX512Gather) { |
6513 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6514 | Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; |
6515 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6516 | Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; |
6517 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6518 | Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; |
6519 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6520 | Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; |
6521 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6522 | Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; |
6523 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6524 | Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; |
6525 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6526 | Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; |
6527 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6528 | Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; |
6529 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6530 | Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; |
6531 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6532 | Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; |
6533 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6534 | Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; |
6535 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6536 | Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; |
6537 | } else { |
6538 | assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && |
6539 | "Unexpected mask VT!" ); |
6540 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6541 | Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; |
6542 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6543 | Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; |
6544 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6545 | Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; |
6546 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6547 | Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; |
6548 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6549 | Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; |
6550 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6551 | Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; |
6552 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6553 | Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; |
6554 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6555 | Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; |
6556 | } |
6557 | |
6558 | if (!Opc) |
6559 | break; |
6560 | |
6561 | SDValue Base, Scale, Index, Disp, Segment; |
6562 | if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(), |
6563 | Base, Scale, Index, Disp, Segment)) |
6564 | break; |
6565 | |
6566 | SDValue PassThru = Mgt->getPassThru(); |
6567 | SDValue Chain = Mgt->getChain(); |
6568 | // Gather instructions have a mask output not in the ISD node. |
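// The machine node's results are (data, mask, chain) while the ISD node
// only exposes (data, chain), so node result 1 is mapped to machine result
// 2 below.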
6569 | SDVTList VTs = CurDAG->getVTList(VT1: ValueVT, VT2: MaskVT, VT3: MVT::Other); |
6570 | |
6571 | MachineSDNode *NewNode; |
6572 | if (AVX512Gather) { |
6573 | SDValue Ops[] = {PassThru, Mask, Base, Scale, |
6574 | Index, Disp, Segment, Chain}; |
6575 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6576 | } else { |
6577 | SDValue Ops[] = {PassThru, Base, Scale, Index, |
6578 | Disp, Segment, Mask, Chain}; |
6579 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6580 | } |
6581 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()}); |
6582 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6583 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2)); |
6584 | CurDAG->RemoveDeadNode(N: Node); |
6585 | return; |
6586 | } |
6587 | case X86ISD::MSCATTER: { |
6588 | auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node); |
6589 | SDValue Value = Sc->getValue(); |
6590 | SDValue IndexOp = Sc->getIndex(); |
6591 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6592 | MVT ValueVT = Value.getSimpleValueType(); |
6593 | |
6594 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
// otherwise only doing loose type checking here, based on what a type
// constraint would say, just like table-based isel.
6597 | if (!ValueVT.isVector()) |
6598 | break; |
6599 | |
6600 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6601 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6602 | |
6603 | bool IsFP = ValueSVT.isFloatingPoint(); |
6604 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6605 | |
6606 | unsigned Opc; |
6607 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6608 | Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; |
6609 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6610 | Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; |
6611 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6612 | Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; |
6613 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6614 | Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; |
6615 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6616 | Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; |
6617 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6618 | Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; |
6619 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6620 | Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; |
6621 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6622 | Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; |
6623 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6624 | Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; |
6625 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6626 | Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; |
6627 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6628 | Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; |
6629 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6630 | Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; |
6631 | else |
6632 | break; |
6633 | |
6634 | SDValue Base, Scale, Index, Disp, Segment; |
6635 | if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(), |
6636 | Base, Scale, Index, Disp, Segment)) |
6637 | break; |
6638 | |
6639 | SDValue Mask = Sc->getMask(); |
6640 | SDValue Chain = Sc->getChain(); |
6641 | // Scatter instructions have a mask output not in the ISD node. |
6642 | SDVTList VTs = CurDAG->getVTList(VT1: Mask.getValueType(), VT2: MVT::Other); |
6643 | SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; |
6644 | |
6645 | MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6646 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()}); |
6647 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1)); |
6648 | CurDAG->RemoveDeadNode(N: Node); |
6649 | return; |
6650 | } |
6651 | case ISD::PREALLOCATED_SETUP: { |
6652 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6653 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6654 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6655 | SDValue Chain = Node->getOperand(Num: 0); |
6656 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6657 | MachineSDNode *New = CurDAG->getMachineNode( |
6658 | Opcode: TargetOpcode::PREALLOCATED_SETUP, dl, VT: MVT::Other, Op1: CallIdValue, Op2: Chain); |
6659 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain |
6660 | CurDAG->RemoveDeadNode(N: Node); |
6661 | return; |
6662 | } |
6663 | case ISD::PREALLOCATED_ARG: { |
6664 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6665 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6666 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6667 | SDValue Chain = Node->getOperand(Num: 0); |
6668 | SDValue CallIdValue = CurDAG->getTargetConstant(Val: CallId, DL: dl, VT: MVT::i32); |
6669 | SDValue ArgIndex = Node->getOperand(Num: 2); |
6670 | SDValue Ops[3]; |
6671 | Ops[0] = CallIdValue; |
6672 | Ops[1] = ArgIndex; |
6673 | Ops[2] = Chain; |
6674 | MachineSDNode *New = CurDAG->getMachineNode( |
6675 | Opcode: TargetOpcode::PREALLOCATED_ARG, dl, |
6676 | VTs: CurDAG->getVTList(VT1: TLI->getPointerTy(DL: CurDAG->getDataLayout()), |
6677 | VT2: MVT::Other), |
6678 | Ops); |
6679 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer |
6680 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain |
6681 | CurDAG->RemoveDeadNode(N: Node); |
6682 | return; |
6683 | } |
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

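    // The wide forms implicitly read and write XMM0-XMM7, so copy the eight
    // data operands into those physical registers and glue the copies
    // together so they stay attached to the instruction.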
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
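  // POP_FROM_X87_REG is just a CopyFromReg from the given x87 register, with
  // an optional glue operand threaded through.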
  case X86ISD::POP_FROM_X87_REG: {
    SDValue Chain = Node->getOperand(0);
    Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
    SDValue Glue;
    if (Node->getNumValues() == 3)
      Glue = Node->getOperand(2);
    SDValue Copy =
        CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
    ReplaceNode(Node, Copy.getNode());
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable        ??
  case InlineAsm::ConstraintCode::v: // not offsetable    ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

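  // Every X86 memory constraint expands to the full 5-operand address form:
  // base, scale, index, displacement, segment.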
  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}